[Polly] Remove Polly-ACC.

Polly-ACC is unmaintained and has never been ported to the new pass manager (NPM) pipeline; since D136621 it is no longer even accessible without manually specifying the passes on the `opt` command line. Since there is no plan to bring it to a maintainable state, remove it from Polly. Reviewed By: grosser Differential Revision: https://reviews.llvm.org/D142580
author: Michael Kruse <llvm-project@meinersbur.de> 2023-01-25 14:03:57 -0600
committer: Michael Kruse <llvm-project@meinersbur.de> 2023-03-08 17:33:04 -0600
commit: 19afbfe33156d211fa959dadeea46cd17b9c723c (patch)
tree: db53498143b16127c6c0e22a671a8d11eece4152 /polly/test
parent: 115c7beda74f3cfaf83b91d14bc97a39bff4cf19 (diff)
download: llvm-19afbfe33156d211fa959dadeea46cd17b9c723c.tar.gz
76 files changed, 0 insertions, 6002 deletions
diff --git a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll b/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
deleted file mode 100644
index 3f4c4a0aa610..000000000000
--- a/polly/test/GPGPU/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-define float @__nv_expf(float %a) {
-  ret float %a
-}
-define float @__nv_cosf(float %a) {
-  ret float %a
-}
-define float @__nv_logf(float %a) {
-  ret float %a
-}
diff --git a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll b/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll
deleted file mode 100644
index 64b4cc4aa100..000000000000
--- a/polly/test/GPGPU/add-scalars-in-scop-to-kills.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop.
-; SCOP:       Function: checkScalarKill
-; SCOP-NEXT: Region: %XLoopInit---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-; Check that we have a scalar that is not a phi node in the scop.
-; SCOP: i32 MemRef_x_0; // Element size 4
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; Check that we add variables that are local to a scop into the kills that we
-; pass to PPCG. This should enable PPCG to codegen this example.
-; void checkScalarKill(int A[], int B[], int C[], const int control1, int control2) {
-; int x;
-; #pragma scop
-;     for(int i = 0; i < 1000; i++) {
-; XLoopInit:        x = 0;
-;
-;         if (control1 > 2)
-;             C1Add: x += 10;
-;         if (control2 > 3)
-;             C2Add: x += A[i];
-;
-; BLoopAccumX:        B[i] += x;
-;     }
-;
-; #pragma endscop
-; }
-; ModuleID = 'test.ll'
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @checkScalarKill(ptr %A, ptr %B, ptr %C, i32 %control1, i32 %control2) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %XLoopInit
-
-XLoopInit:                                        ; preds = %entry.split, %BLoopAccumX
-  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %BLoopAccumX ]
-  %cmp1 = icmp sgt i32 %control1, 2
-  %x.0 = select i1 %cmp1, i32 10, i32 0
-  %cmp2 = icmp sgt i32 %control2, 3
-  br i1 %cmp2, label %C2Add, label %BLoopAccumX
-
-C2Add:                                            ; preds = %XLoopInit
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
-  %tmp6 = load i32, ptr %arrayidx, align 4
-  %add4 = add nsw i32 %tmp6, %x.0
-  br label %BLoopAccumX
-
-BLoopAccumX:                                      ; preds = %XLoopInit, %C2Add
-  %x.1 = phi i32 [ %add4, %C2Add ], [ %x.0, %XLoopInit ]
-  %arrayidx7 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
-  %tmp11 = load i32, ptr %arrayidx7, align 4
-  %add8 = add nsw i32 %tmp11, %x.1
-  store i32 %add8, ptr %arrayidx7, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %XLoopInit, label %for.end
-
-for.end:                                          ; preds = %BLoopAccumX
-  ret void
-}
diff --git a/polly/test/GPGPU/align-params-in-schedule.ll b/polly/test/GPGPU/align-params-in-schedule.ll
deleted file mode 100644
index fa9a8f3eb4e5..000000000000
--- a/polly/test/GPGPU/align-params-in-schedule.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting -polly-ignore-parameter-bounds < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK: polly_launchKernel
-
-; Verify that this program compiles. At some point, this compilation crashed
-; due to insufficient parameters being available.
-
-source_filename = "bugpoint-output-4d01492.bc"
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.barney = type { ptr, i64, i64, [2 x %struct.widget] }
-%struct.widget = type { i64, i64, i64 }
-
-@global = external unnamed_addr global %struct.barney, align 32
-
-; Function Attrs: nounwind uwtable
-define void @wobble(ptr noalias %arg) #0 {
-bb:
-  %tmp = load i32, ptr %arg, align 4
-  br label %bb1
-
-bb1:                                              ; preds = %bb13, %bb
-  %tmp2 = phi i32 [ %tmp15, %bb13 ], [ 1, %bb ]
-  br label %bb3
-
-bb3:                                              ; preds = %bb3, %bb1
-  %tmp4 = load ptr, ptr @global, align 32
-  %tmp5 = sext i32 %tmp2 to i64
-  %tmp6 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 3, i64 1, i32 0), align 8
-  %tmp7 = mul i64 %tmp6, %tmp5
-  %tmp8 = add i64 %tmp7, 0
-  %tmp9 = load i64, ptr getelementptr inbounds (%struct.barney, ptr @global, i64 0, i32 1), align 8
-  %tmp10 = add i64 %tmp8, %tmp9
-  %tmp11 = getelementptr i32, ptr %tmp4, i64 %tmp10
-  store i32 undef, ptr %tmp11, align 4
-  %tmp12 = icmp eq i32 0, 0
-  br i1 %tmp12, label %bb13, label %bb3
-
-bb13:                                             ; preds = %bb3
-  %tmp14 = icmp eq i32 %tmp2, %tmp
-  %tmp15 = add i32 %tmp2, 1
-  br i1 %tmp14, label %bb16, label %bb1
-
-bb16:                                             ; preds = %bb13
-  ret void
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll b/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll
deleted file mode 100644
index 12b872d55192..000000000000
--- a/polly/test/GPGPU/array-with-elem-type-smaller-than-byte.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-use-llvm-names < %s
-; ModuleID = 'test/GPGPU/zero-size-array.ll'
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-
-; We used to divide the element size by 8 to arrive at the 'actual' size
-; of an array element. This used to cause arrays that have an element size
-; of less than 8 to collapse to size 0. This test makes sure that it does
-; not happen anymore.
-
-; f(int *niters_ptr, int *arr[0]) {
-;     const int niters = *niters_ptr;
-;     for(int i = 0; i < niters; i++) {
-;       arr[0][i + 1] = 0;
-;     }
-; }
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr noalias %niters.ptr, ptr noalias %arr) #0 {
-entry:
-  %niters = load i32, ptr %niters.ptr, align 4
-  br label %loop.body
-
-loop.body:                                             ; preds = %loop.body, %entry
-  %indvar = phi i32 [ %indvar.next, %loop.body ], [ 1, %entry ]
-  %indvar.sext = sext i32 %indvar to i64
-  %arr.slot = getelementptr [0 x i32], ptr %arr, i64 0, i64 %indvar.sext
-  store i32 0, ptr %arr.slot, align 4
-  %tmp8 = icmp eq i32 %indvar, %niters
-  %indvar.next = add i32 %indvar, 1
-  br i1 %tmp8, label %loop.exit, label %loop.body
-
-loop.exit:                                    ; preds = %loop.body
-  %tmp10 = icmp sgt i32 undef, 0
-  br label %auxiliary.loop
-
-auxiliary.loop:                                            ; preds = %"101", %loop.exit
-  %tmp11 = phi i1 [ %tmp10, %loop.exit ], [ undef, %auxiliary.loop ]
-  br i1 undef, label %auxiliary.loop, label %exit
-
-exit:                              ; preds = %auxiliary.loop
-  ret void
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll b/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
deleted file mode 100644
index a60744289885..000000000000
--- a/polly/test/GPGPU/bounds-construction-with-ignore-param-bounds.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-ignore-parameter-bounds \
-; RUN: -polly-invariant-load-hoisting < %s| FileCheck %s -check-prefix=HOST-IR
-;
-; REQUIRES: pollyacc
-
-; When we have `-polly-ignore-parameter-bounds`, `Scop::Context` does not contain
-; all the parameters present in the program.
-;
-; The construction of the `isl_multi_pw_aff` requires all the individual `pw_aff`
-; to have the same parameter dimensions. To achieve this, we used to realign
-; every `pw_aff` with `Scop::Context`. However, in conjunction with
-; `-polly-ignore-parameter-bounds`, this is now incorrect, since `Scop::Context`
-; does not contain all parameters.
-;
-; We check that Polly does the right thing in this case and sets up the parameter
-; dimensions correctly.
-
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-; ModuleID = 'test/GPGPU/bounds-construction-with-ignore-param-bounds.ll'
-
-; C pseudocode
-; ------------
-; void f(int *arr, long niters, long stride) {
-;     for(int i = 0; i < niters; i++) {
-;       arr[i * stride] = 1;
-;     }
-; }
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr %arr, i64 %niters, i64 %stride) unnamed_addr #1 {
-entry:
-  br label %loop
-
-loop:                                             ; preds = %loop, %entry
-  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
-  %idx = mul nuw nsw i64 %indvar, %stride
-  %slot = getelementptr i32, ptr %arr, i64 %idx
-  store i32 1, ptr %slot, align 4
-  %indvar.next = add nuw nsw i64 %indvar, 1
-  %check = icmp sgt i64 %indvar.next, %niters
-  br i1 %check, label %exit, label %loop
-
-exit:                                             ; preds = %loop
-  ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/cuda-annotations.ll b/polly/test/GPGPU/cuda-annotations.ll
deleted file mode 100644
index cbb0296d48ef..000000000000
--- a/polly/test/GPGPU/cuda-annotations.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n) #0 {
-
-; KERNEL: !nvvm.annotations = !{!0}
-
-; KERNEL: !0 = !{ptr @FUNC_foo_SCOP_0_KERNEL_0, !"maxntidx", i32 32, !"maxntidy", i32 1, !"maxntidz", i32 1}
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, i64 %n) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb6, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb8
-
-bb2:                                              ; preds = %bb1
-  %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp4 = load i64, ptr %tmp3, align 8
-  %tmp5 = add nsw i64 %tmp4, 100
-  store i64 %tmp5, ptr %tmp3, align 8
-  br label %bb6
-
-bb6:                                              ; preds = %bb2
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb8:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/cuda-managed-memory-simple.ll b/polly/test/GPGPU/cuda-managed-memory-simple.ll
deleted file mode 100644
index 8ef7e336cfad..000000000000
--- a/polly/test/GPGPU/cuda-managed-memory-simple.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -S  -polly-process-unprofitable -polly-acc-mincompute=0 -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-;
-;    #include <cuda_runtime.h>
-;
-;    static const int N = 45;
-;
-;    void copy(int *R, int *A) {
-;      for (int i = 0; i < N; i++) {
-;        R[i] = A[i] * 10;
-;      }
-;    }
-;
-;    int main() {
-;      int *A, *R;
-;
-;      cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
-;      cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
-;
-;      for (int i = 0; i < N; i++) {
-;        A[i] = i;
-;        R[i] = 0;
-;      }
-;      copy(R, A);
-;
-;      return 0;
-;    }
-;
-
-; CHECK-NOT: polly_copyFromHostToDevice
-; CHECK-NOT: polly_copyFromDeviceToHost
-; CHECK-NOT: polly_freeDeviceMemory
-; CHECK-NOT: polly_allocateMemoryForDevice
-
-; CHECK:       %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
-; CHECK-NEXT:  %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
-; CHECK-NEXT:  %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
-; CHECK-NEXT:  %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
-; CHECK-NEXT:  store i8* %[[REGCA]], i8** %polly_launch_0_param_0
-; CHECK-NEXT:  %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
-; CHECK-NEXT:  store i8* %[[REGCP0]], i8** %[[REGGEP0]]
-; CHECK-NEXT:  %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; CHECK-NEXT:  store i8* %[[REGCR]], i8** %polly_launch_0_param_1
-; CHECK-NEXT:  %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
-; CHECK-NEXT:  store i8* %[[REGCP1]], i8** %[[REGGEP1]]
-; CHECK-NEXT:  %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([863 x i8], [863 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
-; CHECK-NEXT:  call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
-; CHECK-NEXT:  call void @polly_freeKernel(i8* %[[REGKERNEL]])
-; CHECK-NEXT:  call void @polly_synchronizeDevice()
-; CHECK-NEXT:  call void @polly_freeContext(i8* %[[REGCTX]])
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @copy(i32* %R, i32* %A) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %exitcond = icmp ne i64 %indvars.iv, 45
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
-  %tmp = load i32, i32* %arrayidx, align 4
-  %mul = mul nsw i32 %tmp, 10
-  %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
-  store i32 %mul, i32* %arrayidx2, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  ret void
-}
-
-define i32 @main() {
-entry:
-  %A = alloca i32*, align 8
-  %R = alloca i32*, align 8
-  %tmp = bitcast i32** %A to i8**
-  %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
-  %tmp1 = bitcast i32** %R to i8**
-  %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %exitcond = icmp ne i64 %indvars.iv, 45
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %tmp2 = load i32*, i32** %A, align 8
-  %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
-  %tmp3 = trunc i64 %indvars.iv to i32
-  store i32 %tmp3, i32* %arrayidx, align 4
-  %tmp4 = load i32*, i32** %R, align 8
-  %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
-  store i32 0, i32* %arrayidx3, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %tmp5 = load i32*, i32** %R, align 8
-  %tmp6 = load i32*, i32** %A, align 8
-  call void @copy(i32* %tmp5, i32* %tmp6)
-  ret i32 0
-}
-
-declare i32 @cudaMallocManaged(i8**, i64, i32) #1
diff --git a/polly/test/GPGPU/debug-metadata-leak.ll b/polly/test/GPGPU/debug-metadata-leak.ll
deleted file mode 100644
index c90926c318e8..000000000000
--- a/polly/test/GPGPU/debug-metadata-leak.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: opt %loadPolly %s -polly-process-unprofitable -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: | FileCheck --check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-; KERNEL-IR: define ptx_kernel void @FUNC_vec_add_1_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arr, i32 %N) #0 {
-
-; The instruction marked <<<LeakyInst>>> is copied into the GPUModule,
-; with changes only to the parameters to access data on the device instead of
-; the host, i.e., MemRef_arr becomes polly.access.cast.MemRef_arr. Since the
-; instruction is annotated with a DILocation, copying the instruction also copies
-; the metadata into the GPUModule. This stops codegenerating the ptx_kernel by
-; failing the verification of the Module in GPUNodeBuilder::finalize, due to the
-; copied DICompileUnit not being listed in a llvm.dbg.cu which was neither copied
-; nor created.
-;
-; https://reviews.llvm.org/D35630 removes this debug metadata before the
-; instruction is copied to the GPUModule.
-;
-; vec_add_1.c:
-;      void vec_add_1(int N, int arr[N]) {
-;        int i=0;
-;        for( i=0 ; i<N ; i++) arr[i] += 1;
-;      }
-;
-source_filename = "vec_add_1.c"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @vec_add_1(i32 %N, ptr %arr) !dbg !7 {
-entry:
-  call void @llvm.dbg.value(metadata i32 %N, i64 0, metadata !13, metadata !16), !dbg !17
-  call void @llvm.dbg.value(metadata ptr %arr, i64 0, metadata !14, metadata !16), !dbg !18
-  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
-  %tmp = sext i32 %N to i64, !dbg !20
-  br label %for.cond, !dbg !20
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !15, metadata !16), !dbg !19
-  %cmp = icmp slt i64 %indvars.iv, %tmp, !dbg !22
-  br i1 %cmp, label %for.body, label %for.end, !dbg !24
-
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv, !dbg !25
-  %tmp1 = load i32, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
-  %add = add nsw i32 %tmp1, 1, !dbg !26    ;   <<<LeakyInst>>>
-  store i32 %add, ptr %arrayidx, align 4, !dbg !26, !tbaa !27
-  br label %for.inc, !dbg !25
-
-for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !31
-  call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !16), !dbg !19
-  br label %for.cond, !dbg !32, !llvm.loop !33
-
-for.end:                                          ; preds = %for.cond
-  ret void, !dbg !35
-}
-
-declare void @llvm.dbg.declare(metadata, metadata, metadata)
-
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
-
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "vec_add_1.c", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 5.0.0"}
-!7 = distinct !DISubprogram(name: "vec_add_1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
-!8 = !DISubroutineType(types: !9)
-!9 = !{null, !10, !11}
-!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
-!12 = !{!13, !14, !15}
-!13 = !DILocalVariable(name: "N", arg: 1, scope: !7, file: !1, line: 1, type: !10)
-!14 = !DILocalVariable(name: "arr", arg: 2, scope: !7, file: !1, line: 1, type: !11)
-!15 = !DILocalVariable(name: "i", scope: !7, file: !1, line: 2, type: !10)
-!16 = !DIExpression()
-!17 = !DILocation(line: 1, column: 20, scope: !7)
-!18 = !DILocation(line: 1, column: 27, scope: !7)
-!19 = !DILocation(line: 2, column: 7, scope: !7)
-!20 = !DILocation(line: 3, column: 8, scope: !21)
-!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3)
-!22 = !DILocation(line: 3, column: 15, scope: !23)
-!23 = distinct !DILexicalBlock(scope: !21, file: !1, line: 3, column: 3)
-!24 = !DILocation(line: 3, column: 3, scope: !21)
-!25 = !DILocation(line: 3, column: 25, scope: !23)
-!26 = !DILocation(line: 3, column: 32, scope: !23)
-!27 = !{!28, !28, i64 0}
-!28 = !{!"int", !29, i64 0}
-!29 = !{!"omnipotent char", !30, i64 0}
-!30 = !{!"Simple C/C++ TBAA"}
-!31 = !DILocation(line: 3, column: 21, scope: !23)
-!32 = !DILocation(line: 3, column: 3, scope: !23)
-!33 = distinct !{!33, !24, !34}
-!34 = !DILocation(line: 3, column: 35, scope: !21)
-!35 = !DILocation(line: 4, column: 1, scope: !7)
diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll
deleted file mode 100644
index 4aeee035a407..000000000000
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ /dev/null
@@ -1,254 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-schedule \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=SCHED %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-ASM
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today due to extensive output differences from when the test was written.
-
-; CHECK: Stmt_bb5
-; CHECK-NEXT:       Domain :=
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 };
-; CHECK-NEXT:       Schedule :=
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> [i0, i1] };
-; CHECK-NEXT:       ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
-; CHECK-NEXT:       MustWriteAccess :=  [Reduction Type: NONE] [Scalar: 0]
-; CHECK-NEXT:           { Stmt_bb5[i0, i1] -> MemRef_A[i0, i1] };
-
-; SCHED: domain: "{ Stmt_bb5[i0, i1] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 }"
-; SCHED-NEXT: child:
-; SCHED-NEXT:   context: "{ [] }"
-; SCHED-NEXT:   child:
-; SCHED-NEXT:     extension: "{ [] -> from_device_MemRef_A[]; [] -> to_device_MemRef_A[] }"
-; SCHED-NEXT:     child:
-; SCHED-NEXT:       sequence:
-; SCHED-NEXT:       - filter: "{ to_device_MemRef_A[] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           set:
-; SCHED-NEXT:           - filter: "{ to_device_MemRef_A[] }"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               guard: "{ [] }"
-; SCHED-NEXT:       - filter: "{ Stmt_bb5[i0, i1] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           guard: "{ [] }"
-; SCHED-NEXT:           child:
-; SCHED-NEXT:             mark: "kernel"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               context: "[b0, b1, t0, t1] -> { [] : 0 <= b0 <= 31 and 0 <= b1 <= 31 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED-NEXT:               child:
-; SCHED-NEXT:                 filter: "[b0, b1] -> { Stmt_bb5[i0, i1] : -31 - 32b0 + i0 <= 8192*floor((i0)/8192) <= -32b0 + i0 and -31 - 32b1 + i1 <= 8192*floor((i1)/8192) <= -32b1 + i1 }"
-; SCHED-NEXT:                 child:
-; SCHED-NEXT:                   schedule: "[{ Stmt_bb5[i0, i1] -> [(floor((i0)/8192))] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/8192))] }]"
-; SCHED-NEXT:                   permutable: 1
-; SCHED-NEXT:                   coincident: [ 1, 1 ]
-; SCHED-NEXT:                   child:
-; SCHED-NEXT:                     filter: "[t0, t1] -> { Stmt_bb5[i0, i1] : 32*floor((-t0 + i0)/32) = -t0 + i0 and 16*floor((-t1 + i1)/16) = -t1 + i1 and 0 <= t0 <= 31 and 0 <= t1 <= 15 }"
-; SCHED-NEXT:                     child:
-; SCHED-NEXT:                       schedule: "[{ Stmt_bb5[i0, i1] -> [(0)] }, { Stmt_bb5[i0, i1] -> [(floor((i1)/16) - 2*floor((i1)/32))] }]"
-; SCHED-NEXT:                       permutable: 1
-; SCHED-NEXT:                       coincident: [ 1, 1 ]
-; SCHED-NEXT:       - filter: "{ from_device_MemRef_A[] }"
-; SCHED-NEXT:         child:
-; SCHED-NEXT:           set:
-; SCHED-NEXT:           - filter: "{ from_device_MemRef_A[] }"
-; SCHED-NEXT:             child:
-; SCHED-NEXT:               guard: "{ [] }"
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(16, 32);
-; CODE-NEXT:     dim3 k0_dimGrid(32, 32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * (1024) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE-NEXT:   Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
-
-; IR: polly.split_new_and_old:
-; IR-NEXT:   %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
-; IR-NEXT:   %.obit = extractvalue { i64, i1 } %0, 1
-; IR-NEXT:   %polly.overflow.state = or i1 false, %.obit
-; IR-NEXT:   %.res = extractvalue { i64, i1 } %0, 0
-; IR-NEXT:   %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
-; IR-NEXT:   %.obit1 = extractvalue { i64, i1 } %1, 1
-; IR-NEXT:   %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
-; IR-NEXT:   %.res3 = extractvalue { i64, i1 } %1, 0
-; IR-NEXT:   %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
-; IR-NEXT:   %.obit4 = extractvalue { i64, i1 } %2, 1
-; IR-NEXT:   %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
-; IR-NEXT:   %.res6 = extractvalue { i64, i1 } %2, 0
-; IR-NEXT:   %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
-; IR-NEXT:   %.obit7 = extractvalue { i64, i1 } %3, 1
-; IR-NEXT:   %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
-; IR-NEXT:   %.res9 = extractvalue { i64, i1 } %3, 0
-; IR-NEXT:   %4 = icmp sge i64 %.res9, 2621440
-; IR-NEXT:   %5 = and i1 true, %4
-; IR-NEXT:   %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
-; IR-NEXT:   %polly.rtc.result = and i1 %5, %polly.rtc.overflown
-; IR-NEXT:   br i1 %polly.rtc.result, label %polly.start, label %bb2
-
-; IR: polly.start:
-; IR-NEXT: br label %polly.acc.initialize
-
-; IR: polly.acc.initialize:
-; IR-NEXT:    [[GPUContext:%.*]] = call ptr @polly_initContext()
-; IR-NEXT:    %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 4194304)
-; IR-NEXT:    call void @polly_copyFromHostToDevice(ptr %A, ptr %p_dev_array_MemRef_A, i64 4194304)
-; IR-NEXT:    [[DevPtr:%.*]]  = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
-; IR-NEXT:    store ptr [[DevPtr]], ptr %polly_launch_0_param_0
-; IR-NEXT:    store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
-; IR-NEXT:    call ptr @polly_getKernel
-; IR-NEXT:    call void @polly_launchKernel(ptr %11, i32 32, i32 32, i32 32, i32 16, i32 1, ptr %polly_launch_0_params_i8ptr)
-; IR-NEXT:    call void @polly_freeKernel
-; IR-NEXT:    call void @polly_copyFromDeviceToHost(ptr %p_dev_array_MemRef_A, ptr %A, i64 4194304)
-; IR-NEXT:    call void @polly_freeDeviceMemory(ptr %p_dev_array_MemRef_A)
-; IR-NEXT:    call void @polly_freeContext(ptr [[GPUContext]])
-; IR-NEXT:    br label %polly.exiting
-
-; IR: polly.exiting:
-; IR-NEXT:    br label %polly.merge_new_and_old
-
-; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(ptr %MemRef_A) #0 {
-; KERNEL-IR-NEXT: entry:
-; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
-; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
-; KERNEL-IR-NEXT:   %b1 = zext i32 %1 to i64
-; KERNEL-IR-NEXT:   %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-IR-NEXT:   %t0 = zext i32 %2 to i64
-; KERNEL-IR-NEXT:   %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
-; KERNEL-IR-NEXT:   %t1 = zext i32 %3 to i64
-; KERNEL-IR-NEXT:   br label %polly.loop_preheader
-
-; KERNEL-IR-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
-; KERNEL-IR-NEXT:   ret void
-
-; KERNEL-IR-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
-; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
-; KERNEL-IR-NEXT:   %4 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %5 = add nsw i64 %4, %t0
-; KERNEL-IR-NEXT:   %6 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %7 = add nsw i64 %6, %t1
-; KERNEL-IR-NEXT:   %8 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %9 = add nsw i64 %7, %8
-; KERNEL-IR-NEXT:   br label %polly.stmt.bb5
-
-; KERNEL-IR-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
-; KERNEL-IR-NEXT:   %10 = mul i64 %5, %9
-; KERNEL-IR-NEXT:   %p_tmp6 = sitofp i64 %10 to float
-; KERNEL-IR-NEXT:   %11 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, %t0
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
-; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t1
-; KERNEL-IR-NEXT:   %15 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %16 = add nsw i64 %14, %15
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A
-; KERNEL-IR-NEXT:   %tmp8_p_scalar_ = load float, ptr %polly.access.MemRef_A, align 4
-; KERNEL-IR-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
-; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
-; KERNEL-IR-NEXT:   %19 = mul nsw i64 32, %b1
-; KERNEL-IR-NEXT:   %20 = add nsw i64 %19, %t1
-; KERNEL-IR-NEXT:   %21 = mul nsw i64 16, %polly.indvar
-; KERNEL-IR-NEXT:   %22 = add nsw i64 %20, %21
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A4 = getelementptr float, ptr %MemRef_A, i64 %polly.access.add.MemRef_A3
-; KERNEL-IR-NEXT:   store float %p_tmp9, ptr %polly.access.MemRef_A4, align 4
-; KERNEL-IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
-; KERNEL-IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 0
-; KERNEL-IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
-
-; KERNEL-IR-LABEL: polly.loop_preheader:                             ; preds = %entry
-; KERNEL-IR-NEXT:   br label %polly.loop_header
-
-; KERNEL-IR: attributes #0 = { "polly.skip.fn" }
-
-; KERNEL-ASM: .version 3.2
-; KERNEL-ASM-NEXT: .target sm_30
-; KERNEL-ASM-NEXT: .address_size 64
-
-; KERNEL-ASM:   // .globl     kernel_0
-
-; KERNEL-ASM: .visible .entry kernel_0(
-; KERNEL-ASM-NEXT:   .param .u64 kernel_0_param_0
-; KERNEL-ASM-NEXT: )
-
-;    void double_parallel_loop(float A[][1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        for (long j = 0; j < 1024; j++)
-;          A[i][j] += i * j;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @double_parallel_loop(ptr %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb13, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
-  %exitcond1 = icmp ne i64 %i.0, 1024
-  br i1 %exitcond1, label %bb3, label %bb15
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb10, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %j.0, 1024
-  br i1 %exitcond, label %bb5, label %bb12
-
-bb5:                                              ; preds = %bb4
-  %tmp = mul nuw nsw i64 %i.0, %j.0
-  %tmp6 = sitofp i64 %tmp to float
-  %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0
-  %tmp8 = load float, ptr %tmp7, align 4
-  %tmp9 = fadd float %tmp8, %tmp6
-  store float %tmp9, ptr %tmp7, align 4
-  br label %bb10
-
-bb10:                                             ; preds = %bb5
-  %tmp11 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12
-  %tmp14 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb15:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/failing-invariant-load-handling.ll b/polly/test/GPGPU/failing-invariant-load-handling.ll
deleted file mode 100644
index 70f88667bd60..000000000000
--- a/polly/test/GPGPU/failing-invariant-load-handling.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
-
-%S = type { i32, i32, [12 x %L] }
-%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
-
-define void @test(ptr %cpi, i1 %b) {
-; SCOPS-LABEL: Region: %if.then14---%exit
-; SCOPS:         Invariant Accesses: {
-; SCOPS-NEXT:            ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; SCOPS-NEXT:                [l2, l1] -> { Stmt_for_body_i[i0] -> MemRef_cpi[0, 0] };
-; SCOPS-NEXT:            Execution Context: [l2, l1] -> {  :  }
-; SCOPS-NEXT:            ReadAccess :=       [Reduction Type: NONE] [Scalar: 0]
-; SCOPS-NEXT:                [l2, l1] -> { Stmt_for_body_lr_ph_i[] -> MemRef_cpi[0, 1] };
-; SCOPS-NEXT:            Execution Context: [l2, l1] -> {  : l2 > 0 }
-; SCOPS-NEXT:    }
-; SCOPS:         Arrays {
-; SCOPS-NEXT:        i32 MemRef_cpi[*][(10 * %l1)]; // Element size 4
-; SCOPS-NEXT:    }
-
-; Check that we gracefully handle failing invariant loads.
-; This test case is taken from:
-; test/Isl/CodeGen/invariant-load-dimension.ll
-
-; FIXME: Figure out how to actually generate code for this loop.
-; CODEGEN-NOT: LLVM ERROR: preloading invariant loads failed in function
-
-entry:
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
diff --git a/polly/test/GPGPU/failing-invariant-load-hoisting.ll b/polly/test/GPGPU/failing-invariant-load-hoisting.ll
deleted file mode 100644
index aa62921e1af5..000000000000
--- a/polly/test/GPGPU/failing-invariant-load-hoisting.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -S < %s -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
-
-%S = type { i32, i32, [12 x %L] }
-%L = type { i32, i32, double, i32, i32, i32, i32, i32 }
-
-define void @test(ptr %cpi, i1 %b) {
-; CODEGEN-LABEL: @test(
-; CODEGEN:    polly.preload.begin:
-; CODEGEN-NEXT:  br i1 false
-
-entry:
-  %nt = getelementptr inbounds %S, ptr %cpi, i32 0, i32 1
-  br i1 %b, label %if.then14, label %exit
-
-if.then14:
-  %l0 = load i32, ptr %cpi, align 8
-  %cmp12.i = icmp sgt i32 %l0, 0
-  br i1 %cmp12.i, label %for.body.lr.ph.i, label %exit
-
-for.body.lr.ph.i:
-  %l1 = load i32, ptr %nt, align 4
-  br label %for.body.i
-
-for.body.i:
-  %phi = phi i32 [ 0, %for.body.lr.ph.i ], [ %inc, %for.body.i ]
-  %mul.i163 = mul nsw i32 %phi, %l1
-  %cv = getelementptr inbounds %S, ptr %cpi, i32 0, i32 2, i32 %mul.i163, i32 0
-  store i32 0, ptr %cv, align 8
-  %inc = add nuw nsw i32 %phi, 1
-  %l2 = load i32, ptr %cpi, align 8
-  %cmp.i164 = icmp slt i32 %inc, %l2
-  br i1 %cmp.i164, label %for.body.i, label %exit
-
-exit:
-  ret void
-}
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
deleted file mode 100644
index 5ba65d60819c..000000000000
--- a/polly/test/GPGPU/host-control-flow.ll
+++ /dev/null
@@ -1,176 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -disable-output \
-; RUN: -polly-acc-dump-kernel-ir < %s | FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | FileCheck %s -check-prefix=IR
-;    void foo(float A[2][100]) {
-;      for (long t = 0; t < 100; t++)
-;        for (long i = 1; i < 99; i++)
-;          A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
-;    }
-
-; REQUIRES: pollyacc
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   for (int c0 = 0; c0 <= 99; c0 += 1)
-; CODE-NEXT:     {
-; CODE-NEXT:       dim3 k0_dimBlock(32);
-; CODE-NEXT:       dim3 k0_dimGrid(4);
-; CODE-NEXT:       kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, c0);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT: }
-
-; IR-LABEL: polly.loop_header:                                ; preds = %polly.loop_header, %polly.loop_preheader
-; IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
-; ...
-; IR:  store i64 %polly.indvar, i64* %polly_launch_0_param_1
-; IR-NEXT:  [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT:  [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
-; IR-NEXT:  store i8* [[REGB]], i8** [[REGA]]
-; IR: call i8* @polly_getKernel
-; ...
-; IR: call void @polly_freeKernel
-; IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
-; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar_next, 99
-; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
-
-; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A, i64 %c0)
-; KERNEL-IR-LABEL: entry:
-; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
-; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-IR-NEXT:   %t0 = zext i32 %1 to i64
-; KERNEL-IR-NEXT:   br label %polly.cond
-
-; KERNEL-IR-LABEL: polly.cond:                                       ; preds = %entry
-; KERNEL-IR-NEXT:   %2 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %3 = add nsw i64 %2, %t0
-; KERNEL-IR-NEXT:   %4 = icmp sle i64 %3, 97
-; KERNEL-IR-NEXT:   br i1 %4, label %polly.then, label %polly.else
-
-; KERNEL-IR-LABEL: polly.merge:                                      ; preds = %polly.else, %polly.stmt.for.body3
-; KERNEL-IR-NEXT:   ret void
-
-; KERNEL-IR-LABEL: polly.then:                                       ; preds = %polly.cond
-; KERNEL-IR-NEXT:   %5 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %6 = add nsw i64 %5, %t0
-; KERNEL-IR-NEXT:   br label %polly.stmt.for.body3
-
-; KERNEL-IR-LABEL: polly.stmt.for.body3:                             ; preds = %polly.then
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
-; KERNEL-IR-NEXT:   %7 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %8 = add nsw i64 %7, %t0
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
-; KERNEL-IR-NEXT:   %tmp_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r2 = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
-; KERNEL-IR-NEXT:   %9 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %10 = add nsw i64 %9, %t0
-; KERNEL-IR-NEXT:   %11 = add nsw i64 %10, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %11
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
-; KERNEL-IR-NEXT:   %tmp2_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A5, align 4
-; KERNEL-IR-NEXT:   %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A6 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %pexp.pdiv_r7 = urem i64 %c0, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
-; KERNEL-IR-NEXT:   %12 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %13 = add nsw i64 %12, %t0
-; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, 2
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %14
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A10 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
-; KERNEL-IR-NEXT:   %tmp3_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A10, align 4
-; KERNEL-IR-NEXT:   %p_add12 = fadd float %p_add, %tmp3_p_scalar_
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A11 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %15 = add nsw i64 %c0, 1
-; KERNEL-IR-NEXT:   %pexp.pdiv_r12 = urem i64 %15, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
-; KERNEL-IR-NEXT:   %16 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %17 = add nsw i64 %16, %t0
-; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %18
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A15 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
-; KERNEL-IR-NEXT:   %tmp4_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A15, align 4
-; KERNEL-IR-NEXT:   %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
-; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A16 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-IR-NEXT:   %19 = add nsw i64 %c0, 1
-; KERNEL-IR-NEXT:   %pexp.pdiv_r17 = urem i64 %19, 2
-; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
-; KERNEL-IR-NEXT:   %20 = mul nsw i64 32, %b0
-; KERNEL-IR-NEXT:   %21 = add nsw i64 %20, %t0
-; KERNEL-IR-NEXT:   %22 = add nsw i64 %21, 1
-; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %22
-; KERNEL-IR-NEXT:   %polly.access.MemRef_A20 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
-; KERNEL-IR-NEXT:   store float %p_add17, float addrspace(1)* %polly.access.MemRef_A20, align 4
-; KERNEL-IR-NEXT:   br label %polly.merge
-
-; KERNEL-IR-LABEL: polly.else:                                       ; preds = %polly.cond
-; KERNEL-IR-NEXT:   br label %polly.merge
-; KERNEL-IR-NEXT: }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo([100 x float]* %A) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc18, %entry
-  %t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
-  %exitcond1 = icmp ne i64 %t.0, 100
-  br i1 %exitcond1, label %for.body, label %for.end20
-
-for.body:                                         ; preds = %for.cond
-  br label %for.cond1
-
-for.cond1:                                        ; preds = %for.inc, %for.body
-  %i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
-  %exitcond = icmp ne i64 %i.0, 99
-  br i1 %exitcond, label %for.body3, label %for.end
-
-for.body3:                                        ; preds = %for.cond1
-  %sub = add nsw i64 %i.0, -1
-  %rem = srem i64 %t.0, 2
-  %arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
-  %tmp = load float, float* %arrayidx4, align 4
-  %rem5 = srem i64 %t.0, 2
-  %arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
-  %tmp2 = load float, float* %arrayidx7, align 4
-  %add = fadd float %tmp, %tmp2
-  %add8 = add nuw nsw i64 %i.0, 1
-  %rem9 = srem i64 %t.0, 2
-  %arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
-  %tmp3 = load float, float* %arrayidx11, align 4
-  %add12 = fadd float %add, %tmp3
-  %add13 = add nuw nsw i64 %t.0, 1
-  %rem14 = srem i64 %add13, 2
-  %arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
-  %tmp4 = load float, float* %arrayidx16, align 4
-  %add17 = fadd float %tmp4, %add12
-  store float %add17, float* %arrayidx16, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body3
-  %inc = add nuw nsw i64 %i.0, 1
-  br label %for.cond1
-
-for.end:                                          ; preds = %for.cond1
-  br label %for.inc18
-
-for.inc18:                                        ; preds = %for.end
-  %inc19 = add nuw nsw i64 %t.0, 1
-  br label %for.cond
-
-for.end20:                                        ; preds = %for.cond
-  ret void
-}
diff --git a/polly/test/GPGPU/host-statement.ll b/polly/test/GPGPU/host-statement.ll
deleted file mode 100644
index d7232b2fa538..000000000000
--- a/polly/test/GPGPU/host-statement.ll
+++ /dev/null
@@ -1,204 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -polly-invariant-load-hoisting=false \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @llvm.lifetime.start(i64, ptr nocapture) #0
-
-; This test case tests that we can correctly handle a ScopStmt that is
-; scheduled on the host, instead of within a kernel.
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(16);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   if (p_0 <= 510 && p_1 <= 510) {
-; CODE-NEXT:     {
-; CODE-NEXT:       dim3 k1_dimBlock(32);
-; CODE-NEXT:       dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:       kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:     {
-; CODE-NEXT:       dim3 k2_dimBlock(16, 32);
-; CODE-NEXT:       dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
-; CODE-NEXT:       kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_R, dev_MemRef_Q, p_0, p_1);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:   }
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT:     Stmt_for_cond33_preheader_last();
-
-; CODE: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
-; CODE-NEXT:   for (int c1 = 0; c1 <= 15; c1 += 1) {
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510 && c1 == 0)
-; CODE-NEXT:       Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:     if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510)
-; CODE-NEXT:       for (int c3 = 0; c3 <= 31; c3 += 1)
-; CODE-NEXT:         Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
-; CODE-NEXT:     sync0();
-; CODE-NEXT:   }
-
-; CODE: # kernel2
-; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
-; CODE-NEXT:   if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
-; CODE-NEXT:     for (int c3 = 0; c3 <= 1; c3 += 1)
-; CODE-NEXT:       Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
-
-; KERNEL-IR: call void @llvm.nvvm.barrier0()
-
-; Function Attrs: nounwind uwtable
-define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, ptr %A, ptr %R, ptr %Q) #1 {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %entry.split, %for.inc86
-  %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
-  %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.cond1.preheader, %for.inc
-  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
-  %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
-  %arrayidx5 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp = load double, ptr %arrayidx5, align 8, !tbaa !1
-  %arrayidx9 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv, i64 %indvars.iv24
-  %tmp27 = load double, ptr %arrayidx9, align 8, !tbaa !1
-  %mul = fmul double %tmp, %tmp27
-  %add = fadd double %nrm.02, %mul
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 512
-  br i1 %exitcond, label %for.inc, label %for.end
-
-for.end:                                          ; preds = %for.inc
-  %add.lcssa = phi double [ %add, %for.inc ]
-  %call = tail call double @sqrt(double %add.lcssa) #2
-  %arrayidx13 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
-  store double %call, ptr %arrayidx13, align 8, !tbaa !1
-  br label %for.body16
-
-for.cond33.preheader:                             ; preds = %for.body16
-  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
-  %cmp347 = icmp slt i64 %indvars.iv.next25, 512
-  br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
-
-for.body35.lr.ph:                                 ; preds = %for.cond33.preheader
-  br label %for.body35
-
-for.body16:                                       ; preds = %for.end, %for.body16
-  %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
-  %arrayidx20 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv10, i64 %indvars.iv24
-  %tmp28 = load double, ptr %arrayidx20, align 8, !tbaa !1
-  %arrayidx24 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv24
-  %tmp29 = load double, ptr %arrayidx24, align 8, !tbaa !1
-  %div = fdiv double %tmp28, %tmp29
-  %arrayidx28 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv10, i64 %indvars.iv24
-  store double %div, ptr %arrayidx28, align 8, !tbaa !1
-  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
-  %exitcond12 = icmp ne i64 %indvars.iv.next11, 512
-  br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
-
-for.cond33.loopexit:                              ; preds = %for.body62
-  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
-  %exitcond23 = icmp ne i32 %lftr.wideiv, 512
-  br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
-
-for.body35:                                       ; preds = %for.body35.lr.ph, %for.cond33.loopexit
-  %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
-  %arrayidx39 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  store double 0.000000e+00, ptr %arrayidx39, align 8, !tbaa !1
-  br label %for.body42
-
-for.cond60.preheader:                             ; preds = %for.body42
-  br label %for.body62
-
-for.body42:                                       ; preds = %for.body35, %for.body42
-  %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
-  %arrayidx46 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv13, i64 %indvars.iv24
-  %tmp30 = load double, ptr %arrayidx46, align 8, !tbaa !1
-  %arrayidx50 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv13, i64 %indvars.iv21
-  %tmp31 = load double, ptr %arrayidx50, align 8, !tbaa !1
-  %mul51 = fmul double %tmp30, %tmp31
-  %arrayidx55 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  %tmp32 = load double, ptr %arrayidx55, align 8, !tbaa !1
-  %add56 = fadd double %tmp32, %mul51
-  store double %add56, ptr %arrayidx55, align 8, !tbaa !1
-  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
-  %exitcond15 = icmp ne i64 %indvars.iv.next14, 512
-  br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
-
-for.body62:                                       ; preds = %for.cond60.preheader, %for.body62
-  %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
-  %arrayidx66 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
-  %tmp33 = load double, ptr %arrayidx66, align 8, !tbaa !1
-  %arrayidx70 = getelementptr inbounds [512 x double], ptr %Q, i64 %indvars.iv16, i64 %indvars.iv24
-  %tmp34 = load double, ptr %arrayidx70, align 8, !tbaa !1
-  %arrayidx74 = getelementptr inbounds [512 x double], ptr %R, i64 %indvars.iv24, i64 %indvars.iv21
-  %tmp35 = load double, ptr %arrayidx74, align 8, !tbaa !1
-  %mul75 = fmul double %tmp34, %tmp35
-  %sub = fsub double %tmp33, %mul75
-  %arrayidx79 = getelementptr inbounds [512 x double], ptr %A, i64 %indvars.iv16, i64 %indvars.iv21
-  store double %sub, ptr %arrayidx79, align 8, !tbaa !1
-  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
-  %exitcond18 = icmp ne i64 %indvars.iv.next17, 512
-  br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
-
-for.cond33.for.inc86_crit_edge:                   ; preds = %for.cond33.loopexit
-  br label %for.inc86
-
-for.inc86:                                        ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
-  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
-  %exitcond26 = icmp ne i64 %indvars.iv.next25, 512
-  br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
-
-for.end88:                                        ; preds = %for.inc86
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #0
-
-; Function Attrs: nounwind
-declare double @sqrt(double) #2
-
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"double", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/ignore-parameter-bounds.ll b/polly/test/GPGPU/ignore-parameter-bounds.ll
deleted file mode 100644
index 1d0b5482941e..000000000000
--- a/polly/test/GPGPU/ignore-parameter-bounds.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-; CODE: Code
-; CODE: ====
-; CODE: No code generated
-
-source_filename = "bugpoint-output-83bcdeb.bc"
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-@__data_radiation_MOD_cobi = external global [168 x double], align 32
-
-; Function Attrs: nounwind uwtable
-define void @__radiation_rg_MOD_coe_so() #0 {
-entry:
-  %polly.access.kspec.load = load i32, ptr undef, align 4
-  %0 = or i1 undef, undef
-  br label %polly.preload.cond29
-
-polly.preload.cond29:                             ; preds = %entry
-  br i1 %0, label %polly.preload.exec31, label %polly.preload.merge30
-
-polly.preload.merge30:                            ; preds = %polly.preload.exec31, %polly.preload.cond29
-  %polly.preload..merge32 = phi double [ %polly.access.__data_radiation_MOD_cobi.load, %polly.preload.exec31 ], [ 0.000000e+00, %polly.preload.cond29 ]
-  ret void
-
-polly.preload.exec31:                             ; preds = %polly.preload.cond29
-  %1 = sext i32 %polly.access.kspec.load to i64
-  %2 = mul nsw i64 7, %1
-  %3 = add nsw i64 0, %2
-  %4 = add nsw i64 %3, 48
-  %polly.access.__data_radiation_MOD_cobi = getelementptr double, ptr @__data_radiation_MOD_cobi, i64 %4
-  %polly.access.__data_radiation_MOD_cobi.load = load double, ptr %polly.access.__data_radiation_MOD_cobi, align 8
-  br label %polly.preload.merge30
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll b/polly/test/GPGPU/intrinsic-copied-into-kernel.ll
deleted file mode 100644
index 7c1e3672abb5..000000000000
--- a/polly/test/GPGPU/intrinsic-copied-into-kernel.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR
-; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg  < %s | FileCheck %s --check-prefix=HOST-IR
-
-; Test that we do recognise and codegen a kernel that has intrinsics.
-
-; REQUIRES: pollyacc
-
-; Check that we model the kernel as a scop.
-; SCOP:      Function: f
-; SCOP-NEXT:       Region: %entry.split---%for.end
-
-; Check that the intrinsic call is present in the kernel IR.
-; KERNEL-IR:   %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
-; KERNEL-IR:   declare float @llvm.sqrt.f32(float)
-; KERNEL-IR:   declare float @llvm.fabs.f32(float)
-
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
-
-
-; void f(float *A, float *B, int N) {
-;   for(int i = 0; i < N; i++) {
-;       float tmp0 = A[i];
-;       float tmp1 = sqrt(tmp0);
-;       float tmp2 = fabs(tmp1);
-;       float tmp3 = copysignf(tmp1, tmp2);
-;       B[i] = tmp3;
-;   }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(float* %A, float* %B, i32 %N) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %cmp1 = icmp sgt i32 %N, 0
-  br i1 %cmp1, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
-  %A.arr.i.val = load float, float* %A.arr.i, align 4
-  ; Call to intrinsics that should be part of the kernel.
-  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
-  %fabs = tail call float @llvm.fabs.f32(float %sqrt);
-  %copysign = tail call float @llvm.copysign.f32(float %sqrt, float %fabs);
-  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
-  store float %copysign, float* %B.arr.i, align 4
-
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %wide.trip.count = zext i32 %N to i64
-  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare float @llvm.sqrt.f32(float) #0
-declare float @llvm.fabs.f32(float) #0
-declare float @llvm.copysign.f32(float, float) #0
-
-attributes #0 = { nounwind readnone }
-
diff --git a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll b/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
deleted file mode 100644
index 4b9139f0b44c..000000000000
--- a/polly/test/GPGPU/invalid-kernel-assert-verifymodule.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg  -polly-acc-fail-on-verify-module-failure \
-; RUN: -disable-output < %s
-
-; Make sure that if -polly-acc-fail-on-verify-module-failure is on, we actually
-; fail on an illegal module.
-
-; REQUIRES: pollyacc, asserts
-; XFAIL: *
-;
-;    void foo(long A[1024], long B[1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += (B[i] + (long)&B[i]);
-;    }
-
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:                                             ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:                                             ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/invalid-kernel.ll b/polly/test/GPGPU/invalid-kernel.ll
deleted file mode 100644
index 9dd32eac97c0..000000000000
--- a/polly/test/GPGPU/invalid-kernel.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: not FileCheck %s -check-prefix=KERNEL-IR
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; REQUIRES: pollyacc
-;
-;    void foo(long A[1024], long B[1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += (B[i] + (long)&B[i]);
-;    }
-
-; This kernel loads/stores a pointer address we model. This is a rare case,
-; were we still lack proper code-generation support. We check here that we
-; detect the invalid IR and bail out gracefully.
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B, dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; KERNEL-IR: kernel
-
-; IR: br i1 false, label %polly.start, label %bb1
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb10, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb12
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, ptr %B, i64 %i.0
-  %tmp5 = ptrtoint ptr %tmp4 to i64
-  %tmp6 = add nsw i64 %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp8 = load i64, ptr %tmp7, align 8
-  %tmp9 = add nsw i64 %tmp8, %tmp6
-  store i64 %tmp9, ptr %tmp7, align 8
-  br label %bb10
-
-bb10:                                             ; preds = %bb2
-  %tmp11 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb12:                                             ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-array-access.ll b/polly/test/GPGPU/invariant-load-array-access.ll
deleted file mode 100644
index 02c0330a7e7e..000000000000
--- a/polly/test/GPGPU/invariant-load-array-access.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop.
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp] -> { Stmt_for_body[i0] -> MemRef_control[0] };
-; SCOP-NEXT:         Execution Context: [tmp] -> {  :  }
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp] -> { Stmt_if_then[i0] -> MemRef_readarr[0] };
-; SCOP-NEXT:         Execution Context: [tmp] -> {  : tmp >= 4 }
-; SCOP-NEXT: }
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; This test makes sure that such an access pattern is handled correctly
-; by PPCGCodeGeneration. It appears that not calling `preloadInvariantLoads`
-; was the main reason that caused this test case to crash.
-;
-; void f(int *arr, const int *control, const int *readarr) {
-;     for(int i = 0; i < 1000; i++) {
-;         int t = 0;
-;         if (*control > 3) {
-;             t += *readarr;
-;         }
-;         arr[i] = t;
-;     }
-; }
-
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-define void @f(ptr %arr, ptr %control, ptr %readarr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %if.end
-  %i.01 = phi i32 [ 0, %entry.split ], [ %inc, %if.end ]
-  %tmp = load i32, ptr %control, align 4
-  %cmp1 = icmp sgt i32 %tmp, 3
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:                                          ; preds = %for.body
-  %tmp1 = load i32, ptr %readarr, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.0 = phi i32 [ %tmp1, %if.then ], [ 0, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.01
-  store i32 %t.0, ptr %arrayidx, align 4
-  %inc = add nuw nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 1000
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %if.end
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-escaping-values.ll b/polly/test/GPGPU/invariant-load-escaping-values.ll
deleted file mode 100644
index 54f4b43fdb92..000000000000
--- a/polly/test/GPGPU/invariant-load-escaping-values.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: opt %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK: store i64 %polly.access.B.load, ptr %invariant.preload.s2a
-; CHECK: %invariant.final_reload = load i64, ptr %invariant.preload.s2a
-
-; Verify that the final reload of an invariant scalar memory access uses the
-; same stack slot that into which the invariant memory access was stored
-; originally. Earlier, this was broken as we introduce a new stack slot aside
-; of the preload stack slot, which remained uninitialized and caused our escaping
-; loads to contain garbage.
-
-define i64 @foo(ptr %A, ptr %B) {
-entry:
-  br label %loop
-
-loop:
-  %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
-  %indvar.next = add nsw i64 %indvar, 1
-  %idx = getelementptr float, ptr %A, i64 %indvar
-  store float 42.0, ptr %idx
-  %invariant = load i64, ptr %B
-  %cmp = icmp sle i64 %indvar, 1024
-  br i1 %cmp, label %loop, label %exit
-
-exit:
-  ret i64 %invariant
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll b/polly/test/GPGPU/invariant-load-hoisting-of-array.ll
deleted file mode 100644
index 015a3dacbe10..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting-of-array.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt -opaque-pointers=0 %loadPolly -S -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Entry: Contains (%loaded.ptr.preload.s2a = alloca double*) which is
-;   |    invariant load hoisted `%loaded.ptr`
-;   v
-; Run-time check --(failure branch)--> { old code - contains `%loaded.ptr` }
-;   |
-;  (success branch)
-;   |
-;   v
-; New Code: Should refer to `%loaded.ptr.preload.s2a`, which is
-;           the invariant load hoisted value, NOT `%loaded.ptr`.
-
-; In Polly, we preserve the old code and create a separate branch that executes
-; the GPU code if a run-time check succeeds.
-
-; We need to make sure that in the new branch, we pick up invariant load hoisted
-; values. The old values will belong to the old code branch.
-
-; In this case, we use to try to load the 'original' %loaded.ptr in the
-; 'New Code' branch,which is wrong. Check that this does not happen.
-
-; Check that we have a Scop with an invariant load of the array.
-; SCOP:       Function: f
-; SCOP-NEXT:  Region: %arrload---%for.exit
-; SCOP-NEXT:  Max Loop Depth:  1
-; SCOP-NEXT:  Invariant Accesses: {
-; SCOP-NEXT:          ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:              { Stmt_arrload[] -> MemRef_arr_of_ptrs[0] };
-
-
-
-; Check that we have the preloaded array.
-; HOST-IR: entry:
-; HOST-IR-NEXT:  %loaded.ptr.preload.s2a = alloca double*
-
-; Chek that we store the correct value in the preload.
-; polly.preload.begin:                              ; preds = %polly.split_new_and_old
-; HOST-IR: %polly.access.arr.of.ptrs = getelementptr double*, double** %arr.of.ptrs, i64 0
-; HOST-IR-NEXT: %polly.access.arr.of.ptrs.load = load double*, double** %polly.access.arr.of.ptrs
-; HOST-IR-NEXT: store double* %polly.access.arr.of.ptrs.load, double** %loaded.ptr.preload.s2a
-
-; Check that we get back data from the kernel.
-; HOST-IR: polly.acc.initialize:                             ; preds = %polly.start
-; HOST-IR: [[FIRSTINDEX:%.+]] = getelementptr double, double* %polly.access.arr.of.ptrs.load, i64 1
-; HOST-IR: [[BITCASTED:%.+]] = bitcast double* [[FIRSTINDEX]] to i8*
-; HOST-IR: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_loaded_ptr, i8* [[BITCASTED]], i64 800)
-
-; Check that the kernel launch is generated in the host IR.
-; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
-
-
-; C pseudocode equivalent
-; void f(double **arr_of_ptrs) {
-;     double *loaded_ptr = arr_of_ptrs[0];
-;     if (false) { return; }
-;     else {
-;         for(int i = 1; i < 100; i++) {
-;             loaded_ptr[i] = 42.0;
-;         }
-;     }
-; }
-
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-
-; Function Attrs: nounwind uwtable
-define void @f(double **%arr.of.ptrs) #0 {
-entry:
-  br label %arrload
-
-arrload:                                             ; preds = %"7"
-  %loaded.ptr = load double*, double** %arr.of.ptrs, align 8
-  br i1 false, label %"for.exit", label %"for.preheader"
-
-"for.preheader":                                       ; preds = %"51"
-  br label %"for.body"
-
-"for.body":                                             ; preds = %"53", %"53.lr.ph"
-  %indvar = phi i64 [ 1, %"for.preheader" ], [ %indvar.next, %"for.body" ]
-  %slot = getelementptr double, double* %loaded.ptr, i64 %indvar
-  store double 42.0, double* %slot, align 8
-
-  %indvar.next = add nuw nsw i64 %indvar, 1
-
-  %check = icmp sgt i64 %indvar.next, 100
-  br i1 %check, label %"for.exit", label %"for.body"
-
-"for.exit":                                             ; preds = %"52.54_crit_edge", %"51"
-    ret void
-}
-
-attributes #0 = { nounwind uwtable }
diff --git a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll b/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll
deleted file mode 100644
index ad30ef6f9b24..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting-read-in-kernel.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=HOST-IR %s
-
-; RUN: opt %loadPolly -disable-output -polly-acc-dump-kernel-ir \
-; RUN: -polly-codegen-ppcg -polly-scops \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-; Verify that invariant loads used in a kernel statement are correctly forwarded
-; as subtree value to the GPU kernel.
-
-; HOST-IR: store float %polly.access.p.load, ptr %invariant.preload.s2a, align 4
-
-; KERNEL-IR:  define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2({{.*}}ptr addrspace(1) %MemRef_indvar2f__phi{{.*}})
-; KERNEL-IR:   %indvar2f.phiops.reload = load float, ptr %indvar2f.phiops, align 4
-; KERNEL-IR:   store float %indvar2f.phiops.reload, ptr addrspace(1) %polly.access.MemRef_A, align 4
-
-; FIXME: store float %indvar2f.phiops.reload, ptr %indvar2f.phiops, align 4
-; For some reason the above instruction is emitted that stores back to the addess it was just loaded from.
-
-define void @foo(ptr %A, ptr %p) {
-entry:
-  br label %loop
-
-loop:
-  %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
-  %indvar.next = add i64 %indvar, 1
-  %invariant = load float, ptr %p
-  %ptr = getelementptr float, ptr %A, i64 %indvar
-  store float 42.0, ptr %ptr
-  %cmp = icmp sle i64 %indvar, 1024
-  br i1 %cmp, label %loop, label %anotherloop
-
-anotherloop:
-  %indvar2 = phi i64 [0, %loop], [%indvar2.next, %anotherloop]
-  %indvar2f = phi float [%invariant, %loop], [%indvar2f, %anotherloop]
-  %indvar2.next = add i64 %indvar2, 1
-  store float %indvar2f, ptr %A
-  %cmp2 = icmp sle i64 %indvar2, 1024
-  br i1 %cmp2, label %anotherloop, label %end
-
-end:
-  ret void
-
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll
deleted file mode 100644
index 7a650eeb22ee..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-bounds.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %entry.split---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp1, tmp4] -> { Stmt_entry_split[] -> MemRef_begin[0] };
-; SCOP-NEXT:         Execution Context: [tmp1, tmp4] -> {  :  }
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp1, tmp4] -> { Stmt_for_body[i0] -> MemRef_end[0] };
-; SCOP-NEXT:         Execution Context: [tmp1, tmp4] -> {  :  }
-; SCOP-NEXT: }
-
-
-; Check that the kernel launch is generated in the host IR.
-; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-;    void f(int *begin, int *end, int *arr) {
-;      for (int i = *begin; i < *end; i++) {
-;        arr[i] = 0;
-;      }
-;    }
-;
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-
-define void @f(ptr %begin, ptr %end, ptr %arr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %tmp1 = load i32, ptr %begin, align 4
-  %tmp41 = load i32, ptr %end, align 4
-  %cmp2 = icmp slt i32 %tmp1, %tmp41
-  br i1 %cmp2, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.03 = phi i32 [ %tmp1, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %i.03
-  store i32 0, ptr %arrayidx, align 4
-  %inc = add nsw i32 %i.03, 1
-  %tmp4 = load i32, ptr %end, align 4
-  %cmp = icmp slt i32 %inc, %tmp4
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll
deleted file mode 100644
index a637cc44c7a3..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-lower-bound.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop with invariant accesses.
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %entry.split---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [beginval] -> { Stmt_entry_split[] -> MemRef_begin[0] };
-; SCOP-NEXT:         Execution Context: [beginval] -> {  :  }
-; SCOP-NEXT: }
-
-; Check that the kernel launch is generated in the host IR.
-; This declaration would not have been generated unless a kernel launch exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-;
-; void f(int *begin, int *arr) {
-;     for (int i = *begin; i < 100; i++) {
-;         arr[i] = 0;
-;     }
-; }
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-
-define void @f(ptr %begin, ptr %arr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %beginval = load i32, ptr %begin, align 4
-  %cmp1 = icmp slt i32 %beginval, 100
-  br i1 %cmp1, label %for.body, label %for.end
-
-
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %ival = phi i32 [ %beginval, %entry.split ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i32 %ival
-  store i32 0, ptr %arrayidx, align 4
-  %inc = add nsw i32 %ival, 1
-  %cmp = icmp slt i32 %ival, 99
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll b/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll
deleted file mode 100644
index 3c19a306734a..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting-with-variable-upper-bound.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; Check that we detect a scop with invariant accesses.
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %entry.split---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp2] -> { Stmt_for_body[i0] -> MemRef_idx[0] };
-; SCOP-NEXT:         Execution Context: [tmp2] -> {  :  }
-; SCOP-NEXT: }
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; Check if we generate GPU code for simple loop with variable upper bound.
-; This always worked, but have this test to prevent regressions.
-;    void f(int *idx, int *arr) {
-;      for (int i = 0; i < *idx; i++) {
-;        arr[i] = 0;
-;      }
-;    }
-;
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(ptr %idx, ptr %arr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %tmp21 = load i32, ptr %idx, align 4
-  %cmp2 = icmp sgt i32 %tmp21, 0
-  br i1 %cmp2, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %indvars.iv
-  store i32 0, ptr %arrayidx, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %tmp2 = load i32, ptr %idx, align 4
-  %0 = sext i32 %tmp2 to i64
-  %cmp = icmp slt i64 %indvars.iv.next, %0
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-hoisting.ll b/polly/test/GPGPU/invariant-load-hoisting.ll
deleted file mode 100644
index 5ae1cfae255d..000000000000
--- a/polly/test/GPGPU/invariant-load-hoisting.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-;
-; RUN: opt %loadPolly -polly-scops -S  -polly-invariant-load-hoisting \
-; RUN: -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
-;
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=KERNEL-IR
-;
-; REQUIRES: pollyacc
-;
-; SCOP:       Function: f
-; SCOP-NEXT:  Region: %entry.split---%for.end26
-; SCOP-NEXT:  Max Loop Depth:  3
-; SCOP-NEXT:  Invariant Accesses: {
-; SCOP-NEXT:          ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:              [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] };
-; SCOP-NEXT:          Execution Context: [n, tmp12] -> {  : n > 0 }
-; SCOP-NEXT:  }
-; HOST-IR:      call void @polly_launchKernel(ptr %[[REGC:[0-9]+]], i32 %{{[0-9]+}}, i32 1, i32 32, i32 1, i32 1, ptr %polly_launch_0_params_i8ptr)
-; HOST-IR-NEXT: call void @polly_freeKernel(ptr %[[REGC]])
-
-; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_B, ptr addrspace(1) %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge)
-
-
-; Check that we generate correct GPU code in case of invariant load hoisting.
-;
-;
-;    static const int N = 3000;
-;
-;    void f(int A[N][N], int *invariant, int B[N][N], int n) {
-;      for (int i = 0; i < n; i++) {
-;        for (int j = 0; j < n; j++) {
-;          for (int k = 0; k < n; k++) {
-;
-;            A[*invariant][k] = B[k][k];
-;            A[k][*invariant] += B[k][k];
-;          }
-;        }
-;      }
-;    }
-;
-
-define void @f(ptr %A, ptr %invariant, ptr %B, i32 %n) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %cmp6 = icmp sgt i32 %n, 0
-  br i1 %cmp6, label %for.cond1.preheader.lr.ph, label %for.end26
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry.split
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.inc24
-  %i.07 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc25, %for.inc24 ]
-  %cmp23 = icmp sgt i32 %n, 0
-  br i1 %cmp23, label %for.cond4.preheader.lr.ph, label %for.inc24
-
-for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.preheader
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %for.cond4.preheader.lr.ph, %for.inc21
-  %j.04 = phi i32 [ 0, %for.cond4.preheader.lr.ph ], [ %inc22, %for.inc21 ]
-  %cmp51 = icmp sgt i32 %n, 0
-  br i1 %cmp51, label %for.body6.lr.ph, label %for.inc21
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.preheader
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6.lr.ph, %for.body6
-  %k.02 = phi i32 [ 0, %for.body6.lr.ph ], [ %inc, %for.body6 ]
-  %idxprom = sext i32 %k.02 to i64
-  %idxprom7 = sext i32 %k.02 to i64
-  %arrayidx8 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom, i64 %idxprom7
-  %tmp9 = load i32, ptr %arrayidx8, align 4
-  %tmp12 = load i32, ptr %invariant, align 4
-  %idxprom9 = sext i32 %tmp12 to i64
-  %idxprom11 = sext i32 %k.02 to i64
-  %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom9, i64 %idxprom11
-  store i32 %tmp9, ptr %arrayidx12, align 4
-  %idxprom13 = sext i32 %k.02 to i64
-  %idxprom15 = sext i32 %k.02 to i64
-  %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %B, i64 %idxprom13, i64 %idxprom15
-  %tmp17 = load i32, ptr %arrayidx16, align 4
-  %idxprom17 = sext i32 %k.02 to i64
-  %tmp21 = load i32, ptr %invariant, align 4
-  %idxprom19 = sext i32 %tmp21 to i64
-  %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %idxprom17, i64 %idxprom19
-  %tmp22 = load i32, ptr %arrayidx20, align 4
-  %add = add nsw i32 %tmp22, %tmp17
-  store i32 %add, ptr %arrayidx20, align 4
-  %inc = add nuw nsw i32 %k.02, 1
-  %cmp5 = icmp slt i32 %inc, %n
-  br i1 %cmp5, label %for.body6, label %for.cond4.for.inc21_crit_edge
-
-for.cond4.for.inc21_crit_edge:                    ; preds = %for.body6
-  br label %for.inc21
-
-for.inc21:                                        ; preds = %for.cond4.for.inc21_crit_edge, %for.cond4.preheader
-  %inc22 = add nuw nsw i32 %j.04, 1
-  %cmp2 = icmp slt i32 %inc22, %n
-  br i1 %cmp2, label %for.cond4.preheader, label %for.cond1.for.inc24_crit_edge
-
-for.cond1.for.inc24_crit_edge:                    ; preds = %for.inc21
-  br label %for.inc24
-
-for.inc24:                                        ; preds = %for.cond1.for.inc24_crit_edge, %for.cond1.preheader
-  %inc25 = add nuw nsw i32 %i.07, 1
-  %cmp = icmp slt i32 %inc25, %n
-  br i1 %cmp, label %for.cond1.preheader, label %for.cond.for.end26_crit_edge
-
-for.cond.for.end26_crit_edge:                     ; preds = %for.inc24
-  br label %for.end26
-
-for.end26:                                        ; preds = %for.cond.for.end26_crit_edge, %entry.split
-  ret void
-}
diff --git a/polly/test/GPGPU/invariant-load-of-scalar.ll b/polly/test/GPGPU/invariant-load-of-scalar.ll
deleted file mode 100644
index fbc1d4d7ecee..000000000000
--- a/polly/test/GPGPU/invariant-load-of-scalar.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck -check-prefix=SCOP %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=HOST-IR %s
-
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-invariant-load-hoisting \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-; Check that we offload invariant loads of scalars correctly.
-
-; Check that invariant loads are present.
-; SCOP:      Function: checkPrivatization
-; SCOP-NEXT: Region: %entry.split---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP-NEXT: Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp, tmp2] -> { Stmt_entry_split[] -> MemRef_begin[0] };
-; SCOP-NEXT:         Execution Context: [tmp, tmp2] -> {  :  }
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             [tmp, tmp2] -> { Stmt_for_body[i0] -> MemRef_end[0] };
-; SCOP-NEXT:         Execution Context: [tmp, tmp2] -> {  :  }
-; SCOP-NEXT: }
-;
-
-; Check that we do not actually allocate arrays for %begin, %end, since they are
-; invariant load hoisted.
-; HOST-IR: %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice
-; HOST-IR-NOT: call ptr @polly_allocateMemoryForDevice
-
-; Check that we send the invariant loaded scalars as parameters to the
-; kernel function.
-; KERNEL-IR: define ptx_kernel void @FUNC_checkPrivatization_SCOP_0_KERNEL_0
-; KERNEL-IR-SAME: (ptr addrspace(1) %MemRef_A, i32 %tmp,
-; KERNEL-IR-SAME: i32 %tmp2, i32 %polly.access.begin.load)
-
-
-; void checkScalarPointerOffload(int A[], int *begin, int *end) {
-;     for(int i = *begin; i < *end; i++) {
-;         A[i] = 10;
-;     }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.12.0"
-
-define void @checkPrivatization(ptr %A, ptr %begin, ptr %end) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %tmp = load i32, ptr %begin, align 4
-  %tmp21 = load i32, ptr %end, align 4
-  %cmp3 = icmp slt i32 %tmp, %tmp21
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  %tmp1 = sext i32 %tmp to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv4 = phi i64 [ %tmp1, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv4
-  store i32 10, ptr %arrayidx, align 4
-  %indvars.iv.next = add i64 %indvars.iv4, 1
-  %tmp2 = load i32, ptr %end, align 4
-  %tmp3 = sext i32 %tmp2 to i64
-  %cmp = icmp slt i64 %indvars.iv.next, %tmp3
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
deleted file mode 100644
index 87ae470e29bc..000000000000
--- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll
+++ /dev/null
@@ -1,106 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; REQUIRES: pollyacc
-;
-;    void kernel_params_only_some_arrays(float A[], float B[]) {
-;      for (long i = 0; i < 32; i++)
-;        A[i] += 42;
-;
-;      for (long i = 0; i < 32; i++)
-;        B[i] += 42;
-;    }
-
-; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0'
-; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0"
-; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
-
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
-; KERNEL-NEXT:   entry:
-; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
-; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-
-; KERNEL:     ret void
-; KERNEL-NEXT: }
-
-; KERNEL: ; ModuleID = 'FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1'
-; KERNEL-NEXT: source_filename = "FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1"
-; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
-
-; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
-; KERNEL-NEXT:   entry:
-; KERNEL-NEXT:     %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
-; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
-; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-
-; KERNEL:     ret void
-; KERNEL-NEXT: }
-
-
-; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
-; IR-NEXT:  [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
-; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
-; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
-; IR-NEXT:  store i8* [[DATA]], i8** [[SLOT]]
-
-; IR:       [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
-; IR-NEXT:  [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0
-; IR-NEXT:  store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
-; IR-NEXT:  [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
-; IR-NEXT:  store i8* [[DATA]], i8** [[SLOT]]
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @kernel_params_only_some_arrays(float* %A, float* %B) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
-  %exitcond1 = icmp ne i64 %i.0, 32
-  br i1 %exitcond1, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds float, float* %A, i64 %i.0
-  %tmp = load float, float* %arrayidx, align 4
-  %add = fadd float %tmp, 4.200000e+01
-  store float %add, float* %arrayidx, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %inc = add nuw nsw i64 %i.0, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  br label %for.cond2
-
-for.cond2:                                        ; preds = %for.inc7, %for.end
-  %i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ]
-  %exitcond = icmp ne i64 %i1.0, 32
-  br i1 %exitcond, label %for.body4, label %for.end9
-
-for.body4:                                        ; preds = %for.cond2
-  %arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0
-  %tmp2 = load float, float* %arrayidx5, align 4
-  %add6 = fadd float %tmp2, 4.200000e+01
-  store float %add6, float* %arrayidx5, align 4
-  br label %for.inc7
-
-for.inc7:                                         ; preds = %for.body4
-  %inc8 = add nuw nsw i64 %i1.0, 1
-  br label %for.cond2
-
-for.end9:                                         ; preds = %for.cond2
-  ret void
-}
diff --git a/polly/test/GPGPU/kernel-params-scop-parameter.ll b/polly/test/GPGPU/kernel-params-scop-parameter.ll
deleted file mode 100644
index 527492bfd5fb..000000000000
--- a/polly/test/GPGPU/kernel-params-scop-parameter.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL-IR %s
-
-; REQUIRES: pollyacc
-
-;    void kernel_params_scop_parameter(float A[], long n) {
-;      for (long i = 0; i < n; i++)
-;        A[i] += 42;
-;    }
-
-; KERNEL-IR: define ptx_kernel void @FUNC_kernel_params_scop_parameter_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i64 %n)
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @kernel_params_scop_parameter(ptr %A, i64 %n) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb6, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb8
-
-bb2:                                              ; preds = %bb1
-  %tmp3 = getelementptr inbounds float, ptr %A, i64 %i.0
-  %tmp4 = load float, ptr %tmp3, align 4
-  %tmp5 = fadd float %tmp4, 4.200000e+01
-  store float %tmp5, ptr %tmp3, align 4
-  br label %bb6
-
-bb6:                                              ; preds = %bb2
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb8:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll b/polly/test/GPGPU/kernels-names-across-scops-funcs.ll
deleted file mode 100644
index 57fe70ec0d9b..000000000000
--- a/polly/test/GPGPU/kernels-names-across-scops-funcs.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen-ppcg \
-; RUN: -polly-acc-dump-kernel-ir -disable-output < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 {
-; KERNEL: define ptx_kernel void @FUNC_foo_SCOP_1_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 {
-; KERNEL: define ptx_kernel void @FUNC_foo2_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_arg1, i32 %arg) #0 {
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Function Attrs: nounwind uwtable
-define void @foo(i32 %arg, ptr %arg1) #0 {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb
-  %tmp = icmp sgt i32 %arg, 0
-  br i1 %tmp, label %bb3, label %bb13
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb4, %bb3
-  %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ]
-  %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5
-  %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2
-  %tmp8 = add nsw i32 %tmp7, 1
-  store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2
-  %tmp9 = add nuw nsw i64 %tmp5, 1
-  %tmp10 = zext i32 %arg to i64
-  %tmp11 = icmp ne i64 %tmp9, %tmp10
-  br i1 %tmp11, label %bb4, label %bb12
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12, %bb2
-  %tmp14 = tail call i64 @clock() #3
-  %tmp15 = icmp eq i64 %tmp14, 0
-  br i1 %tmp15, label %bb16, label %bb29
-
-bb16:                                             ; preds = %bb13
-  %tmp17 = icmp sgt i32 %arg, 0
-  br i1 %tmp17, label %bb18, label %bb28
-
-bb18:                                             ; preds = %bb16
-  br label %bb19
-
-bb19:                                             ; preds = %bb19, %bb18
-  %tmp20 = phi i64 [ 0, %bb18 ], [ %tmp24, %bb19 ]
-  %tmp21 = getelementptr inbounds i32, ptr %arg1, i64 %tmp20
-  %tmp22 = load i32, ptr %tmp21, align 4, !tbaa !2
-  %tmp23 = add nsw i32 %tmp22, 1
-  store i32 %tmp23, ptr %tmp21, align 4, !tbaa !2
-  %tmp24 = add nuw nsw i64 %tmp20, 1
-  %tmp25 = zext i32 %arg to i64
-  %tmp26 = icmp ne i64 %tmp24, %tmp25
-  br i1 %tmp26, label %bb19, label %bb27
-
-bb27:                                             ; preds = %bb19
-  br label %bb28
-
-bb28:                                             ; preds = %bb27, %bb16
-  br label %bb29
-
-bb29:                                             ; preds = %bb28, %bb13
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @clock() #2
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1
-
-; Function Attrs: nounwind uwtable
-define void @foo2(i32 %arg, ptr %arg1) #0 {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb
-  %tmp = icmp sgt i32 %arg, 0
-  br i1 %tmp, label %bb3, label %bb13
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb4, %bb3
-  %tmp5 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb4 ]
-  %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp5
-  %tmp7 = load i32, ptr %tmp6, align 4, !tbaa !2
-  %tmp8 = add nsw i32 %tmp7, 1
-  store i32 %tmp8, ptr %tmp6, align 4, !tbaa !2
-  %tmp9 = add nuw nsw i64 %tmp5, 1
-  %tmp10 = zext i32 %arg to i64
-  %tmp11 = icmp ne i64 %tmp9, %tmp10
-  br i1 %tmp11, label %bb4, label %bb12
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12, %bb2
-  ret void
-}
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
-!llvm.module.flags = !{!0}
-!llvm.ident = !{!1}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{!"clang version 5.0.0"}
-!2 = !{!3, !3, i64 0}
-!3 = !{!"int", !4, i64 0}
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll b/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll
deleted file mode 100644
index 0f8405dad7e8..000000000000
--- a/polly/test/GPGPU/libdevice-functions-copied-into-kernel.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; RUN: opt %loadPolly -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll -disable-output < %s | FileCheck %s --check-prefix=KERNEL-IR
-; RUN: opt %loadPolly -S -polly-codegen-ppcg  < %s \
-; RUN: -polly-acc-libdevice=%S/Inputs/libdevice-functions-copied-into-kernel_libdevice.ll \
-; RUN:     | FileCheck %s --check-prefix=HOST-IR
-
-; Test that we do recognise and codegen a kernel that has functions that can
-; be mapped to NVIDIA's libdevice
-
-; REQUIRES: pollyacc
-
-; Check that we model the kernel as a scop.
-; SCOP:      Function: f
-; SCOP-NEXT:       Region: %entry.split---%for.end
-
-; Check that the intrinsic call is present in the kernel IR.
-; KERNEL-IR:   %p_expf = tail call float @__nv_expf(float %A.arr.i.val_p_scalar_)
-; KERNEL-IR:   %p_cosf = tail call float @__nv_cosf(float %p_expf)
-; KERNEL-IR:   %p_logf = tail call float @__nv_logf(float %p_cosf)
-
-; Powi and exp cannot be lowered directly. Rather, we expect them to be
-; lowered by libdevice.
-; KERNEL-IR: %p_powi = tail call float @__nv_powif(float %p_logf, i32 2)
-; KERNEL-IR: %p_exp = tail call float @__nv_expf(float %p_powi)
-
-; Check that kernel launch is generated in host IR.
-; The declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-
-; void f(float *A, float *B, int N) {
-;   for(int i = 0; i < N; i++) {
-;       float tmp0 = A[i];
-;       float expf  = expf(tmp0);
-;       cosf = cosf(expf);
-;       logf = logf(cosf);
-;       powi = powi(logf, 2);
-;       exp = exp(powi);
-;       B[i] = exp;
-;   }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(ptr %A, ptr %B, i32 %N) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %cmp1 = icmp sgt i32 %N, 0
-  br i1 %cmp1, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %A.arr.i = getelementptr inbounds float, ptr %A, i64 %indvars.iv
-  %A.arr.i.val = load float, ptr %A.arr.i, align 4
-  ; Call to intrinsics that should be part of the kernel.
-  %expf = tail call float @expf(float %A.arr.i.val)
-  %cosf = tail call float @cosf(float %expf)
-  %logf = tail call float @logf(float %cosf)
-  %powi = tail call float @llvm.powi.f32.i32(float %logf, i32 2)
-  %exp = tail call float @llvm.exp.f32(float %powi)
-  %B.arr.i = getelementptr inbounds float, ptr %B, i64 %indvars.iv
-  store float %exp, ptr %B.arr.i, align 4
-
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %wide.trip.count = zext i32 %N to i64
-  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %for.body
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare float @expf(float) #0
-declare float @cosf(float) #0
-declare float @logf(float) #0
-declare float @llvm.powi.f32.i32(float, i32) #0
-declare float @llvm.exp.f32(float) #0
-
-attributes #0 = { nounwind readnone }
-
diff --git a/polly/test/GPGPU/live-range-reordering-with-privatization.ll b/polly/test/GPGPU/live-range-reordering-with-privatization.ll
deleted file mode 100644
index 3b047fd557ff..000000000000
--- a/polly/test/GPGPU/live-range-reordering-with-privatization.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-  ; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
-; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
-; RUN: -polly-acc-dump-code -disable-output \
-; RUN:   < %s | FileCheck %s -check-prefix=CODE
-
-; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
-; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
-; RUN: -polly-acc-dump-kernel-ir -disable-output \
-; RUN:   < %s | FileCheck %s -check-prefix=KERNELIR
-
-; REQUIRES: pollyacc
-
-;    void f(const int *end, int *arr, const int *control, const int *readarr) {
-;      for (int i = 0; i < *end; i++) {
-;        int t = 0;
-;        if (*control > 3) {
-;          t += readarr[i];
-;        }
-;        arr[i] = t;
-;      }
-;    }
-
-; This test case tests the ability to infer that `t` is local to each loop
-; iteration, and can therefore be privatized.
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1)
-; CODE-NEXT:   if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) {
-; CODE-NEXT:     Stmt_for_body_last(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:     if (tmp1 >= 4)
-; CODE-NEXT:       Stmt_if_then(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:     Stmt_if_end(32 * b0 + t0 + 1048576 * c0);
-; CODE-NEXT:   }
-
-; KERNELIR: %private_array = alloca i32
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-
-define void @f(ptr %end, ptr %arr, ptr %control, ptr %readarr) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %tmp3 = load i32, ptr %end, align 4
-  %cmp4 = icmp sgt i32 %tmp3, 0
-  br i1 %cmp4, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry.split
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %if.end
-  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
-  %tmp1 = load i32, ptr %control, align 4
-  %cmp1 = icmp sgt i32 %tmp1, 3
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, ptr %readarr, i32 %i.05
-  %tmp2 = load i32, ptr %arrayidx, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.then, %for.body
-  %t.0 = phi i32 [ %tmp2, %if.then ], [ 0, %for.body ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %arr, i32 %i.05
-  store i32 %t.0, ptr %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.05, 1
-  %tmp = load i32, ptr %end, align 4
-  %cmp = icmp slt i32 %inc, %tmp
-  br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
-
-for.cond.for.end_crit_edge:                       ; preds = %if.end
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
-  ret void
-}
-
diff --git a/polly/test/GPGPU/loops-outside-scop.ll b/polly/test/GPGPU/loops-outside-scop.ll
deleted file mode 100644
index 36b3a706338a..000000000000
--- a/polly/test/GPGPU/loops-outside-scop.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; There is no FileCheck because we want to make sure that this doesn't crash.
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-fail-on-verify-module-failure \
-; RUN: -disable-output < %s
-
-; REQUIRES: pollyacc
-
-; Due to the existence of the `fence` call, we can only detect the inner loop
-; and not the outer loop. PPCGCodeGeneration had not implemented this case.
-; The fix was to pull the implementation from `IslNodeBuilder`.
-
-; Make sure that we only capture the inner loop
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for2.body---%for2.body.fence
-; SCOP-NEXT: Max Loop Depth:  1
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @fn_to_fence(ptr %val)
-
-; void f(int *arr, bool shouldcont) {
-;     for(int i = 0; ; i++) {
-;         for(int j = 0; j < 10; j++) {
-;             arr[j] = i;
-;         }
-;         fence(arr);
-;         if (!shouldcont) break;
-;     }
-; }
-
-
-; Function Attrs: nounwind uwtable
-define void @f(ptr %arr, i1 %shouldcont) #1 {
-entry:
-  br label %for.init
-
-for.init:                                             ; preds = %for.end, %entry
-  %i = phi i32 [ %i.next, %for.end ], [ 0, %entry ]
-  br label %for2.body
-
-for2.body:                                             ; preds = %for2.body, %for.init
-  %j = phi i32 [ %j.next, %for2.body ], [ 0, %for.init ]
-  %j.sext = sext i32 %j to i64
-  %arr.slot = getelementptr i32, ptr %arr, i64 %j.sext
-  store i32 %i, ptr %arr.slot, align 4
-  %exitcond = icmp eq i32 %j, 10
-  %j.next = add i32 %j, 1
-  br i1 %exitcond, label %for2.body.fence, label %for2.body
-
-for2.body.fence:                                             ; preds = %for2.body
-  call void @fn_to_fence(ptr %arr) #2
-  br i1 %shouldcont, label %for.end, label %exit
-for.end:                                             ; preds = %for2.body.fence
-  %i.next = add i32 %i, 1
-  br label %for.init
-
-exit:                                             ; preds = %for2.body.fence
-  ret void
-
-}
-
-
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable }
-attributes #2 = { nounwind }
diff --git a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll b/polly/test/GPGPU/managed-memory-rewrite-alloca.ll
deleted file mode 100644
index 6dbd87db5eb5..000000000000
--- a/polly/test/GPGPU/managed-memory-rewrite-alloca.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-
-; RUN: opt %loadPolly -S  -polly-process-unprofitable -polly-acc-mincompute=0 \
-; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
-; RUN: -polly-acc-rewrite-managed-memory  -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP: i32 MemRef_arr[*];
-
-; Check that the original alloca of %arr no longer appears in the host IR.
-; HOST-IR-NOT:   %arr = alloca [100 x i32]
-
-source_filename = "test.c"
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.12.0"
-
-
-define void @f() {
-entry:
-  %arr = alloca [100 x i32]
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %for.body
-  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [100 x i32], ptr %arr, i64 0, i64 %indvars.iv1
-  store i32 42, ptr %arrayidx, align 4, !tbaa !3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 100
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0
-
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"PIC Level", i32 2}
-!2 = !{!"clang version 6.0.0"}
-!3 = !{!4, !4, i64 0}
-!4 = !{!"int", !5, i64 0}
-!5 = !{!"omnipotent char", !6, i64 0}
-!6 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll
deleted file mode 100644
index 946da40919ec..000000000000
--- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free-inside-constexpr.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -S -polly-acc-codegen-managed-memory \
-; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
-;
-; REQUIRES: pollyacc
-;
-; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and
-; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory`
-; pass, even inside `constantExpr`. This is necessary because a cookie cutter
-; Inst->replaceUsesOfWith(...) call does not actually work, because this does
-; not replace the instruction within a ConstantExpr.
-;
-; #include <memory.h>
-;
-; static const int N = 100;
-; int* f(int *ToFree) {
-;     free(ToFree);
-;     int *A = (int *)malloc(sizeof(int) * N);
-;     for(int i = 0; i < N; i++) {
-;         A[i] = 42;
-;     }
-;     return A;
-;
-; }
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-; SCOP:      Arrays {
-; SCOP-NEXT:     i32 MemRef_tmp[*]; // Element size 4
-; SCOP-NEXT: }
-
-; // Check that polly_mallocManaged is declared and used correctly.
-; HOST-IR: declare ptr @polly_mallocManaged(i64)
-
-; // Check that polly_freeManaged is declared and used correctly.
-; HOST-IR  call void @polly_freeManaged(i8* %toFree)
-; HOST-IR: declare void @polly_freeManaged(ptr)
-
-; // Check that we remove the original malloc,free
-; HOST-IR-NOT: declare ptr @malloc(i64)
-; HOST-IR-NOT: declare void @free(ptr)
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.12.0"
-
-define ptr @f(ptr %toFree) {
-entry:
-  ; Call to free; in this opaque-pointer IR the callee is called directly, with no bitcast.
-  call void @free (ptr %toFree)
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  ; Call to malloc, likewise a direct call with no bitcast.
-  %tmp = call ptr @malloc (i64 400)
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %for.body
-  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %tmp, i64 %indvars.iv1
-  store i32 42, ptr %arrayidx, align 4, !tbaa !3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 100
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret ptr %tmp
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0
-
-declare ptr @malloc(i64)
-declare void @free(ptr)
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"PIC Level", i32 2}
-!2 = !{!"clang version 6.0.0"}
-!3 = !{!4, !4, i64 0}
-!4 = !{!"int", !5, i64 0}
-!5 = !{!"omnipotent char", !6, i64 0}
-!6 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll b/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll
deleted file mode 100644
index 8e456127b127..000000000000
--- a/polly/test/GPGPU/managed-memory-rewrite-malloc-free.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -S -polly-acc-codegen-managed-memory \
-; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
-;
-; REQUIRES: pollyacc
-;
-; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and
-; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory`
-; pass.
-;
-; #include <memory.h>
-;
-; static const int N = 100;
-; int* f(int *ToFree) {
-;     free(ToFree);
-;     int *A = (int *)malloc(sizeof(int) * N);
-;     for(int i = 0; i < N; i++) {
-;         A[i] = 42;
-;     }
-;     return A;
-;
-; }
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-; SCOP:      Arrays {
-; SCOP-NEXT:     i32 MemRef_call[*]; // Element size 4
-; SCOP-NEXT: }
-
-; // Check that polly_mallocManaged is declared and used correctly.
-; HOST-IR: %call = tail call ptr @polly_mallocManaged(i64 400)
-; HOST-IR: declare ptr @polly_mallocManaged(i64)
-
-; // Check that polly_freeManaged is declared and used correctly.
-; HOST-IR  %toFreeBitcast = bitcast i32* %toFree to i8*
-; HOST-IR  call void @polly_freeManaged(i8* %toFreeBitcast)
-; HOST-IR: declare void @polly_freeManaged(ptr)
-
-; // Check that we remove the original malloc,free
-; HOST-IR-NOT: declare ptr @malloc(i64)
-; HOST-IR-NOT: declare void @free(ptr)
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.12.0"
-
-define ptr @f(ptr %toFree) {
-entry:
-  call void @free(ptr %toFree)
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %call = tail call ptr @malloc(i64 400)
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %for.body
-  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32, ptr %call, i64 %indvars.iv1
-  store i32 42, ptr %arrayidx, align 4, !tbaa !3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 100
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret ptr %call
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0
-
-declare ptr @malloc(i64)
-declare void @free(ptr)
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"PIC Level", i32 2}
-!2 = !{!"clang version 6.0.0"}
-!3 = !{!4, !4, i64 0}
-!4 = !{!"int", !5, i64 0}
-!5 = !{!"omnipotent char", !6, i64 0}
-!6 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/memory-only-referenced-from-access.ll b/polly/test/GPGPU/memory-only-referenced-from-access.ll
deleted file mode 100644
index b3828950324a..000000000000
--- a/polly/test/GPGPU/memory-only-referenced-from-access.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -polly-invariant-load-hoisting -polly-ignore-aliasing \
-; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \
-; RUN: -polly-acc-fail-on-verify-module-failure \
-; RUN: -polly-acc-codegen-managed-memory \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-; Verify that we correctly generate a kernel even if certain invariant load
-; hoisted parameters appear only in memory accesses, but not domain elements.
-
-; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2)
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.hoge = type { ptr, i64, i64, [1 x %struct.widget] }
-%struct.widget = type { i64, i64, i64 }
-
-@global = external unnamed_addr global %struct.hoge, align 32
-
-define void @quux(ptr noalias %arg, ptr noalias %arg1) {
-bb:
-  %tmp = load i32, ptr %arg, align 4
-  %tmp2 = sext i32 %tmp to i64
-  %tmp3 = load i32, ptr %arg1, align 4
-  %tmp4 = load ptr, ptr @global, align 32
-  br label %bb5
-
-bb5:                                              ; preds = %bb5, %bb
-  %tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ]
-  %tmp7 = sext i32 %tmp6 to i64
-  %tmp8 = sub nsw i64 %tmp7, %tmp2
-  %tmp9 = getelementptr [0 x double], ptr %tmp4, i64 0, i64 %tmp8
-  store double undef, ptr %tmp9, align 8
-  %tmp10 = icmp eq i32 %tmp6, %tmp3
-  %tmp11 = add i32 %tmp6, 1
-  br i1 %tmp10, label %bb12, label %bb5
-
-bb12:                                             ; preds = %bb5
-  ret void
-}
diff --git a/polly/test/GPGPU/mostly-sequential.ll b/polly/test/GPGPU/mostly-sequential.ll
deleted file mode 100644
index c42c24482a38..000000000000
--- a/polly/test/GPGPU/mostly-sequential.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-;    void foo(float A[]) {
-;      for (long i = 0; i < 128; i++)
-;        A[i] += i;
-;
-;      for (long i = 0; i < 128; i++)
-;        for (long j = 0; j < 128; j++)
-;          A[42] += i + j;
-;    }
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(4);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:            {
-; CODE-NEXT:         dim3 k1_dimBlock;
-; CODE-NEXT:         dim3 k1_dimGrid;
-; CODE-NEXT:         kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:         cudaCheckKernel();
-; CODE-NEXT:       }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (128) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb4(32 * b0 + t0);
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c0 = 0; c0 <= 127; c0 += 1)
-; CODE-NEXT:   for (int c1 = 0; c1 <= 127; c1 += 1)
-; CODE-NEXT:     Stmt_bb14(c0, c1);
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A) {
-bb:
-  br label %bb3
-
-bb3:                                              ; preds = %bb8, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ]
-  %exitcond2 = icmp ne i64 %i.0, 128
-  br i1 %exitcond2, label %bb4, label %bb10
-
-bb4:                                              ; preds = %bb3
-  %tmp = sitofp i64 %i.0 to float
-  %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0
-  %tmp6 = load float, ptr %tmp5, align 4
-  %tmp7 = fadd float %tmp6, %tmp
-  store float %tmp7, ptr %tmp5, align 4
-  br label %bb8
-
-bb8:                                              ; preds = %bb4
-  %tmp9 = add nuw nsw i64 %i.0, 1
-  br label %bb3
-
-bb10:                                             ; preds = %bb3
-  br label %bb11
-
-bb11:                                             ; preds = %bb23, %bb10
-  %i1.0 = phi i64 [ 0, %bb10 ], [ %tmp24, %bb23 ]
-  %exitcond1 = icmp ne i64 %i1.0, 128
-  br i1 %exitcond1, label %bb12, label %bb25
-
-bb12:                                             ; preds = %bb11
-  br label %bb13
-
-bb13:                                             ; preds = %bb20, %bb12
-  %j.0 = phi i64 [ 0, %bb12 ], [ %tmp21, %bb20 ]
-  %exitcond = icmp ne i64 %j.0, 128
-  br i1 %exitcond, label %bb14, label %bb22
-
-bb14:                                             ; preds = %bb13
-  %tmp15 = add nuw nsw i64 %i1.0, %j.0
-  %tmp16 = sitofp i64 %tmp15 to float
-  %tmp17 = getelementptr inbounds float, ptr %A, i64 42
-  %tmp18 = load float, ptr %tmp17, align 4
-  %tmp19 = fadd float %tmp18, %tmp16
-  store float %tmp19, ptr %tmp17, align 4
-  br label %bb20
-
-bb20:                                             ; preds = %bb14
-  %tmp21 = add nuw nsw i64 %j.0, 1
-  br label %bb13
-
-bb22:                                             ; preds = %bb13
-  br label %bb23
-
-bb23:                                             ; preds = %bb22
-  %tmp24 = add nuw nsw i64 %i1.0, 1
-  br label %bb11
-
-bb25:                                             ; preds = %bb11
-  ret void
-}
diff --git a/polly/test/GPGPU/non-read-only-scalars.ll b/polly/test/GPGPU/non-read-only-scalars.ll
deleted file mode 100644
index 1ce6e0991ebb..000000000000
--- a/polly/test/GPGPU/non-read-only-scalars.ll
+++ /dev/null
@@ -1,168 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-;
-; REQUIRES: pollyacc
-;
-; #include <stdio.h>
-;
-; float foo(float A[]) {
-;   float sum = 0;
-;
-;   for (long i = 0; i < 32; i++)
-;     A[i] = i;
-;
-;   for (long i = 0; i < 32; i++)
-;     A[i] += i;
-;
-;   for (long i = 0; i < 32; i++)
-;     sum += A[i];
-;
-;   return sum;
-; }
-;
-; int main() {
-;   float A[32];
-;   float sum = foo(A);
-;   printf("%f\n", sum);
-; }
-
-; CODE:          dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(1);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   {
-; CODE-NEXT:     dim3 k1_dimBlock;
-; CODE-NEXT:     dim3 k1_dimGrid;
-; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_sum_0__phi);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:          {
-; CODE-NEXT:       dim3 k2_dimBlock;
-; CODE-NEXT:       dim3 k2_dimGrid;
-; CODE-NEXT:       kernel2 <<<k2_dimGrid, k2_dimBlock>>> (dev_MemRef_A, dev_MemRef_sum_0__phi, dev_MemRef_sum_0);
-; CODE-NEXT:       cudaCheckKernel();
-; CODE-NEXT:     }
-
-; CODE:        cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (32) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(&MemRef_sum_0, dev_MemRef_sum_0, sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT:   cudaCheckReturn(cudaFree(dev_MemRef_sum_0__phi));
-; CODE-NEXT:   cudaCheckReturn(cudaFree(dev_MemRef_sum_0));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: {
-; CODE-NEXT:   Stmt_bb4(t0);
-; CODE-NEXT:   Stmt_bb10(t0);
-; CODE-NEXT: }
-
-; CODE: # kernel1
-; CODE-NEXT: Stmt_bb17();
-
-; CODE: # kernel2
-; TODO-NEXT: {
-; TODO-NEXT:   read();
-; TODO-NEXT:   for (int c0 = 0; c0 <= 32; c0 += 1) {
-; TODO-NEXT:     Stmt_bb18(c0);
-; TODO-NEXT:     if (c0 <= 31)
-; TODO-NEXT:       Stmt_bb20(c0);
-; TODO-NEXT:   }
-; TODO-NEXT:   write();
-; TODO-NEXT: }
-
-
-; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(ptr addrspace(1) %MemRef_sum_0__phi)
-; KERNEL-IR:  store float 0.000000e+00, ptr %sum.0.phiops
-; KERNEL-IR:  [[REGA:%.+]] = addrspacecast ptr addrspace(1) %MemRef_sum_0__phi to ptr
-; KERNEL-IR:  [[REGB:%.+]] = load float, ptr %sum.0.phiops
-; KERNEL-IR:  store float [[REGB]], ptr [[REGA]]
-
-; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_2(ptr addrspace(1) %MemRef_A, ptr addrspace(1) %MemRef_sum_0__phi, ptr addrspace(1) %MemRef_sum_0)
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-@.str = private unnamed_addr constant [4 x i8] c"%f\0A\00", align 1
-
-define float @foo(ptr %A) {
-bb:
-  br label %bb3
-
-bb3:                                              ; preds = %bb6, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %exitcond2 = icmp ne i64 %i.0, 32
-  br i1 %exitcond2, label %bb4, label %bb8
-
-bb4:                                              ; preds = %bb3
-  %tmp = sitofp i64 %i.0 to float
-  %tmp5 = getelementptr inbounds float, ptr %A, i64 %i.0
-  store float %tmp, ptr %tmp5, align 4
-  br label %bb6
-
-bb6:                                              ; preds = %bb4
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb3
-
-bb8:                                              ; preds = %bb3
-  br label %bb9
-
-bb9:                                              ; preds = %bb15, %bb8
-  %i1.0 = phi i64 [ 0, %bb8 ], [ %tmp16, %bb15 ]
-  %exitcond1 = icmp ne i64 %i1.0, 32
-  br i1 %exitcond1, label %bb10, label %bb17
-
-bb10:                                             ; preds = %bb9
-  %tmp11 = sitofp i64 %i1.0 to float
-  %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0
-  %tmp13 = load float, ptr %tmp12, align 4
-  %tmp14 = fadd float %tmp13, %tmp11
-  store float %tmp14, ptr %tmp12, align 4
-  br label %bb15
-
-bb15:                                             ; preds = %bb10
-  %tmp16 = add nuw nsw i64 %i1.0, 1
-  br label %bb9
-
-bb17:                                             ; preds = %bb9
-  br label %bb18
-
-bb18:                                             ; preds = %bb20, %bb17
-  %sum.0 = phi float [ 0.000000e+00, %bb17 ], [ %tmp23, %bb20 ]
-  %i2.0 = phi i64 [ 0, %bb17 ], [ %tmp24, %bb20 ]
-  %exitcond = icmp ne i64 %i2.0, 32
-  br i1 %exitcond, label %bb19, label %bb25
-
-bb19:                                             ; preds = %bb18
-  br label %bb20
-
-bb20:                                             ; preds = %bb19
-  %tmp21 = getelementptr inbounds float, ptr %A, i64 %i2.0
-  %tmp22 = load float, ptr %tmp21, align 4
-  %tmp23 = fadd float %sum.0, %tmp22
-  %tmp24 = add nuw nsw i64 %i2.0, 1
-  br label %bb18
-
-bb25:                                             ; preds = %bb18
-  %sum.0.lcssa = phi float [ %sum.0, %bb18 ]
-  ret float %sum.0.lcssa
-}
-
-define i32 @main() {
-bb:
-  %A = alloca [32 x float], align 16
-  %tmp1 = call float @foo(ptr %A)
-  %tmp2 = fpext float %tmp1 to double
-  %tmp3 = call i32 (ptr, ...) @printf(ptr @.str, double %tmp2) #2
-  ret i32 0
-}
-
-declare i32 @printf(ptr, ...) #1
-
diff --git a/polly/test/GPGPU/non-zero-array-offset.ll b/polly/test/GPGPU/non-zero-array-offset.ll
deleted file mode 100644
index f18f6828a47f..000000000000
--- a/polly/test/GPGPU/non-zero-array-offset.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-;
-; REQUIRES: pollyacc
-
-; CODE:      cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
-
-; CODE:          dim3 k0_dimBlock(8);
-; CODE-NEXT:     dim3 k0_dimGrid(1);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:        {
-; CODE-NEXT:     dim3 k1_dimBlock(8);
-; CODE-NEXT:     dim3 k1_dimGrid(1);
-; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb3(t0);
-
-; CODE: # kernel1
-; CODE-NEXT: Stmt_bb11(t0);
-
-; IR:       %p_dev_array_MemRef_B = call ptr @polly_allocateMemoryForDevice(i64 32)
-; IR-NEXT:  %p_dev_array_MemRef_A = call ptr @polly_allocateMemoryForDevice(i64 32)
-; IR-NEXT:  [[REG0:%.+]] = getelementptr float, ptr %B, i64 8
-; IR-NEXT:  call void @polly_copyFromHostToDevice(ptr [[REG0]], ptr %p_dev_array_MemRef_B, i64 32)
-
-; IR:      [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_B)
-; IR-NEXT: [[REGC:%.+]]  = getelementptr float, ptr [[REGA]], i64 -8
-
-;    void foo(float A[], float B[]) {
-;      for (long i = 0; i < 8; i++)
-;        B[i + 8] *= 4;
-;
-;      for (long i = 0; i < 8; i++)
-;        A[i] *= 12;
-;    }
-;
-;    #ifdef OUTPUT
-;    int main() {
-;      float A[16];
-;
-;      for (long i = 0; i < 16; i++) {
-;        __sync_synchronize();
-;        A[i] = i;
-;      }
-;
-;      foo(A, A);
-;
-;      float sum = 0;
-;      for (long i = 0; i < 16; i++) {
-;        __sync_synchronize();
-;        sum += A[i];
-;      }
-;
-;      printf("%f\n", sum);
-;    }
-;    #endif
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb7, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
-  %exitcond1 = icmp ne i64 %i.0, 8
-  br i1 %exitcond1, label %bb3, label %bb9
-
-bb3:                                              ; preds = %bb2
-  %tmp = add nuw nsw i64 %i.0, 8
-  %tmp4 = getelementptr inbounds float, ptr %B, i64 %tmp
-  %tmp5 = load float, ptr %tmp4, align 4
-  %tmp6 = fmul float %tmp5, 4.000000e+00
-  store float %tmp6, ptr %tmp4, align 4
-  br label %bb7
-
-bb7:                                              ; preds = %bb3
-  %tmp8 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb9:                                              ; preds = %bb2
-  br label %bb10
-
-bb10:                                             ; preds = %bb15, %bb9
-  %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ]
-  %exitcond = icmp ne i64 %i1.0, 8
-  br i1 %exitcond, label %bb11, label %bb17
-
-bb11:                                             ; preds = %bb10
-  %tmp12 = getelementptr inbounds float, ptr %A, i64 %i1.0
-  %tmp13 = load float, ptr %tmp12, align 4
-  %tmp14 = fmul float %tmp13, 1.200000e+01
-  store float %tmp14, ptr %tmp12, align 4
-  br label %bb15
-
-bb15:                                             ; preds = %bb11
-  %tmp16 = add nuw nsw i64 %i1.0, 1
-  br label %bb10
-
-bb17:                                             ; preds = %bb10
-  ret void
-}
diff --git a/polly/test/GPGPU/only-part-of-array-modified.ll b/polly/test/GPGPU/only-part-of-array-modified.ll
deleted file mode 100644
index abc380badfb6..000000000000
--- a/polly/test/GPGPU/only-part-of-array-modified.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-;
-; REQUIRES: pollyacc
-;
-;    void foo(float A[], float B[]) {
-;      for (long i = 0; i < 1024; i++)
-;        A[2 * i] = B[i];
-;    }
-
-; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2047) * sizeof(i32), cudaMemcpyHostToDevice));
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb8, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp9, %bb8 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb10
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds float, ptr %B, i64 %i.0
-  %tmp4 = load i32, ptr %tmp, align 4
-  %tmp5 = shl nsw i64 %i.0, 1
-  %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5
-  store i32 %tmp4, ptr %tmp6, align 4
-  br label %bb8
-
-bb8:                                              ; preds = %bb2
-  %tmp9 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb10:                                             ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/parametric-loop-bound.ll b/polly/test/GPGPU/parametric-loop-bound.ll
deleted file mode 100644
index e436bd663a4a..000000000000
--- a/polly/test/GPGPU/parametric-loop-bound.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; REQUIRES: pollyacc
-
-;    void foo(long A[], long n) {
-;      for (long i = 0; i < n; i++)
-;        A[i] += 100;
-;    }
-
-; CODE: if (n >= 1) {
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (n) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(n >= 1048545 ? 32768 : (n + 31) / 32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, n);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:        cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (n) * sizeof(i64), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaFree(dev_MemRef_A));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1)
-; CODE-NEXT:   if (n >= 32 * b0 + t0 + 1048576 * c0 + 1)
-; CODE-NEXT:     Stmt_bb2(32 * b0 + t0 + 1048576 * c0);
-
-; IR: store i64 %n, ptr %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGA]]
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, i64 %n) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb6, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp7, %bb6 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb8
-
-bb2:                                              ; preds = %bb1
-  %tmp3 = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp4 = load i64, ptr %tmp3, align 8
-  %tmp5 = add nsw i64 %tmp4, 100
-  store i64 %tmp5, ptr %tmp3, align 8
-  br label %bb6
-
-bb6:                                              ; preds = %bb2
-  %tmp7 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb8:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/partial_writes.ll b/polly/test/GPGPU/partial_writes.ll
deleted file mode 100644
index c3df624df7ac..000000000000
--- a/polly/test/GPGPU/partial_writes.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-codegen-ppcg -polly-stmt-granularity=bb -S < %s \
-; RUN: | FileCheck %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CHECK: polly_launchKernel
-
-; Function Attrs: nounwind uwtable
-define void @partial_writes() {
-bb:
-  %tmp = tail call ptr @wibble() #2
-  br label %bb2
-
-bb2:                                              ; preds = %bb11, %bb
-  %tmp3 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
-  %tmp4 = getelementptr inbounds [1200 x double], ptr %tmp, i64 0, i64 %tmp3
-  %tmp5 = load double, ptr %tmp4, align 8, !tbaa !1
-  br label %bb6
-
-bb6:                                              ; preds = %bb6, %bb2
-  %tmp7 = phi double [ undef, %bb2 ], [ undef, %bb6 ]
-  %tmp8 = phi i64 [ 0, %bb2 ], [ %tmp9, %bb6 ]
-  store double undef, ptr %tmp4, align 8, !tbaa !1
-  %tmp9 = add nuw nsw i64 %tmp8, 1
-  %tmp10 = icmp eq i64 %tmp9, 900
-  br i1 %tmp10, label %bb11, label %bb6
-
-bb11:                                             ; preds = %bb6
-  %tmp12 = add nuw nsw i64 %tmp3, 1
-  %tmp13 = icmp eq i64 %tmp12, 1200
-  br i1 %tmp13, label %bb14, label %bb2
-
-bb14:                                             ; preds = %bb11
-  ret void
-}
-
-declare ptr @wibble()
-
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 6.0.0 (trunk 309912) (llvm/trunk 309933)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"double", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop b/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop
deleted file mode 100644
index d5b537ee1f05..000000000000
--- a/polly/test/GPGPU/partial_writes___%bb2---%bb14.jscop
+++ /dev/null
@@ -1,47 +0,0 @@
-{
-   "arrays" : [
-      {
-         "name" : "MemRef_tmp",
-         "sizes" : [ "*" ],
-         "type" : "double"
-      }
-   ],
-   "context" : "{  :  }",
-   "name" : "%bb2---%bb14",
-   "statements" : [
-      {
-         "accesses" : [
-            {
-               "kind" : "read",
-               "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }"
-            },
-            {
-               "kind" : "write",
-               "relation" : "{ Stmt_bb2[i0] -> MemRef_tmp[i0] }"
-            }
-         ],
-         "domain" : "{ Stmt_bb2[i0] : 0 <= i0 <= 1199 }",
-         "name" : "Stmt_bb2",
-         "schedule" : "{ Stmt_bb2[i0] -> [i0, 0, 0] }"
-      },
-      {
-         "accesses" : [
-            {
-               "kind" : "write",
-               "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] : i1 <= 898 }"
-            },
-            {
-               "kind" : "read",
-               "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }"
-            },
-            {
-               "kind" : "write",
-               "relation" : "{ Stmt_bb6[i0, i1] -> MemRef_tmp[i0] }"
-            }
-         ],
-         "domain" : "{ Stmt_bb6[i0, i1] : 0 <= i0 <= 1199 and 0 <= i1 <= 899 }",
-         "name" : "Stmt_bb6",
-         "schedule" : "{ Stmt_bb6[i0, i1] -> [i0, 1, i1] }"
-      }
-   ]
-}
diff --git a/polly/test/GPGPU/phi-nodes-in-kernel.ll b/polly/test/GPGPU/phi-nodes-in-kernel.ll
deleted file mode 100644
index acb1f2c4e0e2..000000000000
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-
-; REQUIRES: pollyacc
-
-; Approximate C source:
-; void kernel_dynprog(int c[50]) {
-;     int iter = 0;
-;     int outl = 0;
-;
-;      while(1) {
-;         for(int indvar = 1 ; indvar <= 49; indvar++) {
-;             c[indvar] = undef;
-;         }
-;         add78 = c[49] + outl;
-;         inc80 = iter + 1;
-;
-;         if (true) break;
-;
-;         outl = add78;
-;         iter = inc80;
-;      }
-;}
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CODE:       cudaCheckReturn(cudaMalloc((void **) &dev_MemRef_c, (50) * sizeof(i32)));
-
-; CODE:       {
-; CODE-NEXT:    dim3 k0_dimBlock(32);
-; CODE-NEXT:    dim3 k0_dimGrid(2);
-; CODE-NEXT:    kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_c);
-; CODE-NEXT:    cudaCheckKernel();
-; CODE-NEXT:  }
-
-; CODE:       cudaCheckReturn(cudaMemcpy(MemRef_c, dev_MemRef_c, (50) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT:  cudaCheckReturn(cudaFree(dev_MemRef_c));
-
-; CODE: # kernel0
-; CODE-NEXT: if (32 * b0 + t0 <= 48)
-; CODE-NEXT:     Stmt_for_body17(0, 32 * b0 + t0);
-
-; IR-LABEL: call void @polly_freeKernel
-; IR:       [[REGC:%.+]] =   bitcast i32* %{{[0-9]+}} to i8*
-; IR-NEXT:  call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196)
-
-; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c) #0 {
-; KERNEL-IR: %polly.access.MemRef_c = getelementptr i32, i32 addrspace(1)* %polly.access.cast.MemRef_c, i64 %9
-; KERNEL-IR-NEXT: store i32 422, i32 addrspace(1)* %polly.access.MemRef_c, align 4
-
-define void @kernel_dynprog([50 x i32]* %c) {
-entry:
-  %arrayidx77 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 49
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.cond15.for.cond12.loopexit_crit_edge, %entry
-  %out_l.055 = phi i32 [ 0, %entry ], [ %add78, %for.cond15.for.cond12.loopexit_crit_edge ]
-  %iter.054 = phi i32 [ 0, %entry ], [ %inc80, %for.cond15.for.cond12.loopexit_crit_edge ]
-  br label %for.body17
-
-for.cond15.for.cond12.loopexit_crit_edge:         ; preds = %for.body17
-  %tmp = load i32, i32* %arrayidx77, align 4
-  %add78 = add nsw i32 %tmp, %out_l.055
-  %inc80 = add nuw nsw i32 %iter.054, 1
-  br i1 false, label %for.cond1.preheader, label %for.end81
-
-for.body17:                                       ; preds = %for.body17, %for.cond1.preheader
-  %indvars.iv71 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next72, %for.body17 ]
-  %arrayidx69 = getelementptr inbounds [50 x i32], [50 x i32]* %c, i64 0, i64 %indvars.iv71
-  store i32 422, i32* %arrayidx69, align 4
-  %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
-  %lftr.wideiv74 = trunc i64 %indvars.iv.next72 to i32
-  %exitcond75 = icmp ne i32 %lftr.wideiv74, 50
-  br i1 %exitcond75, label %for.body17, label %for.cond15.for.cond12.loopexit_crit_edge
-
-for.end81:                                        ; preds = %for.cond15.for.cond12.loopexit_crit_edge
-  ret void
-}
diff --git a/polly/test/GPGPU/private-memory.ll b/polly/test/GPGPU/private-memory.ll
deleted file mode 100644
index d4ba9fa19b39..000000000000
--- a/polly/test/GPGPU/private-memory.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-acc-use-private \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-acc-use-private \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-;    void add(float *A) {
-;      for (long i = 0; i < 32; i++)
-;        for (long j = 0; j < 10; j++)
-;          A[i] += 1;
-;    }
-
-; CODE: # kernel0
-; CODE: {
-; CODE:     read(t0);
-; CODE:     for (int c3 = 0; c3 <= 9; c3 += 1)
-; CODE:       Stmt_bb5(t0, c3);
-; CODE:     write(t0);
-; CODE: }
-
-; KERNEL: %private_array = alloca [1 x float]
-
-; KERNEL:   %polly.access.cast.private_array = bitcast [1 x float]* %private_array to float*
-; KERNEL-NEXT:   %polly.access.private_array = getelementptr float, float* %polly.access.cast.private_array, i64 0
-; KERNEL-NEXT:   %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-NEXT:   %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0
-; KERNEL-NEXT:   %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A
-; KERNEL-NEXT:   store float %shared.read, float* %polly.access.private_array
-
-; KERNEL:   %polly.access.cast.private_array5 = bitcast [1 x float]* %private_array to float*
-; KERNEL-NEXT:   %polly.access.private_array6 = getelementptr float, float* %polly.access.cast.private_array5, i64 0
-; KERNEL-NEXT:   %polly.access.cast.MemRef_A7 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-NEXT:   %polly.access.MemRef_A8 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A7, i64 %t0
-; KERNEL-NEXT:   %shared.write = load float, float* %polly.access.private_array6
-; KERNEL-NEXT:   store float %shared.write, float addrspace(1)* %polly.access.MemRef_A8
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @add(float* %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb11, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
-  %exitcond1 = icmp ne i64 %i.0, 32
-  br i1 %exitcond1, label %bb3, label %bb13
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb8, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ]
-  %exitcond = icmp ne i64 %j.0, 10
-  br i1 %exitcond, label %bb5, label %bb10
-
-bb5:                                              ; preds = %bb4
-  %tmp = getelementptr inbounds float, float* %A, i64 %i.0
-  %tmp6 = load float, float* %tmp, align 4
-  %tmp7 = fadd float %tmp6, 1.000000e+00
-  store float %tmp7, float* %tmp, align 4
-  br label %bb8
-
-bb8:                                              ; preds = %bb5
-  %tmp9 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb10:                                             ; preds = %bb4
-  br label %bb11
-
-bb11:                                             ; preds = %bb10
-  %tmp12 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb13:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/privatization-simple.ll b/polly/test/GPGPU/privatization-simple.ll
deleted file mode 100644
index c715b8e77b67..000000000000
--- a/polly/test/GPGPU/privatization-simple.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-; void f(int A[], int B[], int control, int C[]) {
-;     int x;
-; #pragma scop
-;     for(int i = 0; i < 1000; i ++) {
-;         x = 0;
-;         if(control) x = C[i];
-;         B[i] = x * A[i];
-;
-;     }
-; #pragma endscop
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @f(ptr %A, ptr %B, i32 %control, ptr %C) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %if.end
-  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
-  %tobool = icmp eq i32 %control, 0
-  br i1 %tobool, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
-  %tmp4 = load i32, ptr %arrayidx, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %for.body, %if.then
-  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
-  %tmp8 = load i32, ptr %arrayidx2, align 4
-  %mul = mul nsw i32 %tmp8, %x.0
-  %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
-  store i32 %mul, ptr %arrayidx4, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:                                          ; preds = %if.end
-  ret void
-}
diff --git a/polly/test/GPGPU/privatization.ll b/polly/test/GPGPU/privatization.ll
deleted file mode 100644
index fbb291575146..000000000000
--- a/polly/test/GPGPU/privatization.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: checkPrivatization
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-
-
-; Check that kernel launch is generated in host IR.
-; the declare would not be generated unless a call to a kernel exists.
-; HOST-IR: declare void @polly_launchKernel(ptr, i32, i32, i32, i32, i32, ptr)
-
-;
-;
-;    void checkPrivatization(int A[], int B[], int C[], int control) {
-;      int x;
-;    #pragma scop
-;      for (int i = 0; i < 1000; i++) {
-;        x = 0;
-;        if (control)
-;          x += C[i];
-;
-;        B[i] = x * A[i];
-;      }
-;    #pragma endscop
-;    }
-;
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @checkPrivatization(ptr %A, ptr %B, ptr %C, i32 %control) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %if.end
-  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %if.end ]
-  %tobool = icmp eq i32 %control, 0
-  br i1 %tobool, label %if.end, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
-  %tmp4 = load i32, ptr %arrayidx, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %for.body, %if.then
-  %x.0 = phi i32 [ %tmp4, %if.then ], [ 0, %for.body ]
-  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
-  %tmp9 = load i32, ptr %arrayidx2, align 4
-  %mul = mul nsw i32 %tmp9, %x.0
-  %arrayidx4 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
-  store i32 %mul, ptr %arrayidx4, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:                                          ; preds = %if.end
-  ret void
-}
diff --git a/polly/test/GPGPU/region-stmt.ll b/polly/test/GPGPU/region-stmt.ll
deleted file mode 100644
index 8e392fb30062..000000000000
--- a/polly/test/GPGPU/region-stmt.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (128) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (128) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(4);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_B);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (128) * sizeof(float), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_for_body__TO__if_end(32 * b0 + t0);
-
-; IR: @polly_initContext
-
-; KERNEL-IR: kernel_0
-
-; REQUIRES: pollyacc
-
-;    void foo(float A[], float B[]) {
-;      for (long i = 0; i < 128; i++)
-;        if (A[i] == 42)
-;          B[i] += 2 * i;
-;        else
-;          B[i] += 4 * i;
-;    }
-;
-source_filename = "/tmp/test.c"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(ptr %A, ptr %B) {
-entry:
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
-  %exitcond = icmp ne i64 %i.0, 128
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %arrayidx = getelementptr inbounds float, ptr %A, i64 %i.0
-  %tmp = load float, ptr %arrayidx, align 4
-  %cmp1 = fcmp oeq float %tmp, 4.200000e+01
-  br i1 %cmp1, label %if.then, label %if.else
-
-if.then:                                          ; preds = %for.body
-  %mul = shl nsw i64 %i.0, 1
-  %conv = sitofp i64 %mul to float
-  %arrayidx2 = getelementptr inbounds float, ptr %B, i64 %i.0
-  %tmp1 = load float, ptr %arrayidx2, align 4
-  %add = fadd float %tmp1, %conv
-  store float %add, ptr %arrayidx2, align 4
-  br label %if.end
-
-if.else:                                          ; preds = %for.body
-  %mul3 = shl nsw i64 %i.0, 2
-  %conv4 = sitofp i64 %mul3 to float
-  %arrayidx5 = getelementptr inbounds float, ptr %B, i64 %i.0
-  %tmp2 = load float, ptr %arrayidx5, align 4
-  %add6 = fadd float %tmp2, %conv4
-  store float %add6, ptr %arrayidx5, align 4
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  br label %for.inc
-
-for.inc:                                          ; preds = %if.end
-  %inc = add nuw nsw i64 %i.0, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  ret void
-}
diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll
deleted file mode 100644
index 326236cf92fd..000000000000
--- a/polly/test/GPGPU/remove-dead-instructions-in-stmt-2.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4
-; KERNEL-IR-NEXT: br label %polly.merge
-
-define void @kernel_dynprog(ptr %sum_c) {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %entry
-  br label %for.body3
-
-for.cond1.loopexit:                               ; preds = %for.end
-  %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
-  %exitcond57 = icmp ne i64 %indvars.iv.next56, 49
-  br i1 %exitcond57, label %for.body3, label %for.inc55
-
-for.body3:                                        ; preds = %for.cond1.loopexit, %for.cond1.preheader
-  %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ]
-  %indvars.iv48 = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next49, %for.cond1.loopexit ]
-  %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1
-  %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv48, i64 %indvars.iv55
-  store i32 0, ptr %arrayidx10, align 4
-  %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv48
-  br label %for.end
-
-for.end:                                          ; preds = %for.body3
-  br label %for.cond1.loopexit
-
-for.inc55:                                        ; preds = %for.cond1.loopexit
-  ret void
-}
diff --git a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll b/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll
deleted file mode 100644
index 2024f006c53a..000000000000
--- a/polly/test/GPGPU/remove-dead-instructions-in-stmt.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s -check-prefix=KERNEL-IR
-
-; REQUIRES: pollyacc
-
-; Ensure that no dead instructions are emitted between the store and the
-; branch instruction of the ScopStmt. At some point, our dead-code-elimination
-; did not remove code that was inserted to compute the old (unused) branch
-; condition. This code referred to CPU registers and consequently resulted
-; in invalid bitcode.
-
-; KERNEL-IR: store i32 0, ptr addrspace(1) %polly.access.MemRef_sum_c, align 4
-; KERNEL-IR-NEXT: br label %polly.merge
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @kernel_dynprog(ptr %sum_c) {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %entry
-  br label %for.body3
-
-for.cond4.for.cond1.loopexit_crit_edge:           ; preds = %for.end
-  br label %for.cond1.loopexit
-
-for.cond1.loopexit:                               ; preds = %for.cond4.for.cond1.loopexit_crit_edge
-  br i1 undef, label %for.body3, label %for.inc55
-
-for.body3:                                        ; preds = %for.cond1.loopexit, %for.cond1.preheader
-  %indvars.iv55 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next56, %for.cond1.loopexit ]
-  %indvars.iv.next56 = add nuw nsw i64 %indvars.iv55, 1
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.end, %for.body3
-  %indvars.iv50 = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next51, %for.end ]
-  %arrayidx10 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 %indvars.iv55
-  store i32 0, ptr %arrayidx10, align 4
-  %cmp1334 = icmp slt i64 %indvars.iv.next56, %indvars.iv50
-  br i1 %cmp1334, label %for.body14.lr.ph, label %for.end
-
-for.body14.lr.ph:                                 ; preds = %for.body6
-  br label %for.body14
-
-for.body14:                                       ; preds = %for.body14, %for.body14.lr.ph
-  %arrayidx32 = getelementptr inbounds [50 x [50 x i32]], ptr %sum_c, i64 %indvars.iv55, i64 %indvars.iv50, i64 0
-  br i1 false, label %for.body14, label %for.cond12.for.end_crit_edge
-
-for.cond12.for.end_crit_edge:                     ; preds = %for.body14
-  br label %for.end
-
-for.end:                                          ; preds = %for.cond12.for.end_crit_edge, %for.body6
-  %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
-  %lftr.wideiv53 = trunc i64 %indvars.iv.next51 to i32
-  %exitcond54 = icmp ne i32 %lftr.wideiv53, 50
-  br i1 %exitcond54, label %for.body6, label %for.cond4.for.cond1.loopexit_crit_edge
-
-for.inc55:                                        ; preds = %for.cond1.loopexit
-  unreachable
-}
diff --git a/polly/test/GPGPU/run-time-check.ll b/polly/test/GPGPU/run-time-check.ll
deleted file mode 100644
index 3b04c3e01593..000000000000
--- a/polly/test/GPGPU/run-time-check.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-;
-; REQUIRES: pollyacc
-;
-;    void foo(long n, float A[][32]) {
-;      for (long i = 0; i < n; i++)
-;        for (long j = 0; j < n; j++)
-;          A[i][j] += A[i + 1][j + 1];
-;    }
-
-; IR:       %tmp = icmp slt i64 %i.0, %n
-; IR-NEXT:  br i1 %tmp, label %bb2, label %polly.merge_new_and_old
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(i64 %n, ptr %A) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb15, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp16, %bb15 ]
-  %tmp = icmp slt i64 %i.0, %n
-  br i1 %tmp, label %bb2, label %bb17
-
-bb2:                                              ; preds = %bb1
-  br label %bb3
-
-bb3:                                              ; preds = %bb12, %bb2
-  %j.0 = phi i64 [ 0, %bb2 ], [ %tmp13, %bb12 ]
-  %exitcond = icmp ne i64 %j.0, %n
-  br i1 %exitcond, label %bb4, label %bb14
-
-bb4:                                              ; preds = %bb3
-  %tmp5 = add nuw nsw i64 %j.0, 1
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  %tmp7 = getelementptr inbounds [32 x float], ptr %A, i64 %tmp6, i64 %tmp5
-  %tmp8 = load float, ptr %tmp7, align 4
-  %tmp9 = getelementptr inbounds [32 x float], ptr %A, i64 %i.0, i64 %j.0
-  %tmp10 = load float, ptr %tmp9, align 4
-  %tmp11 = fadd float %tmp10, %tmp8
-  store float %tmp11, ptr %tmp9, align 4
-  br label %bb12
-
-bb12:                                             ; preds = %bb4
-  %tmp13 = add nuw nsw i64 %j.0, 1
-  br label %bb3
-
-bb14:                                             ; preds = %bb3
-  br label %bb15
-
-bb15:                                             ; preds = %bb14
-  %tmp16 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb17:                                             ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll b/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
deleted file mode 100644
index 0313d64e976c..000000000000
--- a/polly/test/GPGPU/scalar-param-and-value-32-bit.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-;
-;    void foo(float A[], int n) {
-;      for (long j = 0; j < n; j++)
-;        A[j + n] += 42;
-;    }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; CHECK: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A, i32 %n)
-
-define void @foo(ptr %A, i32 %n) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb9, %bb
-  %j.0 = phi i64 [ 0, %bb ], [ %tmp10, %bb9 ]
-  %tmp = sext i32 %n to i64
-  %tmp2 = icmp slt i64 %j.0, %tmp
-  br i1 %tmp2, label %bb3, label %bb11
-
-bb3:                                              ; preds = %bb1
-  %tmp4 = sext i32 %n to i64
-  %tmp5 = add nsw i64 %j.0, %tmp4
-  %tmp6 = getelementptr inbounds float, ptr %A, i64 %tmp5
-  %tmp7 = load float, ptr %tmp6, align 4
-  %tmp8 = fadd float %tmp7, 4.200000e+01
-  store float %tmp8, ptr %tmp6, align 4
-  br label %bb9
-
-bb9:                                              ; preds = %bb3
-  %tmp10 = add nuw nsw i64 %j.0, 1
-  br label %bb1
-
-bb11:                                             ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-param-and-value-use.ll b/polly/test/GPGPU/scalar-param-and-value-use.ll
deleted file mode 100644
index 0301d88e16ac..000000000000
--- a/polly/test/GPGPU/scalar-param-and-value-use.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-;    void foo(long n, float A[][n]) {
-;      for (long i = 0; i < 32; i++)
-;        for (long j = 0; j < 32; j++)
-;          A[i][j] += A[i + 1][j + 1];
-;    }
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; This test case failed at some point as %n was only available in this kernel
-; when referenced through an isl_id in an isl ast expression, but not when
-; it was referenced from a SCEV  or instruction that not part of any loop
-; bound.
-
-; IR: %polly.access.mul.MemRef_A = mul nsw i64 {{.*}}, %n
-
-define void @foo(i64 %n, ptr %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb19, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp20, %bb19 ]
-  %exitcond1 = icmp ne i64 %i.0, 32
-  br i1 %exitcond1, label %bb3, label %bb21
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb16, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp17, %bb16 ]
-  %exitcond = icmp ne i64 %j.0, 32
-  br i1 %exitcond, label %bb5, label %bb18
-
-bb5:                                              ; preds = %bb4
-  %tmp = add nuw nsw i64 %j.0, 1
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  %tmp7 = mul nsw i64 %tmp6, %n
-  %tmp8 = getelementptr inbounds float, ptr %A, i64 %tmp7
-  %tmp9 = getelementptr inbounds float, ptr %tmp8, i64 %tmp
-  %tmp10 = load float, ptr %tmp9, align 4
-  %tmp11 = mul nsw i64 %i.0, %n
-  %tmp12 = getelementptr inbounds float, ptr %A, i64 %tmp11
-  %tmp13 = getelementptr inbounds float, ptr %tmp12, i64 %j.0
-  %tmp14 = load float, ptr %tmp13, align 4
-  %tmp15 = fadd float %tmp14, %tmp10
-  store float %tmp15, ptr %tmp13, align 4
-  br label %bb16
-
-bb16:                                             ; preds = %bb5
-  %tmp17 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb18:                                             ; preds = %bb4
-  br label %bb19
-
-bb19:                                             ; preds = %bb18
-  %tmp20 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb21:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll
deleted file mode 100644
index f20a809c7c83..000000000000
--- a/polly/test/GPGPU/scalar-parameter-fp128.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "LowerFormalArguments didn't emit the correct number of values!"
-
-;    void foo(fp128 A[], fp128 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @fp128(ptr %A, fp128 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0
-  %tmp3 = load fp128, ptr %tmp, align 4
-  %tmp4 = fadd fp128 %tmp3, %b
-  store fp128 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter-half.ll b/polly/test/GPGPU/scalar-parameter-half.ll
deleted file mode 100644
index 127096256812..000000000000
--- a/polly/test/GPGPU/scalar-parameter-half.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-;    void foo(half A[], half b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @half(ptr %A, half %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds half, ptr %A, i64 %i.0
-  %tmp3 = load half, ptr %tmp, align 4
-  %tmp4 = fadd half %tmp3, %b
-  store half %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll
deleted file mode 100644
index 06fb46dd917e..000000000000
--- a/polly/test/GPGPU/scalar-parameter-i120.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits"
-
-;    void foo(i120 A[], i120 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @i120(ptr %A, i120 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i120 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i120, ptr %A, i120 %i.0
-  %tmp3 = load i120, ptr %tmp, align 4
-  %tmp4 = add i120 %tmp3, %b
-  store i120 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i120 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll
deleted file mode 100644
index 8e54cf4636d4..000000000000
--- a/polly/test/GPGPU/scalar-parameter-i128.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-;    void foo(i128 A[], i128 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @i128(ptr %A, i128 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i128 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i128, ptr %A, i128 %i.0
-  %tmp3 = load i128, ptr %tmp, align 4
-  %tmp4 = add i128 %tmp3, %b
-  store i128 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i128 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll
deleted file mode 100644
index 5c36b3fd62cb..000000000000
--- a/polly/test/GPGPU/scalar-parameter-i3000.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits"
-
-;    void foo(i3000 A[], i3000 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @i3000(ptr %A, i3000 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i3000 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i3000, ptr %A, i3000 %i.0
-  %tmp3 = load i3000, ptr %tmp, align 4
-  %tmp4 = add i3000 %tmp3, %b
-  store i3000 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i3000 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll
deleted file mode 100644
index a672cd5c1cdc..000000000000
--- a/polly/test/GPGPU/scalar-parameter-i80.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "Promotion is not suitable for scalars of size larger than 64-bits"
-
-;    void foo(i80 A[], i80 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @i80(ptr %A, i80 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i80 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i80, ptr %A, i80 %i.0
-  %tmp3 = load i80, ptr %tmp, align 4
-  %tmp4 = add i80 %tmp3, %b
-  store i80 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i80 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll
deleted file mode 100644
index 11dfd68ede9b..000000000000
--- a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "LowerFormalArguments didn't emit the correct number of values!"
-
-;    void foo(fp128 A[], fp128 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @ppc_fp128(ptr %A, ppc_fp128 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds ppc_fp128, ptr %A, i64 %i.0
-  %tmp3 = load ppc_fp128, ptr %tmp, align 4
-  %tmp4 = fadd ppc_fp128 %tmp3, %b
-  store ppc_fp128 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll
deleted file mode 100644
index f20a809c7c83..000000000000
--- a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code -disable-output %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today with "LowerFormalArguments didn't emit the correct number of values!"
-
-;    void foo(fp128 A[], fp128 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @fp128(ptr %A, fp128 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds fp128, ptr %A, i64 %i.0
-  %tmp3 = load fp128, ptr %tmp, align 4
-  %tmp4 = fadd fp128 %tmp3, %b
-  store fp128 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll
deleted file mode 100644
index e416c93211d5..000000000000
--- a/polly/test/GPGPU/scalar-parameter.ll
+++ /dev/null
@@ -1,411 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -S < %s | \
-; RUN: FileCheck -check-prefix=IR %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; XFAIL: *
-
-; REQUIRES: pollyacc, target=nvptx{{.*}}
-
-; This fails today due to extensive output differences from when the test was written.
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, float %MemRef_b)
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(float A[], float b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @float(ptr %A, float %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds float, ptr %A, i64 %i.0
-  %tmp3 = load float, ptr %tmp, align 4
-  %tmp4 = fadd float %tmp3, %b
-  store float %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; KERNEL: define ptx_kernel void @kernel_0(ptr %MemRef_A, double %MemRef_b)
-; KERNEL-NEXT: entry:
-; KERNEL-NEXT:   %b.s2a = alloca double
-; KERNEL-NEXT:   store double %MemRef_b, ptr %b.s2a
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, MemRef_b);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(double A[], double b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @double(ptr %A, double %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds double, ptr %A, i64 %i.0
-  %tmp3 = load double, ptr %tmp, align 4
-  %tmp4 = fadd double %tmp3, %b
-  store double %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i1), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i1 A[], i1 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i1(ptr %A, i1 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i1, ptr %A, i64 %i.0
-  %tmp3 = load i1, ptr %tmp, align 4
-  %tmp4 = add i1 %tmp3, %b
-  store i1 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i3 A[], i3 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i3(ptr %A, i3 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i3, ptr %A, i64 %i.0
-  %tmp3 = load i3, ptr %tmp, align 4
-  %tmp4 = add i3 %tmp3, %b
-  store i3 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i8), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i8), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i8 A[], i8 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i8(ptr %A, i8 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i8, ptr %A, i64 %i.0
-  %tmp3 = load i8, ptr %tmp, align 4
-  %tmp4 = add i8 %tmp3, %b
-  store i8 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; IR-LABEL: @i8
-
-; IR: [[REGA:%.+]] = call ptr @polly_getDevicePtr(ptr %p_dev_array_MemRef_A)
-; IR-NEXT: store ptr [[REGA:%.+]], ptr %polly_launch_0_param_0
-; IR-NEXT: store ptr %polly_launch_0_param_0, ptr %polly_launch_0_params
-; IR-NEXT: store i8 %b, ptr %polly_launch_0_param_1
-; IR-NEXT: [[REGD:%.+]] = getelementptr [2 x ptr], ptr %polly_launch_0_params, i64 0, i64 1
-; IR-NEXT: store ptr %polly_launch_0_param_1, ptr [[REGD]]
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i32 A[], i32 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i32(ptr %A, i32 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i32, ptr %A, i64 %i.0
-  %tmp3 = load i32, ptr %tmp, align 4
-  %tmp4 = add i32 %tmp3, %b
-  store i32 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i60), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i60), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i60 A[], i60 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i60(ptr %A, i60 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i60, ptr %A, i64 %i.0
-  %tmp3 = load i60, ptr %tmp, align 4
-  %tmp4 = add i60 %tmp3, %b
-  store i60 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-;    void foo(i64 A[], i64 b) {
-;      for (long i = 0; i < 1024; i++)
-;        A[i] += b;
-;    }
-;
-define void @i64(ptr %A, i64 %b) {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb5, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
-  %exitcond = icmp ne i64 %i.0, 1024
-  br i1 %exitcond, label %bb2, label %bb7
-
-bb2:                                              ; preds = %bb1
-  %tmp = getelementptr inbounds i64, ptr %A, i64 %i.0
-  %tmp3 = load i64, ptr %tmp, align 4
-  %tmp4 = add i64 %tmp3, %b
-  store i64 %tmp4, ptr %tmp, align 4
-  br label %bb5
-
-bb5:                                              ; preds = %bb2
-  %tmp6 = add nuw nsw i64 %i.0, 1
-  br label %bb1
-
-bb7:                                              ; preds = %bb1
-  ret void
-}
diff --git a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll b/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
deleted file mode 100644
index 31110437fdca..000000000000
--- a/polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: opt %loadPolly -polly-acc-dump-code -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOP
-
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
-; RUN: -polly-acc-dump-code -polly-stmt-granularity=bb \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE
-
-; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
-; RUN: -polly-invariant-load-hoisting -polly-stmt-granularity=bb < %s \
-; RUN: | FileCheck %s -check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Invariant Accesses: {
-; SCOP-NEXT:         ReadAccess :=	[Reduction Type: NONE] [Scalar: 0]
-; SCOP-NEXT:             { Stmt_loop_a[i0] -> MemRef_p[0] };
-; SCOP-NEXT:         Execution Context: {  :  }
-; SCOP-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: {
-; CODE-NEXT:   if (32 * b0 + t0 <= 1025) {
-; CODE-NEXT:     Stmt_loop(32 * b0 + t0);
-; CODE-NEXT:     write(0);
-; CODE-NEXT:   }
-; CODE-NEXT:   sync0();
-; CODE-NEXT: }
-
-; Check that we generate a correct "always false" branch.
-; HOST-IR:  br i1 false, label %polly.start, label %loop.pre_entry_bb
-
-; This test case checks that we generate correct code if PPCGCodeGeneration
-; decides a build is unsuccessful with invariant load hoisting enabled.
-;
-; There is a conditional branch which switches between the original code and
-; the new code. We try to set this conditional branch to branch on false.
-; However, invariant load hoisting changes the structure of the scop, so we
-; need to change the way we *locate* this instruction.
-
-target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.12.0"
-
-define void @foo(ptr %A, ptr %p) {
-entry:
-  br label %loop
-
-loop:
-  %indvar = phi i64 [0, %entry], [%indvar.next, %loop]
-  %indvar.next = add i64 %indvar, 1
-  %invariant = load float, ptr %p
-  %ptr = getelementptr float, ptr %A, i64 %indvar
-  store float 42.0, ptr %ptr
-  %cmp = icmp sle i64 %indvar, 1024
-  br i1 %cmp, label %loop, label %loop2
-
-loop2:
-  %indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2]
-  %indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2]
-  %indvar2.next = add i64 %indvar2, 1
-  store float %indvar2f, ptr %A
-  %cmp2 = icmp sle i64 %indvar2, 1024
-  br i1 %cmp2, label %loop2, label %end
-
-end:
-  ret void
-}
diff --git a/polly/test/GPGPU/scheduler-timeout.ll b/polly/test/GPGPU/scheduler-timeout.ll
deleted file mode 100644
index 4a49c53d66c7..000000000000
--- a/polly/test/GPGPU/scheduler-timeout.ll
+++ /dev/null
@@ -1,174 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; At some point this test case took forever to schedule, as the isl scheduler
-; seems to have problems if domain constraints appear in the dependences
-; provided to the scheduler.
-
-;   /* D := alpha*A*B*C + beta*D */
-;   for (i = 0; i < _PB_NI; i++)
-;     for (j = 0; j < _PB_NJ; j++)
-;       {
-;   tmp[i][j] = 0;
-;   for (k = 0; k < _PB_NK; ++k)
-;     tmp[i][j] += alpha * A[i][k] * B[k][j];
-;       }
-;   for (i = 0; i < _PB_NI; i++)
-;     for (j = 0; j < _PB_NL; j++)
-;       {
-;   D[i][j] *= beta;
-;   for (k = 0; k < _PB_NJ; ++k)
-;     D[i][j] += tmp[i][k] * C[k][j];
-;       }
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_D, MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_C, MemRef_C, (4096) * (4096) * sizeof(float), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(16, 32);
-; CODE-NEXT:     dim3 k0_dimGrid(128, 128);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_A, MemRef_alpha, dev_MemRef_B);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   {
-; CODE-NEXT:     dim3 k1_dimBlock(16, 32);
-; CODE-NEXT:     dim3 k1_dimGrid(128, 128);
-; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_tmp, dev_MemRef_D, MemRef_beta, dev_MemRef_C);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_tmp, dev_MemRef_tmp, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
-; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_D, dev_MemRef_D, (4096) * (4096) * sizeof(float), cudaMemcpyDeviceToHost));
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
-; CODE-NEXT:   for (int c4 = 0; c4 <= 1; c4 += 1) {
-; CODE-NEXT:     if (c2 == 0)
-; CODE-NEXT:       Stmt_for_body6(32 * b0 + t0, 32 * b1 + t1 + 16 * c4);
-; CODE-NEXT:     for (int c5 = 0; c5 <= 31; c5 += 1)
-; CODE-NEXT:       Stmt_for_body11(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5);
-; CODE-NEXT:   }
-
-; CODE: # kernel1
-; CODE-NEXT: for (int c2 = 0; c2 <= 127; c2 += 1)
-; CODE-NEXT:   for (int c4 = 0; c4 <= 1; c4 += 1) {
-; CODE-NEXT:     if (c2 == 0)
-; CODE-NEXT:       Stmt_for_body36(32 * b0 + t0, 32 * b1 + t1 + 16 * c4);
-; CODE-NEXT:     for (int c5 = 0; c5 <= 31; c5 += 1)
-; CODE-NEXT:       Stmt_for_body44(32 * b0 + t0, 32 * b1 + t1 + 16 * c4, 32 * c2 + c5);
-; CODE-NEXT:   }
-
-
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, ptr nocapture) #0
-
-; Function Attrs: nounwind uwtable
-define internal void @kernel_2mm(i32 %ni, i32 %nj, i32 %nk, i32 %nl, float %alpha, float %beta, ptr %tmp, ptr %A, ptr %B, ptr %C, ptr %D) #1 {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry.split, %for.inc28
-  %indvars.iv19 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next20, %for.inc28 ]
-  br label %for.body6
-
-for.cond31.preheader:                             ; preds = %for.inc28
-  br label %for.cond34.preheader
-
-for.body6:                                        ; preds = %for.cond4.preheader, %for.inc25
-  %indvars.iv16 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next17, %for.inc25 ]
-  %arrayidx8 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16
-  store float 0.000000e+00, ptr %arrayidx8, align 4, !tbaa !1
-  br label %for.body11
-
-for.body11:                                       ; preds = %for.body6, %for.body11
-  %indvars.iv13 = phi i64 [ 0, %for.body6 ], [ %indvars.iv.next14, %for.body11 ]
-  %arrayidx15 = getelementptr inbounds [4096 x float], ptr %A, i64 %indvars.iv19, i64 %indvars.iv13
-  %tmp22 = load float, ptr %arrayidx15, align 4, !tbaa !1
-  %mul = fmul float %tmp22, %alpha
-  %arrayidx19 = getelementptr inbounds [4096 x float], ptr %B, i64 %indvars.iv13, i64 %indvars.iv16
-  %tmp23 = load float, ptr %arrayidx19, align 4, !tbaa !1
-  %mul20 = fmul float %mul, %tmp23
-  %arrayidx24 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv19, i64 %indvars.iv16
-  %tmp24 = load float, ptr %arrayidx24, align 4, !tbaa !1
-  %add = fadd float %tmp24, %mul20
-  store float %add, ptr %arrayidx24, align 4, !tbaa !1
-  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
-  %exitcond15 = icmp ne i64 %indvars.iv.next14, 4096
-  br i1 %exitcond15, label %for.body11, label %for.inc25
-
-for.inc25:                                        ; preds = %for.body11
-  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
-  %exitcond18 = icmp ne i64 %indvars.iv.next17, 4096
-  br i1 %exitcond18, label %for.body6, label %for.inc28
-
-for.inc28:                                        ; preds = %for.inc25
-  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
-  %exitcond21 = icmp ne i64 %indvars.iv.next20, 4096
-  br i1 %exitcond21, label %for.cond4.preheader, label %for.cond31.preheader
-
-for.cond34.preheader:                             ; preds = %for.cond31.preheader, %for.inc65
-  %indvars.iv10 = phi i64 [ 0, %for.cond31.preheader ], [ %indvars.iv.next11, %for.inc65 ]
-  br label %for.body36
-
-for.body36:                                       ; preds = %for.cond34.preheader, %for.inc62
-  %indvars.iv7 = phi i64 [ 0, %for.cond34.preheader ], [ %indvars.iv.next8, %for.inc62 ]
-  %arrayidx40 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7
-  %tmp25 = load float, ptr %arrayidx40, align 4, !tbaa !1
-  %mul41 = fmul float %tmp25, %beta
-  store float %mul41, ptr %arrayidx40, align 4, !tbaa !1
-  br label %for.body44
-
-for.body44:                                       ; preds = %for.body36, %for.body44
-  %indvars.iv = phi i64 [ 0, %for.body36 ], [ %indvars.iv.next, %for.body44 ]
-  %arrayidx48 = getelementptr inbounds [4096 x float], ptr %tmp, i64 %indvars.iv10, i64 %indvars.iv
-  %tmp26 = load float, ptr %arrayidx48, align 4, !tbaa !1
-  %arrayidx52 = getelementptr inbounds [4096 x float], ptr %C, i64 %indvars.iv, i64 %indvars.iv7
-  %tmp27 = load float, ptr %arrayidx52, align 4, !tbaa !1
-  %mul53 = fmul float %tmp26, %tmp27
-  %arrayidx57 = getelementptr inbounds [4096 x float], ptr %D, i64 %indvars.iv10, i64 %indvars.iv7
-  %tmp28 = load float, ptr %arrayidx57, align 4, !tbaa !1
-  %add58 = fadd float %tmp28, %mul53
-  store float %add58, ptr %arrayidx57, align 4, !tbaa !1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp ne i64 %indvars.iv.next, 4096
-  br i1 %exitcond, label %for.body44, label %for.inc62
-
-for.inc62:                                        ; preds = %for.body44
-  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
-  %exitcond9 = icmp ne i64 %indvars.iv.next8, 4096
-  br i1 %exitcond9, label %for.body36, label %for.inc65
-
-for.inc65:                                        ; preds = %for.inc62
-  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
-  %exitcond12 = icmp ne i64 %indvars.iv.next11, 4096
-  br i1 %exitcond12, label %for.cond34.preheader, label %for.end67
-
-for.end67:                                        ; preds = %for.inc65
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"float", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/shared-memory-scalar.ll b/polly/test/GPGPU/shared-memory-scalar.ll
deleted file mode 100644
index cd2b1705a388..000000000000
--- a/polly/test/GPGPU/shared-memory-scalar.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-;    void add(float *A, float alpha) {
-;      for (long i = 0; i < 32; i++)
-;        for (long j = 0; j < 10; j++)
-;          A[i] += alpha;
-;    }
-
-; CODE:       read(t0);
-; CODE-NEXT:  sync0();
-; CODE-NEXT:  for (int c3 = 0; c3 <= 9; c3 += 1)
-; CODE-NEXT:    Stmt_bb5(t0, c3);
-; CODE-NEXT:  sync1();
-; CODE-NEXT:  write(t0);
-
-; This test case was intended to test code generation for scalars stored
-; in shared memory. However, after properly marking the scalar as read-only
-; the scalar is not stored any more in shared memory. We still leave this
-; test case as documentation if we ever forget to mark scalars as read-only.
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @add(ptr %A, float %alpha) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb11, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
-  %exitcond1 = icmp ne i64 %i.0, 32
-  br i1 %exitcond1, label %bb3, label %bb13
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb8, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ]
-  %exitcond = icmp ne i64 %j.0, 10
-  br i1 %exitcond, label %bb5, label %bb10
-
-bb5:                                              ; preds = %bb4
-  %tmp = getelementptr inbounds float, ptr %A, i64 %i.0
-  %tmp6 = load float, ptr %tmp, align 4
-  %tmp7 = fadd float %tmp6, %alpha
-  store float %tmp7, ptr %tmp, align 4
-  br label %bb8
-
-bb8:                                              ; preds = %bb5
-  %tmp9 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb10:                                             ; preds = %bb4
-  br label %bb11
-
-bb11:                                             ; preds = %bb10
-  %tmp12 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb13:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/shared-memory-two-dimensional.ll b/polly/test/GPGPU/shared-memory-two-dimensional.ll
deleted file mode 100644
index 6ee51650295f..000000000000
--- a/polly/test/GPGPU/shared-memory-two-dimensional.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-;    void foo(float A[], float b[][8]) {
-;      for (long i = 0; i < 32; i++)
-;        for (long j = 0; j < 16; j++)
-;          for (long k = 0; k < 8; k++)
-;            A[i] += j * k * b[j][k];
-;    }
-
-
-; CODE:      # kernel0
-; CODE-NEXT: {
-; CODE-NEXT:   if (t0 <= 7)
-; CODE-NEXT:     for (int c0 = 0; c0 <= 15; c0 += 1)
-; CODE-NEXT:       read(c0, t0);
-; CODE-NEXT:   read(t0);
-; CODE-NEXT:   sync0();
-; CODE-NEXT:   for (int c3 = 0; c3 <= 15; c3 += 1)
-; CODE-NEXT:     for (int c4 = 0; c4 <= 7; c4 += 1)
-; CODE-NEXT:       Stmt_bb8(t0, c3, c4);
-; CODE-NEXT:   sync1();
-; CODE-NEXT:   write(t0);
-; CODE-NEXT: }
-
-; KERNEL: @shared_MemRef_b = internal addrspace(3) global [16 x [8 x float]] zeroinitializer, align 4
-
-; KERNEL:        %polly.access.mul.MemRef_b = mul nsw i64 %polly.indvar, 8
-; KERNEL-NEXT:   %polly.access.add.MemRef_b = add nsw i64 %polly.access.mul.MemRef_b, %t0
-; KERNEL-NEXT:   %polly.access.MemRef_b = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_b, i64 %polly.access.add.MemRef_b
-; KERNEL-NEXT:   %shared.read = load float, float addrspace(1)* %polly.access.MemRef_b
-; KERNEL-NEXT:   store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_b
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @foo(float* %A, [8 x float]* %b) {
-bb:
-  br label %bb3
-
-bb3:                                              ; preds = %bb22, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp23, %bb22 ]
-  %exitcond2 = icmp ne i64 %i.0, 32
-  br i1 %exitcond2, label %bb4, label %bb24
-
-bb4:                                              ; preds = %bb3
-  br label %bb5
-
-bb5:                                              ; preds = %bb19, %bb4
-  %j.0 = phi i64 [ 0, %bb4 ], [ %tmp20, %bb19 ]
-  %exitcond1 = icmp ne i64 %j.0, 16
-  br i1 %exitcond1, label %bb6, label %bb21
-
-bb6:                                              ; preds = %bb5
-  br label %bb7
-
-bb7:                                              ; preds = %bb16, %bb6
-  %k.0 = phi i64 [ 0, %bb6 ], [ %tmp17, %bb16 ]
-  %exitcond = icmp ne i64 %k.0, 8
-  br i1 %exitcond, label %bb8, label %bb18
-
-bb8:                                              ; preds = %bb7
-  %tmp = mul nuw nsw i64 %j.0, %k.0
-  %tmp9 = sitofp i64 %tmp to float
-  %tmp10 = getelementptr inbounds [8 x float], [8 x float]* %b, i64 %j.0, i64 %k.0
-  %tmp11 = load float, float* %tmp10, align 4
-  %tmp12 = fmul float %tmp9, %tmp11
-  %tmp13 = getelementptr inbounds float, float* %A, i64 %i.0
-  %tmp14 = load float, float* %tmp13, align 4
-  %tmp15 = fadd float %tmp14, %tmp12
-  store float %tmp15, float* %tmp13, align 4
-  br label %bb16
-
-bb16:                                             ; preds = %bb8
-  %tmp17 = add nuw nsw i64 %k.0, 1
-  br label %bb7
-
-bb18:                                             ; preds = %bb7
-  br label %bb19
-
-bb19:                                             ; preds = %bb18
-  %tmp20 = add nuw nsw i64 %j.0, 1
-  br label %bb5
-
-bb21:                                             ; preds = %bb5
-  br label %bb22
-
-bb22:                                             ; preds = %bb21
-  %tmp23 = add nuw nsw i64 %i.0, 1
-  br label %bb3
-
-bb24:                                             ; preds = %bb3
-  ret void
-}
diff --git a/polly/test/GPGPU/shared-memory.ll b/polly/test/GPGPU/shared-memory.ll
deleted file mode 100644
index 920db0d37127..000000000000
--- a/polly/test/GPGPU/shared-memory.ll
+++ /dev/null
@@ -1,83 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-acc-use-shared \
-; RUN: -disable-output -polly-acc-dump-kernel-ir < %s | \
-; RUN: FileCheck -check-prefix=KERNEL %s
-
-; REQUIRES: pollyacc
-
-;    void add(float *A) {
-;      for (long i = 0; i < 32; i++)
-;        for (long j = 0; j < 10; j++)
-;          A[i] += 1;
-;    }
-
-; CODE: # kernel0
-; CODE: {
-; CODE:   read(t0);
-; CODE:   sync0();
-; CODE:   for (int c3 = 0; c3 <= 9; c3 += 1)
-; CODE:     Stmt_bb5(t0, c3);
-; CODE:   sync1();
-; CODE:   write(t0);
-; CODE: }
-
-; KERNEL: @shared_MemRef_A = internal addrspace(3) global [32 x float] zeroinitializer, align 4
-
-; KERNEL:   %polly.access.shared_MemRef_A = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0
-; KERNEL-NEXT:   %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-NEXT:   %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %t0
-; KERNEL-NEXT:   %shared.read = load float, float addrspace(1)* %polly.access.MemRef_A
-; KERNEL-NEXT:   store float %shared.read, float addrspace(3)* %polly.access.shared_MemRef_A
-
-; KERNEL:   %polly.access.shared_MemRef_A3 = getelementptr float, float addrspace(3)* getelementptr inbounds ([32 x float], [32 x float] addrspace(3)* @shared_MemRef_A, i32 0, i32 0), i64 %t0
-; KERNEL-NEXT:   %polly.access.cast.MemRef_A4 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; KERNEL-NEXT:   %polly.access.MemRef_A5 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A4, i64 %t0
-; KERNEL-NEXT:   %shared.write = load float, float addrspace(3)* %polly.access.shared_MemRef_A3
-; KERNEL-NEXT:   store float %shared.write, float addrspace(1)* %polly.access.MemRef_A5
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @add(float* %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb11, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
-  %exitcond1 = icmp ne i64 %i.0, 32
-  br i1 %exitcond1, label %bb3, label %bb13
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb8, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp9, %bb8 ]
-  %exitcond = icmp ne i64 %j.0, 10
-  br i1 %exitcond, label %bb5, label %bb10
-
-bb5:                                              ; preds = %bb4
-  %tmp = getelementptr inbounds float, float* %A, i64 %i.0
-  %tmp6 = load float, float* %tmp, align 4
-  %tmp7 = fadd float %tmp6, 1.000000e+00
-  store float %tmp7, float* %tmp, align 4
-  br label %bb8
-
-bb8:                                              ; preds = %bb5
-  %tmp9 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb10:                                             ; preds = %bb4
-  br label %bb11
-
-bb11:                                             ; preds = %bb10
-  %tmp12 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb13:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/simple-managed-memory-rewrite.ll b/polly/test/GPGPU/simple-managed-memory-rewrite.ll
deleted file mode 100644
index d8c5b320e2b0..000000000000
--- a/polly/test/GPGPU/simple-managed-memory-rewrite.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s |  FileCheck %s --check-prefix=SCOP
-
-; RUN: opt %loadPolly -S  -polly-process-unprofitable -polly-acc-mincompute=0 \
-; RUN: -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
-; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
-
-; REQUIRES: pollyacc
-
-; SCOP:      Function: f
-; SCOP-NEXT: Region: %for.body---%for.end
-; SCOP-NEXT: Max Loop Depth:  1
-; SCOP: i32 MemRef_A[*];
-
-; Check that we generate a constructor call for @A.toptr
-; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr {{.*}}, ptr @A.toptr }]
-
-; Check that we generate a constructor
-; 4 bytes * 100 = 400
-; HOST-IR: define void {{.*}}constructor() {
-; HOST-IR-NEXT: entry:
-; HOST-IR-NEXT:   %mem.raw = call ptr @polly_mallocManaged(i64 400)
-; HOST-IR-NEXT:   store ptr %mem.raw, ptr @A.toptr
-; HOST-IR-NEXT:   ret void
-; HOST-IR-NEXT: }
-
-; HOST-IR-NOT: @A
-
-source_filename = "test.c"
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.12.0"
-
-@A = internal global [100 x i32] zeroinitializer, align 16
-
-define void @f() {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %entry.split, %for.body
-  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [100 x i32], ptr @A, i64 0, i64 %indvars.iv1
-  store i32 42, ptr %arrayidx, align 4, !tbaa !3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 100
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #0
-
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #0
-
-attributes #0 = { argmemonly nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"PIC Level", i32 2}
-!2 = !{!"clang version 6.0.0"}
-!3 = !{!4, !4, i64 0}
-!4 = !{!"int", !5, i64 0}
-!5 = !{!"omnipotent char", !6, i64 0}
-!6 = !{!"Simple C/C++ TBAA"}
diff --git a/polly/test/GPGPU/size-cast.ll b/polly/test/GPGPU/size-cast.ll
deleted file mode 100644
index 5e2c85de4251..000000000000
--- a/polly/test/GPGPU/size-cast.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
-; RUN: FileCheck %s -check-prefix=IR
-
-; REQUIRES: pollyacc
-
-; This test case ensures that we properly sign-extend the types we are using.
-
-; CODE:      if (arg >= 1 && arg1 == 0) {
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_arg3, MemRef_arg3, (arg) * sizeof(double), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(32);
-; CODE-NEXT:     dim3 k0_dimGrid(arg >= 1048545 ? 32768 : (arg + 31) / 32);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_arg3, dev_MemRef_arg2, arg, arg1);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_arg2, dev_MemRef_arg2, (arg) * sizeof(double), cudaMemcpyDeviceToHost));
-; CODE-NEXT  cudaCheckReturn(cudaFree(dev_MemRef_arg3));
-; CODE-NEXT  cudaCheckReturn(cudaFree(dev_MemRef_arg2));
-
-; CODE: # kernel0
-; CODE-NEXT: for (int c0 = 0; c0 <= (arg - 32 * b0 - 1) / 1048576; c0 += 1)
-; CODE-NEXT:   if (arg >= 32 * b0 + t0 + 1048576 * c0 + 1)
-; CODE-NEXT:     Stmt_bb6(0, 32 * b0 + t0 + 1048576 * c0);
-
-; IR-LABEL:  call ptr @polly_initContextCUDA()
-; IR:        sext i32 %arg to i64
-; IR-NEXT:   mul i64
-; IR-NEXT:   @polly_allocateMemoryForDevice
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @hoge(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3) {
-bb:
-  br label %bb4
-
-bb4:                                              ; preds = %bb13, %bb
-  br label %bb6
-
-bb5:                                              ; preds = %bb13
-  ret void
-
-bb6:                                              ; preds = %bb6, %bb4
-  %tmp = phi i64 [ 0, %bb4 ], [ %tmp10, %bb6 ]
-  %tmp7 = getelementptr inbounds double, ptr %arg3, i64 %tmp
-  %tmp8 = load double, ptr %tmp7, align 8
-  %tmp9 = getelementptr inbounds [1000 x double], ptr %arg2, i64 0, i64 %tmp
-  store double %tmp8, ptr %tmp9, align 8
-  %tmp10 = add nuw nsw i64 %tmp, 1
-  %tmp11 = zext i32 %arg to i64
-  %tmp12 = icmp ne i64 %tmp10, %tmp11
-  br i1 %tmp12, label %bb6, label %bb13
-
-bb13:                                             ; preds = %bb6
-  %tmp14 = zext i32 %arg1 to i64
-  %tmp15 = icmp ne i64 0, %tmp14
-  br i1 %tmp15, label %bb4, label %bb5
-}
diff --git a/polly/test/GPGPU/spir-codegen.ll b/polly/test/GPGPU/spir-codegen.ll
deleted file mode 100644
index 3715e1ec4427..000000000000
--- a/polly/test/GPGPU/spir-codegen.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt -opaque-pointers=0 %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-gpu-arch=spir32 \
-; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
-; RUN: FileCheck %s
-
-; REQUIRES: pollyacc
-
-; CHECK:      target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-; CHECK-NEXT: target triple = "spir-unknown-unknown"
-
-; CHECK-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   %0 = call i32 @__gen_ocl_get_group_id0()
-; CHECK-NEXT:   %__gen_ocl_get_group_id0 = zext i32 %0 to i64
-; CHECK-NEXT:   %1 = call i32 @__gen_ocl_get_group_id1()
-; CHECK-NEXT:   %__gen_ocl_get_group_id1 = zext i32 %1 to i64
-; CHECK-NEXT:   %2 = call i32 @__gen_ocl_get_local_id0()
-; CHECK-NEXT:   %__gen_ocl_get_local_id0 = zext i32 %2 to i64
-; CHECK-NEXT:   %3 = call i32 @__gen_ocl_get_local_id1()
-; CHECK-NEXT:   %__gen_ocl_get_local_id1 = zext i32 %3 to i64
-; CHECK-NEXT:   br label %polly.loop_preheader
-
-; CHECK-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
-; CHECK-NEXT:   ret void
-
-; CHECK-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
-; CHECK-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
-; CHECK-NEXT:   %4 = mul nsw i64 32, %__gen_ocl_get_group_id0
-; CHECK-NEXT:   %5 = add nsw i64 %4, %__gen_ocl_get_local_id0
-; CHECK-NEXT:   %6 = mul nsw i64 32, %__gen_ocl_get_group_id1
-; CHECK-NEXT:   %7 = add nsw i64 %6, %__gen_ocl_get_local_id1
-; CHECK-NEXT:   %8 = mul nsw i64 16, %polly.indvar
-; CHECK-NEXT:   %9 = add nsw i64 %7, %8
-; CHECK-NEXT:   br label %polly.stmt.bb5
-
-; CHECK-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
-; CHECK-NEXT:   %10 = mul i64 %5, %9
-; CHECK-NEXT:   %p_tmp6 = sitofp i64 %10 to float
-; CHECK-NEXT:   %polly.access.cast.MemRef_A = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; CHECK-NEXT:   %11 = mul nsw i64 32, %__gen_ocl_get_group_id0
-; CHECK-NEXT:   %12 = add nsw i64 %11, %__gen_ocl_get_local_id0
-; CHECK-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
-; CHECK-NEXT:   %13 = mul nsw i64 32, %__gen_ocl_get_group_id1
-; CHECK-NEXT:   %14 = add nsw i64 %13, %__gen_ocl_get_local_id1
-; CHECK-NEXT:   %15 = mul nsw i64 16, %polly.indvar
-; CHECK-NEXT:   %16 = add nsw i64 %14, %15
-; CHECK-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
-; CHECK-NEXT:   %polly.access.MemRef_A = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
-; CHECK-NEXT:   %tmp8_p_scalar_ = load float, float addrspace(1)* %polly.access.MemRef_A, align 4
-; CHECK-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
-; CHECK-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8 addrspace(1)* %MemRef_A to float addrspace(1)*
-; CHECK-NEXT:   %17 = mul nsw i64 32, %__gen_ocl_get_group_id0
-; CHECK-NEXT:   %18 = add nsw i64 %17, %__gen_ocl_get_local_id0
-; CHECK-NEXT:   %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
-; CHECK-NEXT:   %19 = mul nsw i64 32, %__gen_ocl_get_group_id1
-; CHECK-NEXT:   %20 = add nsw i64 %19, %__gen_ocl_get_local_id1
-; CHECK-NEXT:   %21 = mul nsw i64 16, %polly.indvar
-; CHECK-NEXT:   %22 = add nsw i64 %20, %21
-; CHECK-NEXT:   %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
-; CHECK-NEXT:   %polly.access.MemRef_A4 = getelementptr float, float addrspace(1)* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3
-; CHECK-NEXT:   store float %p_tmp9, float addrspace(1)* %polly.access.MemRef_A4, align 4
-; CHECK-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
-; CHECK-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar_next, 1
-; CHECK-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
-
-; CHECK-LABEL: polly.loop_preheader:                             ; preds = %entry
-; CHECK-NEXT:   br label %polly.loop_header
-
-; CHECK: attributes #0 = { "polly.skip.fn" }
-
-;    void double_parallel_loop(float A[][1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        for (long j = 0; j < 1024; j++)
-;          A[i][j] += i * j;
-;    }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @double_parallel_loop([1024 x float]* %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb13, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
-  %exitcond1 = icmp ne i64 %i.0, 1024
-  br i1 %exitcond1, label %bb3, label %bb15
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb10, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %j.0, 1024
-  br i1 %exitcond, label %bb5, label %bb12
-
-bb5:                                              ; preds = %bb4
-  %tmp = mul nuw nsw i64 %i.0, %j.0
-  %tmp6 = sitofp i64 %tmp to float
-  %tmp7 = getelementptr inbounds [1024 x float], [1024 x float]* %A, i64 %i.0, i64 %j.0
-  %tmp8 = load float, float* %tmp7, align 4
-  %tmp9 = fadd float %tmp8, %tmp6
-  store float %tmp9, float* %tmp7, align 4
-  br label %bb10
-
-bb10:                                             ; preds = %bb5
-  %tmp11 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12
-  %tmp14 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb15:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/spir-typesize.ll b/polly/test/GPGPU/spir-typesize.ll
deleted file mode 100644
index fce17c54e6e9..000000000000
--- a/polly/test/GPGPU/spir-typesize.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-gpu-arch=spir64 \
-; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
-; RUN: FileCheck -check-prefix=I64 %s
-
-; RUN: opt %loadPolly -polly-codegen-ppcg \
-; RUN: -polly-gpu-arch=spir32 \
-; RUN: -polly-acc-dump-kernel-ir -polly-process-unprofitable -disable-output < %s | \
-; RUN: FileCheck -check-prefix=I32 %s
-
-; REQUIRES: pollyacc
-
-; This test case checks whether the openCl runtime functions (get_local_id/get_group_id) return the right types for 32 and 64bit devices.
-
-; I32:      target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-; I32-NEXT: target triple = "spir-unknown-unknown"
-
-; I32-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
-; I32-NEXT: entry:
-; I32-NEXT:   %0 = call i32 @__gen_ocl_get_group_id0()
-; I32-NEXT:   %__gen_ocl_get_group_id0 = zext i32 %0 to i64
-; I32-NEXT:   %1 = call i32 @__gen_ocl_get_group_id1()
-; I32-NEXT:   %__gen_ocl_get_group_id1 = zext i32 %1 to i64
-; I32-NEXT:   %2 = call i32 @__gen_ocl_get_local_id0()
-; I32-NEXT:   %__gen_ocl_get_local_id0 = zext i32 %2 to i64
-; I32-NEXT:   %3 = call i32 @__gen_ocl_get_local_id1()
-; I32-NEXT:   %__gen_ocl_get_local_id1 = zext i32 %3 to i64
-; I32-NEXT:   br label %polly.loop_preheader
-
-; I64:       target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-; I64-next:  target triple = "spir64-unknown-unknown"
-
-; I64-LABEL: define spir_kernel void @FUNC_double_parallel_loop_SCOP_0_KERNEL_0(ptr addrspace(1) %MemRef_A) #0 !kernel_arg_addr_space !0 !kernel_arg_name !1 !kernel_arg_access_qual !1 !kernel_arg_type !1 !kernel_arg_type_qual !1 !kernel_arg_base_type !1 {
-; I64-NEXT: entry:
-; I64-NEXT:   %0 = call i64 @__gen_ocl_get_group_id0()
-; I64-NEXT:   %1 = call i64 @__gen_ocl_get_group_id1()
-; I64-NEXT:   %2 = call i64 @__gen_ocl_get_local_id0()
-; I64-NEXT:   %3 = call i64 @__gen_ocl_get_local_id1()
-; I64-NEXT:   br label %polly.loop_preheader
-
-
-;    void double_parallel_loop(float A[][1024]) {
-;      for (long i = 0; i < 1024; i++)
-;        for (long j = 0; j < 1024; j++)
-;          A[i][j] += i * j;
-;    }
-;
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @double_parallel_loop(ptr %A) {
-bb:
-  br label %bb2
-
-bb2:                                              ; preds = %bb13, %bb
-  %i.0 = phi i64 [ 0, %bb ], [ %tmp14, %bb13 ]
-  %exitcond1 = icmp ne i64 %i.0, 1024
-  br i1 %exitcond1, label %bb3, label %bb15
-
-bb3:                                              ; preds = %bb2
-  br label %bb4
-
-bb4:                                              ; preds = %bb10, %bb3
-  %j.0 = phi i64 [ 0, %bb3 ], [ %tmp11, %bb10 ]
-  %exitcond = icmp ne i64 %j.0, 1024
-  br i1 %exitcond, label %bb5, label %bb12
-
-bb5:                                              ; preds = %bb4
-  %tmp = mul nuw nsw i64 %i.0, %j.0
-  %tmp6 = sitofp i64 %tmp to float
-  %tmp7 = getelementptr inbounds [1024 x float], ptr %A, i64 %i.0, i64 %j.0
-  %tmp8 = load float, ptr %tmp7, align 4
-  %tmp9 = fadd float %tmp8, %tmp6
-  store float %tmp9, ptr %tmp7, align 4
-  br label %bb10
-
-bb10:                                             ; preds = %bb5
-  %tmp11 = add nuw nsw i64 %j.0, 1
-  br label %bb4
-
-bb12:                                             ; preds = %bb4
-  br label %bb13
-
-bb13:                                             ; preds = %bb12
-  %tmp14 = add nuw nsw i64 %i.0, 1
-  br label %bb2
-
-bb15:                                             ; preds = %bb2
-  ret void
-}
diff --git a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll b/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll
deleted file mode 100644
index 6fd14cbfbcd1..000000000000
--- a/polly/test/GPGPU/unknown-fn-call-not-copied-into-kernel.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s
-
-; Check that we do not create a kernel if there is an
-; unknown function call in a candidate kernel.
-
-; Check that we model the kernel as a scop.
-; SCOP:      Function: f
-; SCOP-NEXT:     Region: %entry.split---%for.end13
-
-; If a kernel were generated, then this code would have been part of the kernel
-; and not the `.ll` file that is generated.
-; CHECK:       %conv = fpext float %0 to double
-; CHECK-NEXT:  %1 = tail call double @extern.fn(double %conv)
-; CHECK-NEXT:  %conv6 = fptrunc double %1 to float
-
-; REQUIRES: pollyacc
-
-; static const int N = 1000;
-; void f(float A[N][N], int n, float B[N][N]) {
-;   for(int i = 0; i < n; i++) {
-;     for(int j = 0; j < n; j++) {
-;       B[i][j] = extern_fn(A[i][j], 3);
-;     }
-;
-;   }
-; }
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.11.0"
-
-define void @f(ptr %A, i32 %n, ptr %B) {
-entry:
-  br label %entry.split
-
-entry.split:                                      ; preds = %entry
-  %cmp3 = icmp sgt i32 %n, 0
-  br i1 %cmp3, label %for.cond1.preheader.lr.ph, label %for.end13
-
-for.cond1.preheader.lr.ph:                        ; preds = %entry.split
-  br label %for.cond1.preheader
-
-for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.inc11
-  %indvars.iv5 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next6, %for.inc11 ]
-  %cmp21 = icmp sgt i32 %n, 0
-  br i1 %cmp21, label %for.body3.lr.ph, label %for.inc11
-
-for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3.lr.ph, %for.body3
-  %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
-  %arrayidx5 = getelementptr inbounds [1000 x float], ptr %A, i64 %indvars.iv5, i64 %indvars.iv
-  %0 = load float, ptr %arrayidx5, align 4
-  %conv = fpext float %0 to double
-  %1 = tail call double @extern.fn(double %conv)
-  %conv6 = fptrunc double %1 to float
-  %arrayidx10 = getelementptr inbounds [1000 x float], ptr %B, i64 %indvars.iv5, i64 %indvars.iv
-  store float %conv6, ptr %arrayidx10, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %wide.trip.count = zext i32 %n to i64
-  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.body3, label %for.cond1.for.inc11_crit_edge
-
-for.cond1.for.inc11_crit_edge:                    ; preds = %for.body3
-  br label %for.inc11
-
-for.inc11:                                        ; preds = %for.cond1.for.inc11_crit_edge, %for.cond1.preheader
-  %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
-  %wide.trip.count7 = zext i32 %n to i64
-  %exitcond8 = icmp ne i64 %indvars.iv.next6, %wide.trip.count7
-  br i1 %exitcond8, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge
-
-for.cond.for.end13_crit_edge:                     ; preds = %for.inc11
-  br label %for.end13
-
-for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry.split
-  ret void
-}
-
-declare double @extern.fn(double) #0
-attributes #0 = { readnone }
diff --git a/polly/test/GPGPU/untouched-arrays.ll b/polly/test/GPGPU/untouched-arrays.ll
deleted file mode 100644
index 5c7e0c7b543b..000000000000
--- a/polly/test/GPGPU/untouched-arrays.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
-; RUN: -disable-output < %s | \
-; RUN: FileCheck -check-prefix=CODE %s
-
-; REQUIRES: pollyacc
-
-; CODE:        cudaCheckReturn(cudaMemcpy(dev_MemRef_global_1, MemRef_global_1, (142) * sizeof(i32), cudaMemcpyHostToDevice));
-; CODE-NEXT:   {
-; CODE-NEXT:     dim3 k0_dimBlock(10);
-; CODE-NEXT:     dim3 k0_dimGrid(1);
-; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_global_1);
-; CODE-NEXT:     cudaCheckKernel();
-; CODE-NEXT:   }
-
-; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_global_1, dev_MemRef_global_1, (142) * sizeof(i32), cudaMemcpyDeviceToHost));
-; CODE:   cudaCheckReturn(cudaFree(dev_MemRef_global_1));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb33(t0, 0);
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.hoge = type { [23 x i16], [22 x i16], [14 x i16], [13 x i16] }
-
-@global = external global [9 x %struct.hoge], align 16
-@global.1 = external global [9 x [152 x i32]], align 16
-
-; Function Attrs: nounwind uwtable
-define void @widget() #0 {
-bb:
-  br label %bb1
-
-bb1:                                              ; preds = %bb1, %bb
-  br i1 undef, label %bb1, label %bb2
-
-bb2:                                              ; preds = %bb2, %bb1
-  br i1 undef, label %bb2, label %bb3
-
-bb3:                                              ; preds = %bb3, %bb2
-  br i1 undef, label %bb3, label %bb4
-
-bb4:                                              ; preds = %bb4, %bb3
-  br i1 undef, label %bb4, label %bb5
-
-bb5:                                              ; preds = %bb5, %bb4
-  br i1 undef, label %bb5, label %bb6
-
-bb6:                                              ; preds = %bb6, %bb5
-  br i1 undef, label %bb6, label %bb7
-
-bb7:                                              ; preds = %bb7, %bb6
-  br i1 undef, label %bb7, label %bb8
-
-bb8:                                              ; preds = %bb8, %bb7
-  br i1 undef, label %bb8, label %bb9
-
-bb9:                                              ; preds = %bb8
-  br label %bb10
-
-bb10:                                             ; preds = %bb12, %bb9
-  br label %bb11
-
-bb11:                                             ; preds = %bb11, %bb10
-  br i1 undef, label %bb11, label %bb12
-
-bb12:                                             ; preds = %bb11
-  br i1 undef, label %bb10, label %bb13
-
-bb13:                                             ; preds = %bb18, %bb12
-  br i1 undef, label %bb16, label %bb14
-
-bb14:                                             ; preds = %bb16, %bb13
-  br i1 undef, label %bb15, label %bb18
-
-bb15:                                             ; preds = %bb14
-  br label %bb17
-
-bb16:                                             ; preds = %bb16, %bb13
-  br i1 undef, label %bb16, label %bb14
-
-bb17:                                             ; preds = %bb17, %bb15
-  br i1 undef, label %bb17, label %bb18
-
-bb18:                                             ; preds = %bb17, %bb14
-  br i1 undef, label %bb13, label %bb19
-
-bb19:                                             ; preds = %bb25, %bb18
-  br label %bb20
-
-bb20:                                             ; preds = %bb24, %bb19
-  br i1 undef, label %bb21, label %bb24
-
-bb21:                                             ; preds = %bb20
-  br i1 undef, label %bb23, label %bb22
-
-bb22:                                             ; preds = %bb21
-  br label %bb24
-
-bb23:                                             ; preds = %bb21
-  br label %bb24
-
-bb24:                                             ; preds = %bb23, %bb22, %bb20
-  br i1 undef, label %bb20, label %bb25
-
-bb25:                                             ; preds = %bb24
-  br i1 undef, label %bb19, label %bb26
-
-bb26:                                             ; preds = %bb56, %bb25
-  %tmp = phi ptr [ undef, %bb56 ], [ getelementptr inbounds ([9 x [152 x i32]], ptr @global.1, i64 0, i64 0, i64 32), %bb25 ]
-  br label %bb27
-
-bb27:                                             ; preds = %bb27, %bb26
-  br i1 undef, label %bb27, label %bb28
-
-bb28:                                             ; preds = %bb27
-  br label %bb30
-
-bb30:                                             ; preds = %bb38, %bb28
-  %tmp31 = phi i32 [ 3, %bb28 ], [ %tmp40, %bb38 ]
-  %tmp32 = phi ptr [ %tmp, %bb28 ], [ %tmp39, %bb38 ]
-  br label %bb33
-
-bb33:                                             ; preds = %bb33, %bb30
-  %tmp34 = phi i32 [ 0, %bb30 ], [ %tmp37, %bb33 ]
-  %tmp35 = phi ptr [ %tmp32, %bb30 ], [ undef, %bb33 ]
-  %tmp36 = getelementptr inbounds i32, ptr %tmp35, i64 1
-  store i32 undef, ptr %tmp36, align 4, !tbaa !1
-  %tmp37 = add nuw nsw i32 %tmp34, 1
-  br i1 false, label %bb33, label %bb38
-
-bb38:                                             ; preds = %bb33
-  %tmp39 = getelementptr i32, ptr %tmp32, i64 12
-  %tmp40 = add nuw nsw i32 %tmp31, 1
-  %tmp41 = icmp ne i32 %tmp40, 13
-  br i1 %tmp41, label %bb30, label %bb42
-
-bb42:                                             ; preds = %bb38
-  %tmp43 = getelementptr inbounds [9 x %struct.hoge], ptr @global, i64 0, i64 0, i32 3, i64 0
-  br label %bb44
-
-bb44:                                             ; preds = %bb51, %bb42
-  %tmp45 = phi i32 [ 0, %bb42 ], [ %tmp52, %bb51 ]
-  %tmp46 = phi ptr [ %tmp43, %bb42 ], [ undef, %bb51 ]
-  %tmp47 = load i16, ptr %tmp46, align 2, !tbaa !5
-  br label %bb48
-
-bb48:                                             ; preds = %bb48, %bb44
-  %tmp49 = phi i32 [ 0, %bb44 ], [ %tmp50, %bb48 ]
-  %tmp50 = add nuw nsw i32 %tmp49, 1
-  br i1 false, label %bb48, label %bb51
-
-bb51:                                             ; preds = %bb48
-  %tmp52 = add nuw nsw i32 %tmp45, 1
-  %tmp53 = icmp ne i32 %tmp52, 13
-  br i1 %tmp53, label %bb44, label %bb54
-
-bb54:                                             ; preds = %bb51
-  br label %bb55
-
-bb55:                                             ; preds = %bb55, %bb54
-  br i1 undef, label %bb55, label %bb56
-
-bb56:                                             ; preds = %bb55
-  br i1 undef, label %bb26, label %bb57
-
-bb57:                                             ; preds = %bb60, %bb56
-  br label %bb58
-
-bb58:                                             ; preds = %bb58, %bb57
-  br i1 undef, label %bb58, label %bb59
-
-bb59:                                             ; preds = %bb59, %bb58
-  br i1 undef, label %bb59, label %bb60
-
-bb60:                                             ; preds = %bb59
-  br i1 undef, label %bb57, label %bb61
-
-bb61:                                             ; preds = %bb65, %bb60
-  br label %bb62
-
-bb62:                                             ; preds = %bb64, %bb61
-  br label %bb63
-
-bb63:                                             ; preds = %bb63, %bb62
-  br i1 undef, label %bb63, label %bb64
-
-bb64:                                             ; preds = %bb63
-  br i1 undef, label %bb62, label %bb65
-
-bb65:                                             ; preds = %bb64
-  br i1 undef, label %bb61, label %bb66
-
-bb66:                                             ; preds = %bb70, %bb65
-  br label %bb67
-
-bb67:                                             ; preds = %bb69, %bb66
-  br label %bb68
-
-bb68:                                             ; preds = %bb68, %bb67
-  br i1 undef, label %bb68, label %bb69
-
-bb69:                                             ; preds = %bb68
-  br i1 undef, label %bb67, label %bb70
-
-bb70:                                             ; preds = %bb69
-  br i1 undef, label %bb66, label %bb71
-
-bb71:                                             ; preds = %bb73, %bb70
-  br label %bb72
-
-bb72:                                             ; preds = %bb72, %bb71
-  br i1 undef, label %bb72, label %bb73
-
-bb73:                                             ; preds = %bb72
-  br i1 undef, label %bb71, label %bb74
-
-bb74:                                             ; preds = %bb80, %bb73
-  br label %bb75
-
-bb75:                                             ; preds = %bb79, %bb74
-  br label %bb76
-
-bb76:                                             ; preds = %bb78, %bb75
-  br label %bb77
-
-bb77:                                             ; preds = %bb77, %bb76
-  br i1 undef, label %bb77, label %bb78
-
-bb78:                                             ; preds = %bb77
-  br i1 undef, label %bb76, label %bb79
-
-bb79:                                             ; preds = %bb78
-  br i1 undef, label %bb75, label %bb80
-
-bb80:                                             ; preds = %bb79
-  br i1 undef, label %bb74, label %bb81
-
-bb81:                                             ; preds = %bb85, %bb80
-  br label %bb82
-
-bb82:                                             ; preds = %bb84, %bb81
-  br label %bb83
-
-bb83:                                             ; preds = %bb83, %bb82
-  br i1 undef, label %bb83, label %bb84
-
-bb84:                                             ; preds = %bb83
-  br i1 undef, label %bb82, label %bb85
-
-bb85:                                             ; preds = %bb84
-  br i1 undef, label %bb81, label %bb86
-
-bb86:                                             ; preds = %bb85
-  ret void
-}
-
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 4.0.0"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"int", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"short", !3, i64 0}
diff --git a/polly/test/Unit/lit.site.cfg.in b/polly/test/Unit/lit.site.cfg.in
index 2aeaf197f06c..a93b8b7a527b 100644
--- a/polly/test/Unit/lit.site.cfg.in
+++ b/polly/test/Unit/lit.site.cfg.in
@@ -11,7 +11,6 @@ config.polly_obj_root = "@POLLY_BINARY_DIR@"
 config.polly_lib_dir = "@POLLY_LIB_DIR@"
 config.shlibdir = "@SHLIBDIR@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
-config.enable_gpgpu_codegen = "@GPU_CODEGEN@"
 config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@"
 config.has_unittests = @POLLY_GTEST_AVAIL@
 
diff --git a/polly/test/lit.cfg b/polly/test/lit.cfg
index 41e3a589c61e..0943507ebe50 100644
--- a/polly/test/lit.cfg
+++ b/polly/test/lit.cfg
@@ -70,6 +70,4 @@ except OSError:
     print("Could not find llvm-config in " + config.llvm_tools_dir)
     exit(42)
 
-if re.search(r'NVPTX', llvm_config_cmd.stdout.read().decode('ascii')):
-    config.available_features.add('nvptx-registered-target')
 llvm_config_cmd.wait()
diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in
index 4aed9875c3fb..b44061260834 100644
--- a/polly/test/lit.site.cfg.in
+++ b/polly/test/lit.site.cfg.in
@@ -7,7 +7,6 @@ config.llvm_libs_dir = lit_config.substitute("@LLVM_LIBS_DIR@")
 config.polly_obj_root = "@POLLY_BINARY_DIR@"
 config.polly_lib_dir = "@POLLY_LIB_DIR@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
-config.enable_gpgpu_codegen = "@GPU_CODEGEN@"
 config.llvm_polly_link_into_tools = "@LLVM_POLLY_LINK_INTO_TOOLS@"
 config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.extra_paths = "@POLLY_TEST_EXTRA_PATHS@".split(";")
@@ -50,9 +49,6 @@ else:
     config.substitutions.append(('%loadNPMPolly', commonOpts ))
 
 
-if config.enable_gpgpu_codegen == 'TRUE' :
-    config.available_features.add('pollyacc')
-
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
author	Michael Kruse <llvm-project@meinersbur.de>	2023-01-25 14:03:57 -0600
committer	Michael Kruse <llvm-project@meinersbur.de>	2023-03-08 17:33:04 -0600
commit	19afbfe33156d211fa959dadeea46cd17b9c723c (patch)
tree	db53498143b16127c6c0e22a671a8d11eece4152 /polly/test
parent	115c7beda74f3cfaf83b91d14bc97a39bff4cf19 (diff)
download	llvm-19afbfe33156d211fa959dadeea46cd17b9c723c.tar.gz