summaryrefslogtreecommitdiff
path: root/polly/docs
diff options
context:
space:
mode:
authorMichael Kruse <llvm@meinersbur.de>2018-09-26 15:21:43 +0000
committerMichael Kruse <llvm@meinersbur.de>2018-09-26 15:21:43 +0000
commitfe7bd34b79e59e2af209890918c01648ce4b9542 (patch)
tree38900d34b3b0531cc652a51c53b3f578a088e2fd /polly/docs
parentea4f20c6bef7ee65a820e65f93efe0af97997a14 (diff)
downloadllvm-fe7bd34b79e59e2af209890918c01648ce4b9542.tar.gz
Move www/experiments to docs/experiments
llvm-svn: 343118
Diffstat (limited to 'polly/docs')
-rw-r--r--polly/docs/experiments/matmul/matmul.c52
-rwxr-xr-xpolly/docs/experiments/matmul/matmul.normalopt.exebin0 -> 8849 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.normalopt.llbin0 -> 2620 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.normalopt.s274
-rwxr-xr-xpolly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exebin0 -> 13413 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.llbin0 -> 5980 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s754
-rwxr-xr-xpolly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exebin0 -> 8923 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.llbin0 -> 3724 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s396
-rwxr-xr-xpolly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exebin0 -> 8916 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.llbin0 -> 3612 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s390
-rwxr-xr-xpolly/docs/experiments/matmul/matmul.polly.interchanged.exebin0 -> 8910 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged.llbin0 -> 3128 bytes
-rw-r--r--polly/docs/experiments/matmul/matmul.polly.interchanged.s286
-rw-r--r--polly/docs/experiments/matmul/matmul.preopt.ll171
-rw-r--r--polly/docs/experiments/matmul/matmul.s269
-rwxr-xr-xpolly/docs/experiments/matmul/runall.sh95
-rw-r--r--polly/docs/experiments/matmul/scops.init_array.dot39
-rw-r--r--polly/docs/experiments/matmul/scops.init_array.dot.pngbin0 -> 154236 bytes
-rw-r--r--polly/docs/experiments/matmul/scops.main.dot50
-rw-r--r--polly/docs/experiments/matmul/scops.main.dot.pngbin0 -> 190505 bytes
-rw-r--r--polly/docs/experiments/matmul/scops.print_array.dot51
-rw-r--r--polly/docs/experiments/matmul/scops.print_array.dot.pngbin0 -> 200271 bytes
-rw-r--r--polly/docs/experiments/matmul/scopsonly.init_array.dot39
-rw-r--r--polly/docs/experiments/matmul/scopsonly.init_array.dot.pngbin0 -> 28261 bytes
-rw-r--r--polly/docs/experiments/matmul/scopsonly.main.dot50
-rw-r--r--polly/docs/experiments/matmul/scopsonly.main.dot.pngbin0 -> 43325 bytes
-rw-r--r--polly/docs/experiments/matmul/scopsonly.print_array.dot51
-rw-r--r--polly/docs/experiments/matmul/scopsonly.print_array.dot.pngbin0 -> 52118 bytes
31 files changed, 2967 insertions, 0 deletions
diff --git a/polly/docs/experiments/matmul/matmul.c b/polly/docs/experiments/matmul/matmul.c
new file mode 100644
index 000000000000..49fffc808f3e
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.c
@@ -0,0 +1,52 @@
+#include <stdio.h>
+
+#define N 1536 /* matrix dimension: A, B and C are each N x N */
+float A[N][N]; /* filled by init_array(); read as A[i][k] in main() */
+float B[N][N]; /* filled by init_array(); read as B[k][j] in main() */
+float C[N][N]; /* written by main(); printed by print_array() */
+
+/* Fill A and B with the same deterministic pattern:
+ * element [r][c] is (1 + (r*c) % 1024) / 2.0, narrowed to float. */
+void init_array()
+{
+    int r, c;
+
+    for (r = 0; r < N; ++r) {
+        for (c = 0; c < N; ++c) {
+            /* Evaluated in double, then converted to float on assignment. */
+            const float value = (1 + (r * c) % 1024) / 2.0;
+
+            A[r][c] = value;
+            B[r][c] = value;
+        }
+    }
+}
+
+/* Write every element of C to stdout using "%lf ", breaking the line after
+ * each 80th element of a row and once more at the end of every row. */
+void print_array()
+{
+    int row, col;
+
+    for (row = 0; row < N; ++row) {
+        for (col = 0; col < N; ++col) {
+            fprintf(stdout, "%lf ", C[row][col]);
+            /* Same condition as col % 80 == 79 for non-negative col. */
+            if ((col + 1) % 80 == 0) {
+                fprintf(stdout, "\n");
+            }
+        }
+        fprintf(stdout, "\n");
+    }
+}
+
+/* Fill A and B via init_array(), then compute C = A * B with the plain
+ * i/j/k loop nest. When TEST is defined the result is printed with
+ * print_array(). Always returns 0. */
+int main()
+{
+    int i, j, k;
+
+    init_array();
+
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++) {
+            C[i][j] = 0;
+            /* Accumulate directly into C[i][j], one product per k. */
+            for (k = 0; k < N; k++)
+                C[i][j] = C[i][j] + A[i][k] * B[k][j];
+        }
+    }
+
+#ifdef TEST
+    print_array();
+#endif
+    return 0;
+}
diff --git a/polly/docs/experiments/matmul/matmul.normalopt.exe b/polly/docs/experiments/matmul/matmul.normalopt.exe
new file mode 100755
index 000000000000..cdb9e67af454
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.normalopt.exe
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.normalopt.ll b/polly/docs/experiments/matmul/matmul.normalopt.ll
new file mode 100644
index 000000000000..ba792c29f701
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.normalopt.ll
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.normalopt.s b/polly/docs/experiments/matmul/matmul.normalopt.s
new file mode 100644
index 000000000000..079af702a14f
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.normalopt.s
@@ -0,0 +1,274 @@
+ .file "matmul.normalopt.ll"
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI0_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl init_array
+ .align 16, 0x90
+ .type init_array,@function
+init_array: # @init_array: stores (1 + (row*col) % 1024) * 0.5 as a float into A[row][col] and B[row][col]
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp2:
+ .cfi_def_cfa_offset 16
+.Ltmp3:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp4:
+ .cfi_def_cfa_register %rbp
+ xorl %r8d, %r8d # row counter = 0
+ vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = 0.5
+ .align 16, 0x90
+.LBB0_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB0_2 Depth 2
+ xorl %ecx, %ecx # column counter = 0
+ .align 16, 0x90
+.LBB0_2: # %for.body3
+ # Parent Loop BB0_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %r8d, %edx # edx = row * column
+ movl %edx, %esi
+ sarl $31, %esi
+ shrl $22, %esi # esi = 1023 if the product is negative, else 0
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi # esi = -(product truncated to a multiple of 1024)
+ movq %r8, %rax
+ shlq $11, %rax
+ leal 1(%rdx,%rsi), %edi # edi = 1 + product % 1024
+ leaq (%rax,%rax,2), %rsi # rsi = row * 6144, the row's byte offset
+ leaq 1(%rcx), %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1
+ vmulsd %xmm0, %xmm1, %xmm1 # scale by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1 # narrow to single precision
+ vmovss %xmm1, A(%rsi,%rcx,4)
+ vmovss %xmm1, B(%rsi,%rcx,4)
+ movq %rdx, %rcx
+ jne .LBB0_2
+# BB#3: # %for.inc17
+ # in Loop: Header=BB0_1 Depth=1
+ incq %r8
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB0_1
+# BB#4: # %for.end19
+ popq %rbp
+ ret
+.Ltmp5:
+ .size init_array, .Ltmp5-init_array
+ .cfi_endproc
+
+ .globl print_array
+ .align 16, 0x90
+ .type print_array,@function
+print_array: # @print_array: fprintf(stdout, .L.str, x) for every element of C; fputc('\n') after each 80th column and after each row
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp9:
+ .cfi_def_cfa_offset 16
+.Ltmp10:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp11:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r12
+ pushq %rbx
+.Ltmp12:
+ .cfi_offset %rbx, -48
+.Ltmp13:
+ .cfi_offset %r12, -40
+.Ltmp14:
+ .cfi_offset %r14, -32
+.Ltmp15:
+ .cfi_offset %r15, -24
+ xorl %r14d, %r14d # row counter = 0
+ movl $C, %r15d # r15 = address of the current row of C
+ .align 16, 0x90
+.LBB1_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB1_2 Depth 2
+ movq stdout(%rip), %rax
+ movq %r15, %r12 # r12 walks the elements of the current row
+ xorl %ebx, %ebx # column counter = 0
+ .align 16, 0x90
+.LBB1_2: # %for.body3
+ # Parent Loop BB1_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ vmovss (%r12), %xmm0
+ vcvtss2sd %xmm0, %xmm0, %xmm0 # float -> double for the variadic call
+ movq %rax, %rdi
+ movl $.L.str, %esi
+ movb $1, %al # one vector-register argument (SysV variadic convention)
+ callq fprintf
+ movslq %ebx, %rax
+ imulq $1717986919, %rax, %rcx # imm = 0x66666667
+ movq %rcx, %rdx
+ shrq $63, %rdx
+ sarq $37, %rcx
+ addl %edx, %ecx # ecx = column / 80 (multiply-and-shift division)
+ imull $80, %ecx, %ecx
+ subl %ecx, %eax # eax = column % 80
+ cmpl $79, %eax
+ jne .LBB1_4
+# BB#3: # %if.then
+ # in Loop: Header=BB1_2 Depth=2
+ movq stdout(%rip), %rsi
+ movl $10, %edi # '\n'
+ callq fputc
+.LBB1_4: # %for.inc
+ # in Loop: Header=BB1_2 Depth=2
+ addq $4, %r12
+ incq %rbx
+ movq stdout(%rip), %rax
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB1_2
+# BB#5: # %for.end
+ # in Loop: Header=BB1_1 Depth=1
+ movl $10, %edi # '\n' terminating the row
+ movq %rax, %rsi
+ callq fputc
+ addq $6144, %r15 # imm = 0x1800
+ incq %r14
+ cmpq $1536, %r14 # imm = 0x600
+ jne .LBB1_1
+# BB#6: # %for.end12
+ popq %rbx
+ popq %r12
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp16:
+ .size print_array, .Ltmp16-print_array
+ .cfi_endproc
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI2_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl main
+ .align 16, 0x90
+ .type main,@function
+main: # @main: initialises A and B with an inlined copy of the init loop, then computes C = A * B with a three-deep scalar loop nest
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp19:
+ .cfi_def_cfa_offset 16
+.Ltmp20:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp21:
+ .cfi_def_cfa_register %rbp
+ xorl %r8d, %r8d # row counter = 0
+ vmovsd .LCPI2_0(%rip), %xmm0 # xmm0 = 0.5
+ .align 16, 0x90
+.LBB2_1: # %for.cond1.preheader.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_2 Depth 2
+ xorl %ecx, %ecx # column counter = 0
+ .align 16, 0x90
+.LBB2_2: # %for.body3.i
+ # Parent Loop BB2_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %r8d, %edx # edx = row * column
+ movl %edx, %esi
+ sarl $31, %esi
+ shrl $22, %esi # esi = 1023 if the product is negative, else 0
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi
+ movq %r8, %rax
+ shlq $11, %rax
+ leal 1(%rdx,%rsi), %edi # edi = 1 + product % 1024
+ leaq (%rax,%rax,2), %rsi # rsi = row * 6144, the row's byte offset
+ leaq 1(%rcx), %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1
+ vmulsd %xmm0, %xmm1, %xmm1 # scale by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1
+ vmovss %xmm1, A(%rsi,%rcx,4)
+ vmovss %xmm1, B(%rsi,%rcx,4)
+ movq %rdx, %rcx
+ jne .LBB2_2
+# BB#3: # %for.inc17.i
+ # in Loop: Header=BB2_1 Depth=1
+ incq %r8
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB2_1
+# BB#4:
+ xorl %r8d, %r8d # i = 0
+ movl $A, %r9d # r9 = &A[i][0]
+ .align 16, 0x90
+.LBB2_5: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_6 Depth 2
+ # Child Loop BB2_7 Depth 3
+ leaq (%r8,%r8,2), %rdx
+ shlq $11, %rdx # rdx = i * 6144, byte offset of row i
+ leaq C(%rdx), %rsi # rsi = &C[i][0]
+ xorl %edi, %edi # j = 0
+ .align 16, 0x90
+.LBB2_6: # %for.body3
+ # Parent Loop BB2_5 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB2_7 Depth 3
+ movl $0, (%rsi) # C[i][j] = 0
+ vxorps %xmm0, %xmm0, %xmm0 # running sum = 0
+ movq $-9437184, %rax # imm = 0xFFFFFFFFFF700000
+ movq %r9, %rcx # rcx walks A[i][k]
+ .align 16, 0x90
+.LBB2_7: # %for.body8
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_6 Depth=2
+ # => This Inner Loop Header: Depth=3
+ vmovss (%rcx), %xmm1
+ vmulss B+9437184(%rax,%rdi,4), %xmm1, %xmm1 # times B[k][j]; rax counts up from -sizeof(B) to 0
+ vaddss %xmm1, %xmm0, %xmm0
+ addq $4, %rcx
+ addq $6144, %rax # imm = 0x1800
+ jne .LBB2_7
+# BB#8: # %for.inc25
+ # in Loop: Header=BB2_6 Depth=2
+ vmovss %xmm0, (%rsi) # store the finished sum to C[i][j]
+ leaq C+4(%rdx,%rdi,4), %rsi # rsi = &C[i][j+1]
+ incq %rdi
+ cmpq $1536, %rdi # imm = 0x600
+ jne .LBB2_6
+# BB#9: # %for.inc28
+ # in Loop: Header=BB2_5 Depth=1
+ addq $6144, %r9 # imm = 0x1800
+ incq %r8
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB2_5
+# BB#10: # %for.end30
+ xorl %eax, %eax # return 0
+ popq %rbp
+ ret
+.Ltmp22:
+ .size main, .Ltmp22-main
+ .cfi_endproc
+
+ .type A,@object # @A
+ .comm A,9437184,16
+ .type B,@object # @B
+ .comm B,9437184,16
+ .type .L.str,@object # @.str
+ .section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+ .asciz "%lf "
+ .size .L.str, 5
+
+ .type C,@object # @C
+ .comm C,9437184,16
+
+ .section ".note.GNU-stack","",@progbits
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe
new file mode 100755
index 000000000000..feb24366d730
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.exe
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
new file mode 100644
index 000000000000..593794ef380b
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.ll
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s
new file mode 100644
index 000000000000..ca87de11704e
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s
@@ -0,0 +1,754 @@
+ .file "matmul.polly.interchanged+tiled+vector+openmp.ll"
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI0_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl init_array
+ .align 16, 0x90
+ .type init_array,@function
+init_array: # @init_array: starts a GOMP parallel loop on init_array.omp_subfn, then runs the same chunk loop inline before ending the parallel region
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp3:
+ .cfi_def_cfa_offset 16
+.Ltmp4:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp5:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %rbx
+ subq $24, %rsp
+.Ltmp6:
+ .cfi_offset %rbx, -40
+.Ltmp7:
+ .cfi_offset %r14, -32
+.Ltmp8:
+ .cfi_offset %r15, -24
+ leaq -32(%rbp), %rsi
+ movl $init_array.omp_subfn, %edi # worker function handed to the runtime
+ xorl %edx, %edx
+ xorl %ecx, %ecx
+ movl $1536, %r8d # imm = 0x600
+ movl $1, %r9d
+ callq GOMP_parallel_loop_runtime_start
+ leaq -40(%rbp), %rdi # rdi = &chunk_start
+ leaq -48(%rbp), %rsi # rsi = &chunk_end
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ je .LBB0_4 # no chunk handed out: go straight to the end calls
+# BB#1:
+ leaq -40(%rbp), %r14
+ leaq -48(%rbp), %r15
+ vmovsd .LCPI0_0(%rip), %xmm1 # xmm1 = 0.5
+ .align 16, 0x90
+.LBB0_2: # %omp.loadIVBounds.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB0_8 Depth 2
+ # Child Loop BB0_5 Depth 3
+ movq -48(%rbp), %r8 # r8 = chunk_end
+ leaq -1(%r8), %rcx
+ movq -40(%rbp), %rax # rax = chunk_start, used as the row index
+ cmpq %rcx, %rax
+ jg .LBB0_3 # empty chunk (start > end - 1): ask for the next one
+# BB#7: # %polly.loop_preheader4.preheader.i
+ # in Loop: Header=BB0_2 Depth=1
+ addq $-2, %r8 # r8 = chunk_end - 2, bound for the row loop test below
+ .align 16, 0x90
+.LBB0_8: # %polly.loop_preheader4.i
+ # Parent Loop BB0_2 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB0_5 Depth 3
+ xorl %edx, %edx # column counter = 0
+ .align 16, 0x90
+.LBB0_5: # %polly.loop_header3.i
+ # Parent Loop BB0_2 Depth=1
+ # Parent Loop BB0_8 Depth=2
+ # => This Inner Loop Header: Depth=3
+ movl %edx, %esi
+ imull %eax, %esi # esi = row * column
+ movl %esi, %edi
+ sarl $31, %edi
+ shrl $22, %edi # edi = 1023 if the product is negative, else 0
+ addl %esi, %edi
+ andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00
+ negl %edi
+ movq %rax, %rcx
+ shlq $11, %rcx
+ leal 1(%rsi,%rdi), %ebx # ebx = 1 + product % 1024
+ leaq (%rcx,%rcx,2), %rdi # rdi = row * 6144, the row's byte offset
+ leaq 1(%rdx), %rsi
+ cmpq $1536, %rsi # imm = 0x600
+ vcvtsi2sdl %ebx, %xmm0, %xmm0
+ vmulsd %xmm1, %xmm0, %xmm0 # scale by 0.5
+ vcvtsd2ss %xmm0, %xmm0, %xmm0
+ vmovss %xmm0, A(%rdi,%rdx,4)
+ vmovss %xmm0, B(%rdi,%rdx,4)
+ movq %rsi, %rdx
+ jne .LBB0_5
+# BB#6: # %polly.loop_exit5.i
+ # in Loop: Header=BB0_8 Depth=2
+ cmpq %r8, %rax
+ leaq 1(%rax), %rax # next row; lea leaves the flags from cmpq intact
+ jle .LBB0_8
+.LBB0_3: # %omp.checkNext.backedge.i
+ # in Loop: Header=BB0_2 Depth=1
+ movq %r14, %rdi
+ movq %r15, %rsi
+ callq GOMP_loop_runtime_next
+ vmovsd .LCPI0_0(%rip), %xmm1 # reload 0.5 after the call
+ testb %al, %al
+ jne .LBB0_2
+.LBB0_4: # %init_array.omp_subfn.exit
+ callq GOMP_loop_end_nowait
+ callq GOMP_parallel_end
+ addq $24, %rsp
+ popq %rbx
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp9:
+ .size init_array, .Ltmp9-init_array
+ .cfi_endproc
+
+ .globl print_array
+ .align 16, 0x90
+ .type print_array,@function
+print_array: # @print_array: fprintf(stdout, .L.str, x) for every element of C; fputc('\n') after each 80th column and after each row
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp13:
+ .cfi_def_cfa_offset 16
+.Ltmp14:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp15:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r12
+ pushq %rbx
+.Ltmp16:
+ .cfi_offset %rbx, -48
+.Ltmp17:
+ .cfi_offset %r12, -40
+.Ltmp18:
+ .cfi_offset %r14, -32
+.Ltmp19:
+ .cfi_offset %r15, -24
+ xorl %r14d, %r14d # row counter = 0
+ movl $C, %r15d # r15 = address of the current row of C
+ .align 16, 0x90
+.LBB1_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB1_2 Depth 2
+ movq stdout(%rip), %rax
+ movq %r15, %r12 # r12 walks the elements of the current row
+ xorl %ebx, %ebx # column counter = 0
+ .align 16, 0x90
+.LBB1_2: # %for.body3
+ # Parent Loop BB1_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ vmovss (%r12), %xmm0
+ vcvtss2sd %xmm0, %xmm0, %xmm0 # float -> double for the variadic call
+ movq %rax, %rdi
+ movl $.L.str, %esi
+ movb $1, %al # one vector-register argument (SysV variadic convention)
+ callq fprintf
+ movslq %ebx, %rax
+ imulq $1717986919, %rax, %rcx # imm = 0x66666667
+ movq %rcx, %rdx
+ shrq $63, %rdx
+ sarq $37, %rcx
+ addl %edx, %ecx # ecx = column / 80 (multiply-and-shift division)
+ imull $80, %ecx, %ecx
+ subl %ecx, %eax # eax = column % 80
+ cmpl $79, %eax
+ jne .LBB1_4
+# BB#3: # %if.then
+ # in Loop: Header=BB1_2 Depth=2
+ movq stdout(%rip), %rsi
+ movl $10, %edi # '\n'
+ callq fputc
+.LBB1_4: # %for.inc
+ # in Loop: Header=BB1_2 Depth=2
+ addq $4, %r12
+ incq %rbx
+ movq stdout(%rip), %rax
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB1_2
+# BB#5: # %for.end
+ # in Loop: Header=BB1_1 Depth=1
+ movl $10, %edi # '\n' terminating the row
+ movq %rax, %rsi
+ callq fputc
+ addq $6144, %r15 # imm = 0x1800
+ incq %r14
+ cmpq $1536, %r14 # imm = 0x600
+ jne .LBB1_1
+# BB#6: # %for.end12
+ popq %rbx
+ popq %r12
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp20:
+ .size print_array, .Ltmp20-print_array
+ .cfi_endproc
+
+ .globl main
+ .align 16, 0x90
+ .type main,@function
+main: # @main: calls init_array, zeroes C row by row inside one GOMP parallel loop, then runs main.omp_subfn1 inside a second one; returns 0
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp24:
+ .cfi_def_cfa_offset 16
+.Ltmp25:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp26:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ subq $24, %rsp
+.Ltmp27:
+ .cfi_offset %rbx, -56
+.Ltmp28:
+ .cfi_offset %r12, -48
+.Ltmp29:
+ .cfi_offset %r13, -40
+.Ltmp30:
+ .cfi_offset %r14, -32
+.Ltmp31:
+ .cfi_offset %r15, -24
+ callq init_array
+ leaq -48(%rbp), %rsi
+ movl $main.omp_subfn, %edi # worker function for the zeroing loop
+ xorl %edx, %edx
+ xorl %ecx, %ecx
+ movl $1536, %r8d # imm = 0x600
+ movl $1, %r9d
+ callq GOMP_parallel_loop_runtime_start
+ leaq -56(%rbp), %rdi # rdi = &chunk_start
+ leaq -64(%rbp), %rsi # rsi = &chunk_end
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ je .LBB2_4 # no chunk handed out
+# BB#1:
+ leaq -56(%rbp), %r14
+ leaq -64(%rbp), %r15
+ .align 16, 0x90
+.LBB2_2: # %omp.loadIVBounds.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_6 Depth 2
+ movq -64(%rbp), %r12 # r12 = chunk_end
+ leaq -1(%r12), %rcx
+ movq -56(%rbp), %rax # rax = chunk_start
+ cmpq %rcx, %rax
+ jg .LBB2_3 # empty chunk: ask for the next one
+# BB#5: # %polly.loop_preheader4.preheader.i
+ # in Loop: Header=BB2_2 Depth=1
+ addq $-2, %r12 # r12 = chunk_end - 2, loop bound
+ leaq (%rax,%rax,2), %rcx
+ leaq -1(%rax), %r13 # r13 = chunk_start - 1, incremented once per row
+ shlq $11, %rcx # rcx = chunk_start * 6144
+ leaq C(%rcx), %rbx # rbx = first row of C in this chunk
+ .align 16, 0x90
+.LBB2_6: # %polly.loop_preheader4.i
+ # Parent Loop BB2_2 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movq %rbx, %rdi
+ xorl %esi, %esi # fill byte 0
+ movl $6144, %edx # imm = 0x1800
+ callq memset
+ addq $6144, %rbx # imm = 0x1800
+ incq %r13
+ cmpq %r12, %r13
+ jle .LBB2_6
+.LBB2_3: # %omp.checkNext.backedge.i
+ # in Loop: Header=BB2_2 Depth=1
+ movq %r14, %rdi
+ movq %r15, %rsi
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ jne .LBB2_2
+.LBB2_4: # %main.omp_subfn.exit
+ callq GOMP_loop_end_nowait
+ callq GOMP_parallel_end
+ leaq -48(%rbp), %rbx
+ movl $main.omp_subfn1, %edi # worker function for the multiplication
+ movq %rbx, %rsi
+ xorl %edx, %edx
+ xorl %ecx, %ecx
+ movl $1536, %r8d # imm = 0x600
+ movl $64, %r9d # presumably the loop increment (64-row tiles) -- confirm against libgomp
+ callq GOMP_parallel_loop_runtime_start
+ movq %rbx, %rdi
+ callq main.omp_subfn1
+ callq GOMP_parallel_end
+ xorl %eax, %eax # return 0
+ addq $24, %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp32:
+ .size main, .Ltmp32-main
+ .cfi_endproc
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI3_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .align 16, 0x90
+ .type init_array.omp_subfn,@function
+init_array.omp_subfn: # @init_array.omp_subfn: pulls row chunks from GOMP_loop_runtime_next and stores (1 + (row*col) % 1024) * 0.5 into A and B for each row
+ .cfi_startproc
+# BB#0: # %omp.setup
+ pushq %rbp
+.Ltmp36:
+ .cfi_def_cfa_offset 16
+.Ltmp37:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp38:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %rbx
+ subq $24, %rsp
+.Ltmp39:
+ .cfi_offset %rbx, -40
+.Ltmp40:
+ .cfi_offset %r14, -32
+.Ltmp41:
+ .cfi_offset %r15, -24
+ leaq -32(%rbp), %rdi # rdi = &chunk_start
+ leaq -40(%rbp), %rsi # rsi = &chunk_end
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ je .LBB3_4 # no chunk handed out
+# BB#1:
+ leaq -32(%rbp), %r14
+ leaq -40(%rbp), %r15
+ vmovsd .LCPI3_0(%rip), %xmm1 # xmm1 = 0.5
+ .align 16, 0x90
+.LBB3_2: # %omp.loadIVBounds
+ # =>This Loop Header: Depth=1
+ # Child Loop BB3_8 Depth 2
+ # Child Loop BB3_5 Depth 3
+ movq -40(%rbp), %r8 # r8 = chunk_end
+ leaq -1(%r8), %rcx
+ movq -32(%rbp), %rax # rax = chunk_start, used as the row index
+ cmpq %rcx, %rax
+ jg .LBB3_3 # empty chunk: ask for the next one
+# BB#7: # %polly.loop_preheader4.preheader
+ # in Loop: Header=BB3_2 Depth=1
+ addq $-2, %r8 # r8 = chunk_end - 2, bound for the row loop test below
+ .align 16, 0x90
+.LBB3_8: # %polly.loop_preheader4
+ # Parent Loop BB3_2 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB3_5 Depth 3
+ xorl %edx, %edx # column counter = 0
+ .align 16, 0x90
+.LBB3_5: # %polly.loop_header3
+ # Parent Loop BB3_2 Depth=1
+ # Parent Loop BB3_8 Depth=2
+ # => This Inner Loop Header: Depth=3
+ movl %edx, %esi
+ imull %eax, %esi # esi = row * column
+ movl %esi, %edi
+ sarl $31, %edi
+ shrl $22, %edi # edi = 1023 if the product is negative, else 0
+ addl %esi, %edi
+ andl $-1024, %edi # imm = 0xFFFFFFFFFFFFFC00
+ negl %edi
+ movq %rax, %rcx
+ shlq $11, %rcx
+ leal 1(%rsi,%rdi), %ebx # ebx = 1 + product % 1024
+ leaq (%rcx,%rcx,2), %rdi # rdi = row * 6144, the row's byte offset
+ leaq 1(%rdx), %rsi
+ cmpq $1536, %rsi # imm = 0x600
+ vcvtsi2sdl %ebx, %xmm0, %xmm0
+ vmulsd %xmm1, %xmm0, %xmm0 # scale by 0.5
+ vcvtsd2ss %xmm0, %xmm0, %xmm0
+ vmovss %xmm0, A(%rdi,%rdx,4)
+ vmovss %xmm0, B(%rdi,%rdx,4)
+ movq %rsi, %rdx
+ jne .LBB3_5
+# BB#6: # %polly.loop_exit5
+ # in Loop: Header=BB3_8 Depth=2
+ cmpq %r8, %rax
+ leaq 1(%rax), %rax # next row; lea leaves the flags from cmpq intact
+ jle .LBB3_8
+.LBB3_3: # %omp.checkNext.backedge
+ # in Loop: Header=BB3_2 Depth=1
+ movq %r14, %rdi
+ movq %r15, %rsi
+ callq GOMP_loop_runtime_next
+ vmovsd .LCPI3_0(%rip), %xmm1 # reload 0.5 after the call
+ testb %al, %al
+ jne .LBB3_2
+.LBB3_4: # %omp.exit
+ callq GOMP_loop_end_nowait
+ addq $24, %rsp
+ popq %rbx
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp42:
+ .size init_array.omp_subfn, .Ltmp42-init_array.omp_subfn
+ .cfi_endproc
+
+ .align 16, 0x90
+ .type main.omp_subfn,@function
+main.omp_subfn: # @main.omp_subfn: pulls row chunks from GOMP_loop_runtime_next and memsets each 6144-byte row of C in the chunk to zero
+ .cfi_startproc
+# BB#0: # %omp.setup
+ pushq %rbp
+.Ltmp46:
+ .cfi_def_cfa_offset 16
+.Ltmp47:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp48:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ subq $24, %rsp
+.Ltmp49:
+ .cfi_offset %rbx, -56
+.Ltmp50:
+ .cfi_offset %r12, -48
+.Ltmp51:
+ .cfi_offset %r13, -40
+.Ltmp52:
+ .cfi_offset %r14, -32
+.Ltmp53:
+ .cfi_offset %r15, -24
+ leaq -48(%rbp), %rdi # rdi = &chunk_start
+ leaq -56(%rbp), %rsi # rsi = &chunk_end
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ je .LBB4_4 # no chunk handed out
+# BB#1:
+ leaq -48(%rbp), %r14
+ leaq -56(%rbp), %r15
+ .align 16, 0x90
+.LBB4_2: # %omp.loadIVBounds
+ # =>This Loop Header: Depth=1
+ # Child Loop BB4_6 Depth 2
+ movq -56(%rbp), %r12 # r12 = chunk_end
+ leaq -1(%r12), %rcx
+ movq -48(%rbp), %rax # rax = chunk_start
+ cmpq %rcx, %rax
+ jg .LBB4_3 # empty chunk: ask for the next one
+# BB#5: # %polly.loop_preheader4.preheader
+ # in Loop: Header=BB4_2 Depth=1
+ addq $-2, %r12 # r12 = chunk_end - 2, loop bound
+ leaq (%rax,%rax,2), %rcx
+ leaq -1(%rax), %r13 # r13 = chunk_start - 1, incremented once per row
+ shlq $11, %rcx # rcx = chunk_start * 6144
+ leaq C(%rcx), %rbx # rbx = first row of C in this chunk
+ .align 16, 0x90
+.LBB4_6: # %polly.loop_preheader4
+ # Parent Loop BB4_2 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movq %rbx, %rdi
+ xorl %esi, %esi # fill byte 0
+ movl $6144, %edx # imm = 0x1800
+ callq memset
+ addq $6144, %rbx # imm = 0x1800
+ incq %r13
+ cmpq %r12, %r13
+ jle .LBB4_6
+.LBB4_3: # %omp.checkNext.backedge
+ # in Loop: Header=BB4_2 Depth=1
+ movq %r14, %rdi
+ movq %r15, %rsi
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ jne .LBB4_2
+.LBB4_4: # %omp.exit
+ callq GOMP_loop_end_nowait
+ addq $24, %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp54:
+ .size main.omp_subfn, .Ltmp54-main.omp_subfn
+ .cfi_endproc
+
+ .align 16, 0x90
+ .type main.omp_subfn1,@function
+main.omp_subfn1: # @main.omp_subfn1: for each chunk from GOMP_loop_runtime_next, runs a loop nest stepping by 64 whose innermost loop does C[4 floats] += broadcast(A element) * B[4 floats]
+ .cfi_startproc
+# BB#0: # %omp.setup
+ pushq %rbp
+.Ltmp58:
+ .cfi_def_cfa_offset 16
+.Ltmp59:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp60:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ subq $72, %rsp
+.Ltmp61:
+ .cfi_offset %rbx, -56
+.Ltmp62:
+ .cfi_offset %r12, -48
+.Ltmp63:
+ .cfi_offset %r13, -40
+.Ltmp64:
+ .cfi_offset %r14, -32
+.Ltmp65:
+ .cfi_offset %r15, -24
+ jmp .LBB5_1 # enter at the chunk-fetch block at the bottom
+ .align 16, 0x90
+.LBB5_2: # %omp.loadIVBounds
+ # in Loop: Header=BB5_1 Depth=1
+ movq -56(%rbp), %rax # rax = chunk_end
+ movq %rax, -112(%rbp) # 8-byte Spill
+ leaq -1(%rax), %rax
+ movq -48(%rbp), %rcx # rcx = chunk_start
+ cmpq %rax, %rcx
+ jg .LBB5_1 # empty chunk: ask for the next one
+# BB#3: # %polly.loop_preheader4.preheader
+ # in Loop: Header=BB5_1 Depth=1
+ leaq -1(%rcx), %rax
+ movq %rax, -88(%rbp) # 8-byte Spill
+ addq $-65, -112(%rbp) # 8-byte Folded Spill
+ movq %rcx, %rax
+ shlq $9, %rax
+ leaq (%rax,%rax,2), %rax
+ leaq C+16(,%rax,4), %rax # rax = C + 16 + chunk_start * 6144
+ movq %rax, -104(%rbp) # 8-byte Spill
+ .align 16, 0x90
+.LBB5_7: # %polly.loop_preheader4
+ # Parent Loop BB5_1 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB5_8 Depth 3
+ # Child Loop BB5_9 Depth 4
+ # Child Loop BB5_12 Depth 5
+ # Child Loop BB5_17 Depth 6
+ # Child Loop BB5_18 Depth 7
+ # Child Loop BB5_14 Depth 5
+ movq %rcx, -72(%rbp) # 8-byte Spill
+ leaq 62(%rcx), %rdi # rdi = tile start + 62, bound for the BB5_12 / BB5_14 loop tests
+ xorl %edx, %edx # BB5_8 loop counter = 0
+ .align 16, 0x90
+.LBB5_8: # %polly.loop_preheader11
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # => This Loop Header: Depth=3
+ # Child Loop BB5_9 Depth 4
+ # Child Loop BB5_12 Depth 5
+ # Child Loop BB5_17 Depth 6
+ # Child Loop BB5_18 Depth 7
+ # Child Loop BB5_14 Depth 5
+ movq %rdx, -96(%rbp) # 8-byte Spill
+ leaq -4(%rdx), %rcx
+ movq %rdx, %rax
+ decq %rax
+ cmovsq %rcx, %rax
+ movq %rax, %r14
+ sarq $63, %r14
+ shrq $62, %r14
+ addq %rax, %r14
+ andq $-4, %r14
+ movq %rdx, %rax
+ orq $63, %rax
+ leaq -4(%rax), %rdx
+ movq -104(%rbp), %rcx # 8-byte Reload
+ leaq (%rcx,%r14,4), %rcx
+ movq %rcx, -80(%rbp) # 8-byte Spill
+ leaq B+16(,%r14,4), %rbx
+ leaq 4(%r14), %rcx
+ movq %rcx, -64(%rbp) # 8-byte Spill
+ xorl %r11d, %r11d # BB5_9 loop counter = 0
+ .align 16, 0x90
+.LBB5_9: # %polly.loop_header10
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # Parent Loop BB5_8 Depth=3
+ # => This Loop Header: Depth=4
+ # Child Loop BB5_12 Depth 5
+ # Child Loop BB5_17 Depth 6
+ # Child Loop BB5_18 Depth 7
+ # Child Loop BB5_14 Depth 5
+ movabsq $9223372036854775744, %rcx # imm = 0x7FFFFFFFFFFFFFC0
+ cmpq %rcx, -72(%rbp) # 8-byte Folded Reload
+ jg .LBB5_15 # NOTE(review): looks like an overflow guard for tile start + 63 -- confirm
+# BB#10: # %polly.loop_header17.preheader
+ # in Loop: Header=BB5_9 Depth=4
+ movq %r11, %r15
+ orq $63, %r15
+ cmpq %r15, %r11
+ movq -88(%rbp), %rcx # 8-byte Reload
+ jle .LBB5_11 # taken when r11 <= (r11 | 63)
+ .align 16, 0x90
+.LBB5_14: # %polly.loop_exit28.us
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # Parent Loop BB5_8 Depth=3
+ # Parent Loop BB5_9 Depth=4
+ # => This Inner Loop Header: Depth=5
+ incq %rcx
+ cmpq %rdi, %rcx
+ jle .LBB5_14
+ jmp .LBB5_15
+ .align 16, 0x90
+.LBB5_11: # in Loop: Header=BB5_9 Depth=4
+ decq %r15
+ movq -80(%rbp), %r13 # 8-byte Reload
+ movq -72(%rbp), %rcx # 8-byte Reload
+ .align 16, 0x90
+.LBB5_12: # %polly.loop_header26.preheader
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # Parent Loop BB5_8 Depth=3
+ # Parent Loop BB5_9 Depth=4
+ # => This Loop Header: Depth=5
+ # Child Loop BB5_17 Depth 6
+ # Child Loop BB5_18 Depth 7
+ cmpq %rax, -64(%rbp) # 8-byte Folded Reload
+ movq %rbx, %r12
+ movq %r11, %r8
+ jg .LBB5_13
+ .align 16, 0x90
+.LBB5_17: # %polly.loop_header35.preheader
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # Parent Loop BB5_8 Depth=3
+ # Parent Loop BB5_9 Depth=4
+ # Parent Loop BB5_12 Depth=5
+ # => This Loop Header: Depth=6
+ # Child Loop BB5_18 Depth 7
+ leaq (%rcx,%rcx,2), %rsi
+ shlq $11, %rsi # rsi = rcx * 6144, byte offset of row rcx
+ vbroadcastss A(%rsi,%r8,4), %xmm0 # copy A[rcx][r8] into every lane of xmm0
+ movq %r13, %r9
+ movq %r12, %r10
+ movq %r14, %rsi
+.LBB5_18: # %polly.loop_header35
+ # Parent Loop BB5_1 Depth=1
+ # Parent Loop BB5_7 Depth=2
+ # Parent Loop BB5_8 Depth=3
+ # Parent Loop BB5_9 Depth=4
+ # Parent Loop BB5_12 Depth=5
+ # Parent Loop BB5_17 Depth=6
+ # => This Inner Loop Header: Depth=7
+ vmulps (%r10), %xmm0, %xmm1 # 4 floats at r10 (derived from B) times the broadcast value
+ vaddps (%r9), %xmm1, %xmm1 # plus 4 floats at r9 (derived from C)
+ vmovaps %xmm1, (%r9) # aligned store back to the same 16 bytes
+ addq $16, %r9
+ addq $16, %r10
+ addq $4, %rsi # four elements handled per iteration
+ cmpq %rdx, %rsi
+ jle .LBB5_18
+# BB#16: # %polly.loop_exit37
+ # in Loop: Header=BB5_17 Depth=6
+ addq $6144, %r12 # imm = 0x1800
+ cmpq %r15, %r8
+ leaq 1(%r8), %r8 # lea leaves the flags from cmpq intact
+ jle .LBB5_17
+ .align 16, 0x90
+.LBB5_13: # %polly.loop_exit28
+ # in Loop: Header=BB5_12 Depth=5
+ addq $6144, %r13 # imm = 0x1800
+ cmpq %rdi, %rcx
+ leaq 1(%rcx), %rcx # lea leaves the flags from cmpq intact
+ jle .LBB5_12
+ .align 16, 0x90
+.LBB5_15: # %polly.loop_exit19
+ # in Loop: Header=BB5_9 Depth=4
+ addq $393216, %rbx # imm = 0x60000
+ cmpq $1472, %r11 # imm = 0x5C0
+ leaq 64(%r11), %r11 # step by 64
+ jl .LBB5_9
+# BB#5: # %polly.loop_exit12
+ # in Loop: Header=BB5_8 Depth=3
+ movq -96(%rbp), %rdx # 8-byte Reload
+ cmpq $1472, %rdx # imm = 0x5C0
+ leaq 64(%rdx), %rdx # step by 64
+ jl .LBB5_8
+# BB#6: # %polly.loop_exit5
+ # in Loop: Header=BB5_7 Depth=2
+ addq $64, -88(%rbp) # 8-byte Folded Spill
+ addq $393216, -104(%rbp) # 8-byte Folded Spill
+ # imm = 0x60000
+ movq -72(%rbp), %rcx # 8-byte Reload
+ cmpq -112(%rbp), %rcx # 8-byte Folded Reload
+ leaq 64(%rcx), %rcx # step by 64
+ jle .LBB5_7
+.LBB5_1: # %omp.setup
+ # =>This Loop Header: Depth=1
+ # Child Loop BB5_7 Depth 2
+ # Child Loop BB5_8 Depth 3
+ # Child Loop BB5_9 Depth 4
+ # Child Loop BB5_12 Depth 5
+ # Child Loop BB5_17 Depth 6
+ # Child Loop BB5_18 Depth 7
+ # Child Loop BB5_14 Depth 5
+ leaq -48(%rbp), %rdi # rdi = &chunk_start
+ leaq -56(%rbp), %rsi # rsi = &chunk_end
+ callq GOMP_loop_runtime_next
+ testb %al, %al
+ jne .LBB5_2 # a chunk was handed out: process it
+# BB#4: # %omp.exit
+ callq GOMP_loop_end_nowait
+ addq $72, %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp66:
+ .size main.omp_subfn1, .Ltmp66-main.omp_subfn1
+ .cfi_endproc
+
+ .type A,@object # @A
+ .comm A,9437184,16
+ .type B,@object # @B
+ .comm B,9437184,16
+ .type .L.str,@object # @.str
+ .section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+ .asciz "%lf "
+ .size .L.str, 5
+
+ .type C,@object # @C
+ .comm C,9437184,16
+
+ .section ".note.GNU-stack","",@progbits
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe
new file mode 100755
index 000000000000..36b788ea9ac3
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.exe
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
new file mode 100644
index 000000000000..9d1f9ad098f9
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.ll
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s
new file mode 100644
index 000000000000..485d230bc398
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector.s
@@ -0,0 +1,396 @@
+ .file "matmul.polly.interchanged+tiled+vector.ll"
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI0_0:
+ .quad 4602678819172646912 # double 0.5 (scale factor applied to every initial value)
+ .text
+ .globl init_array
+ .align 16, 0x90
+ .type init_array,@function
+init_array: # @init_array: stores ((i*j) % 1024 + 1) * 0.5, narrowed to float, into A[i][j] and B[i][j] for 0 <= i,j < 1536
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp2:
+ .cfi_def_cfa_offset 16
+.Ltmp3:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp4:
+ .cfi_def_cfa_register %rbp
+ xorl %r8d, %r8d # row index i = 0
+ vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = 0.5, kept live across both loops
+ .align 16, 0x90
+.LBB0_1: # %polly.loop_preheader3
+ # =>This Loop Header: Depth=1
+ # Child Loop BB0_2 Depth 2
+ xorl %ecx, %ecx # column index j = 0
+ .align 16, 0x90
+.LBB0_2: # %polly.loop_header2
+ # Parent Loop BB0_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %r8d, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi # all ones if the product is negative, else 0
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi # esi = -(product rounded toward zero to a multiple of 1024)
+ movq %r8, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1 (signed remainder)
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx # rdx = j + 1
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1 # int -> double
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1 # double -> float
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j] = value
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j] = same value
+ movq %rdx, %rcx # j = j + 1
+ jne .LBB0_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4
+ # in Loop: Header=BB0_1 Depth=1
+ incq %r8 # next row
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB0_1
+# BB#4: # %polly.loop_exit
+ popq %rbp
+ ret
+.Ltmp5:
+ .size init_array, .Ltmp5-init_array
+ .cfi_endproc
+
+ .globl print_array
+ .align 16, 0x90
+ .type print_array,@function
+print_array: # @print_array: writes every element of C to stdout with "%lf ", a newline after each column whose index % 80 == 79 and a newline after each row
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp9:
+ .cfi_def_cfa_offset 16
+.Ltmp10:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp11:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r12
+ pushq %rbx
+.Ltmp12:
+ .cfi_offset %rbx, -48
+.Ltmp13:
+ .cfi_offset %r12, -40
+.Ltmp14:
+ .cfi_offset %r14, -32
+.Ltmp15:
+ .cfi_offset %r15, -24
+ xorl %r14d, %r14d # row counter = 0
+ movl $C, %r15d # r15 = address of the current row of C
+ .align 16, 0x90
+.LBB1_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB1_2 Depth 2
+ movq stdout(%rip), %rax # stream for the first fprintf of this row
+ movq %r15, %r12 # r12 = pointer to the current element
+ xorl %ebx, %ebx # column counter j = 0
+ .align 16, 0x90
+.LBB1_2: # %for.body3
+ # Parent Loop BB1_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ vmovss (%r12), %xmm0 # load the float element
+ vcvtss2sd %xmm0, %xmm0, %xmm0 # widen to double for the variadic call
+ movq %rax, %rdi # arg 1: stream
+ movl $.L.str, %esi # arg 2: format "%lf "
+ movb $1, %al # one vector register carries a variadic argument
+ callq fprintf
+ movslq %ebx, %rax
+ imulq $1717986919, %rax, %rcx # imm = 0x66666667
+ movq %rcx, %rdx
+ shrq $63, %rdx # sign bit of the product
+ sarq $37, %rcx # 0x66666667 / 2^37 = 1/80, so this is j / 80
+ addl %edx, %ecx # adjust the quotient to round toward zero
+ imull $80, %ecx, %ecx
+ subl %ecx, %eax # eax = j % 80
+ cmpl $79, %eax
+ jne .LBB1_4
+# BB#3: # %if.then
+ # in Loop: Header=BB1_2 Depth=2
+ movq stdout(%rip), %rsi
+ movl $10, %edi # '\n'
+ callq fputc
+.LBB1_4: # %for.inc
+ # in Loop: Header=BB1_2 Depth=2
+ addq $4, %r12 # advance to the next float
+ incq %rbx
+ movq stdout(%rip), %rax # reload stdout for the next call
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB1_2
+# BB#5: # %for.end
+ # in Loop: Header=BB1_1 Depth=1
+ movl $10, %edi # '\n' terminating the row
+ movq %rax, %rsi
+ callq fputc
+ addq $6144, %r15 # imm = 0x1800
+ incq %r14
+ cmpq $1536, %r14 # imm = 0x600
+ jne .LBB1_1
+# BB#6: # %for.end12
+ popq %rbx
+ popq %r12
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp16:
+ .size print_array, .Ltmp16-print_array
+ .cfi_endproc
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI2_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl main
+ .align 16, 0x90
+ .type main,@function
+main: # @main: initializes A and B (same code as init_array), zeroes C with memset, then accumulates C += A*B in tiles stepping by 64 with a 4-float vector inner loop; returns 0
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp20:
+ .cfi_def_cfa_offset 16
+.Ltmp21:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp22:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ subq $56, %rsp # room for the spill slots at -48..-88(%rbp)
+.Ltmp23:
+ .cfi_offset %rbx, -56
+.Ltmp24:
+ .cfi_offset %r12, -48
+.Ltmp25:
+ .cfi_offset %r13, -40
+.Ltmp26:
+ .cfi_offset %r14, -32
+.Ltmp27:
+ .cfi_offset %r15, -24
+ xorl %ebx, %ebx # init row index i = 0
+ vmovsd .LCPI2_0(%rip), %xmm0 # xmm0 = 0.5
+ .align 16, 0x90
+.LBB2_1: # %polly.loop_preheader3.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_2 Depth 2
+ xorl %ecx, %ecx # init column index j = 0
+ .align 16, 0x90
+.LBB2_2: # %polly.loop_header2.i
+ # Parent Loop BB2_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %ebx, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi
+ movq %rbx, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j]
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j]
+ movq %rdx, %rcx
+ jne .LBB2_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4.i
+ # in Loop: Header=BB2_1 Depth=1
+ incq %rbx
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB2_1
+# BB#4: # %polly.loop_preheader3.preheader
+ movl $C, %edi # memset destination: C
+ xorl %esi, %esi # fill byte 0
+ movl $9437184, %edx # imm = 0x900000
+ callq memset
+ xorl %esi, %esi # outermost tile start = 0
+ movl $C+16, %eax # C base pointer, biased by 16 bytes
+ movq %rax, -88(%rbp) # 8-byte Spill
+ .align 16, 0x90
+.LBB2_5: # %polly.loop_preheader17
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_15 Depth 2
+ # Child Loop BB2_8 Depth 3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ movq %rsi, -56(%rbp) # 8-byte Spill
+ movq %rsi, %rax
+ orq $63, %rax # rax = tile start | 63
+ movq %rax, -72(%rbp) # 8-byte Spill
+ leaq -1(%rax), %rax # bound compared against the row index in .LBB2_12
+ movq %rax, -48(%rbp) # 8-byte Spill
+ xorl %edx, %edx # second-level tile start = 0
+ .align 16, 0x90
+.LBB2_15: # %polly.loop_preheader24
+ # Parent Loop BB2_5 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB2_8 Depth 3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ movq %rdx, -80(%rbp) # 8-byte Spill
+ leaq -4(%rdx), %rcx
+ movq %rdx, %rax
+ decq %rax
+ cmovsq %rcx, %rax # if %rdx-1 is negative use %rdx-4 instead
+ movq %rax, %r15
+ sarq $63, %r15
+ shrq $62, %r15 # bias of 3 for negative values, 0 otherwise
+ addq %rax, %r15
+ andq $-4, %r15 # r15 = %rax rounded toward zero to a multiple of 4
+ movq %rdx, %r13
+ orq $63, %r13 # r13 = tile start | 63
+ leaq -4(%r13), %rdx # upper bound for the vector-loop counter
+ xorl %r10d, %r10d # third-level tile start = 0
+ movq -88(%rbp), %rax # 8-byte Reload
+ leaq (%rax,%r15,4), %rax # C pointer at the start of this column tile
+ movq %rax, -64(%rbp) # 8-byte Spill
+ leaq B+16(,%r15,4), %rbx # matching B pointer, same 16-byte bias
+ leaq 4(%r15), %r12
+ .align 16, 0x90
+.LBB2_8: # %polly.loop_header23
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # => This Loop Header: Depth=3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ cmpq -72(%rbp), %rsi # 8-byte Folded Reload
+ jg .LBB2_13 # skip the tile if its row range is empty
+# BB#9: # %polly.loop_header30.preheader
+ # in Loop: Header=BB2_8 Depth=3
+ movq %r10, %rax
+ orq $63, %rax # rax = third-level tile start | 63
+ cmpq %rax, %r10
+ jg .LBB2_13
+# BB#10: # in Loop: Header=BB2_8 Depth=3
+ decq %rax # bound compared against %rsi in BB#16
+ movq -64(%rbp), %r14 # 8-byte Reload
+ movq -56(%rbp), %r11 # 8-byte Reload
+ .align 16, 0x90
+.LBB2_11: # %polly.loop_header37.preheader
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # => This Loop Header: Depth=4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ cmpq %r13, %r12
+ movq %rbx, %r8 # r8 = B row pointer for this pass
+ movq %r10, %rsi # rsi = column index into A
+ jg .LBB2_12 # uses the flags of the cmpq above
+ .align 16, 0x90
+.LBB2_17: # %polly.loop_header46.preheader
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # Parent Loop BB2_11 Depth=4
+ # => This Loop Header: Depth=5
+ # Child Loop BB2_18 Depth 6
+ leaq (%r11,%r11,2), %rcx
+ shlq $11, %rcx # rcx = %r11 * 6144, the byte offset of row %r11
+ vbroadcastss A(%rcx,%rsi,4), %xmm0 # replicate A[%r11][%rsi] into all four lanes
+ movq %r14, %rdi # rdi walks the C row
+ movq %r8, %r9 # r9 walks the B row
+ movq %r15, %rcx # vector-loop counter
+.LBB2_18: # %polly.loop_header46
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # Parent Loop BB2_11 Depth=4
+ # Parent Loop BB2_17 Depth=5
+ # => This Inner Loop Header: Depth=6
+ vmulps (%r9), %xmm0, %xmm1 # four B elements times the broadcast A element
+ vaddps (%rdi), %xmm1, %xmm1 # add the four current C elements
+ vmovaps %xmm1, (%rdi) # store the four updated C elements
+ addq $16, %rdi
+ addq $16, %r9
+ addq $4, %rcx
+ cmpq %rdx, %rcx
+ jle .LBB2_18
+# BB#16: # %polly.loop_exit48
+ # in Loop: Header=BB2_17 Depth=5
+ addq $6144, %r8 # imm = 0x1800
+ cmpq %rax, %rsi
+ leaq 1(%rsi), %rsi # increment; leaq leaves the flags of the cmpq intact
+ jle .LBB2_17
+ .align 16, 0x90
+.LBB2_12: # %polly.loop_exit39
+ # in Loop: Header=BB2_11 Depth=4
+ addq $6144, %r14 # imm = 0x1800
+ cmpq -48(%rbp), %r11 # 8-byte Folded Reload
+ leaq 1(%r11), %r11 # next row; flags of the cmpq are preserved
+ jle .LBB2_11
+ .align 16, 0x90
+.LBB2_13: # %polly.loop_exit32
+ # in Loop: Header=BB2_8 Depth=3
+ addq $393216, %rbx # imm = 0x60000
+ cmpq $1472, %r10 # imm = 0x5C0
+ leaq 64(%r10), %r10 # advance the third-level tile by 64
+ movq -56(%rbp), %rsi # 8-byte Reload
+ jl .LBB2_8
+# BB#14: # %polly.loop_exit25
+ # in Loop: Header=BB2_15 Depth=2
+ movq -80(%rbp), %rdx # 8-byte Reload
+ cmpq $1472, %rdx # imm = 0x5C0
+ leaq 64(%rdx), %rdx # advance the second-level tile by 64
+ jl .LBB2_15
+# BB#6: # %polly.loop_exit18
+ # in Loop: Header=BB2_5 Depth=1
+ addq $393216, -88(%rbp) # 8-byte Folded Spill
+ # imm = 0x60000
+ cmpq $1472, %rsi # imm = 0x5C0
+ leaq 64(%rsi), %rsi # advance the outermost tile by 64
+ jl .LBB2_5
+# BB#7: # %polly.loop_exit11
+ xorl %eax, %eax # return value 0
+ addq $56, %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp28:
+ .size main, .Ltmp28-main
+ .cfi_endproc
+
+ .type A,@object # @A
+ .comm A,9437184,16 # 9437184 bytes = 1536*1536*4, 16-byte aligned
+ .type B,@object # @B
+ .comm B,9437184,16 # same size and alignment as A
+ .type .L.str,@object # @.str
+ .section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+ .asciz "%lf " # format string passed to fprintf in print_array
+ .size .L.str, 5 # 4 characters plus the terminating NUL
+
+ .type C,@object # @C
+ .comm C,9437184,16 # same size and alignment as A and B
+
+ .section ".note.GNU-stack","",@progbits
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe
new file mode 100755
index 000000000000..fbd8b128fd88
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.exe
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
new file mode 100644
index 000000000000..acdd95f3bc4c
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.ll
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s
new file mode 100644
index 000000000000..f7ab7fdd59cc
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled.s
@@ -0,0 +1,390 @@
+ .file "matmul.polly.interchanged+tiled.ll"
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI0_0:
+ .quad 4602678819172646912 # double 0.5 (scale factor applied to every initial value)
+ .text
+ .globl init_array
+ .align 16, 0x90
+ .type init_array,@function
+init_array: # @init_array: stores ((i*j) % 1024 + 1) * 0.5, narrowed to float, into A[i][j] and B[i][j] for 0 <= i,j < 1536
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp2:
+ .cfi_def_cfa_offset 16
+.Ltmp3:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp4:
+ .cfi_def_cfa_register %rbp
+ xorl %r8d, %r8d # row index i = 0
+ vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = 0.5, kept live across both loops
+ .align 16, 0x90
+.LBB0_1: # %polly.loop_preheader3
+ # =>This Loop Header: Depth=1
+ # Child Loop BB0_2 Depth 2
+ xorl %ecx, %ecx # column index j = 0
+ .align 16, 0x90
+.LBB0_2: # %polly.loop_header2
+ # Parent Loop BB0_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %r8d, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi # all ones if the product is negative, else 0
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi # esi = -(product rounded toward zero to a multiple of 1024)
+ movq %r8, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1 (signed remainder)
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx # rdx = j + 1
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1 # int -> double
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1 # double -> float
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j] = value
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j] = same value
+ movq %rdx, %rcx # j = j + 1
+ jne .LBB0_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4
+ # in Loop: Header=BB0_1 Depth=1
+ incq %r8 # next row
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB0_1
+# BB#4: # %polly.loop_exit
+ popq %rbp
+ ret
+.Ltmp5:
+ .size init_array, .Ltmp5-init_array
+ .cfi_endproc
+
+ .globl print_array
+ .align 16, 0x90
+ .type print_array,@function
+print_array: # @print_array: writes every element of C to stdout with "%lf ", a newline after each column whose index % 80 == 79 and a newline after each row
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp9:
+ .cfi_def_cfa_offset 16
+.Ltmp10:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp11:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r12
+ pushq %rbx
+.Ltmp12:
+ .cfi_offset %rbx, -48
+.Ltmp13:
+ .cfi_offset %r12, -40
+.Ltmp14:
+ .cfi_offset %r14, -32
+.Ltmp15:
+ .cfi_offset %r15, -24
+ xorl %r14d, %r14d # row counter = 0
+ movl $C, %r15d # r15 = address of the current row of C
+ .align 16, 0x90
+.LBB1_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB1_2 Depth 2
+ movq stdout(%rip), %rax # stream for the first fprintf of this row
+ movq %r15, %r12 # r12 = pointer to the current element
+ xorl %ebx, %ebx # column counter j = 0
+ .align 16, 0x90
+.LBB1_2: # %for.body3
+ # Parent Loop BB1_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ vmovss (%r12), %xmm0 # load the float element
+ vcvtss2sd %xmm0, %xmm0, %xmm0 # widen to double for the variadic call
+ movq %rax, %rdi # arg 1: stream
+ movl $.L.str, %esi # arg 2: format "%lf "
+ movb $1, %al # one vector register carries a variadic argument
+ callq fprintf
+ movslq %ebx, %rax
+ imulq $1717986919, %rax, %rcx # imm = 0x66666667
+ movq %rcx, %rdx
+ shrq $63, %rdx # sign bit of the product
+ sarq $37, %rcx # 0x66666667 / 2^37 = 1/80, so this is j / 80
+ addl %edx, %ecx # adjust the quotient to round toward zero
+ imull $80, %ecx, %ecx
+ subl %ecx, %eax # eax = j % 80
+ cmpl $79, %eax
+ jne .LBB1_4
+# BB#3: # %if.then
+ # in Loop: Header=BB1_2 Depth=2
+ movq stdout(%rip), %rsi
+ movl $10, %edi # '\n'
+ callq fputc
+.LBB1_4: # %for.inc
+ # in Loop: Header=BB1_2 Depth=2
+ addq $4, %r12 # advance to the next float
+ incq %rbx
+ movq stdout(%rip), %rax # reload stdout for the next call
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB1_2
+# BB#5: # %for.end
+ # in Loop: Header=BB1_1 Depth=1
+ movl $10, %edi # '\n' terminating the row
+ movq %rax, %rsi
+ callq fputc
+ addq $6144, %r15 # imm = 0x1800
+ incq %r14
+ cmpq $1536, %r14 # imm = 0x600
+ jne .LBB1_1
+# BB#6: # %for.end12
+ popq %rbx
+ popq %r12
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp16:
+ .size print_array, .Ltmp16-print_array
+ .cfi_endproc
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI2_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl main
+ .align 16, 0x90
+ .type main,@function
+main: # @main: initializes A and B (same code as init_array), zeroes C with memset, then accumulates C += A*B in tiles stepping by 64 with a scalar inner loop; returns 0
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp20:
+ .cfi_def_cfa_offset 16
+.Ltmp21:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp22:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r13
+ pushq %r12
+ pushq %rbx
+ subq $56, %rsp # room for the spill slots at -48..-88(%rbp)
+.Ltmp23:
+ .cfi_offset %rbx, -56
+.Ltmp24:
+ .cfi_offset %r12, -48
+.Ltmp25:
+ .cfi_offset %r13, -40
+.Ltmp26:
+ .cfi_offset %r14, -32
+.Ltmp27:
+ .cfi_offset %r15, -24
+ xorl %ebx, %ebx # init row index i = 0
+ vmovsd .LCPI2_0(%rip), %xmm0 # xmm0 = 0.5
+ .align 16, 0x90
+.LBB2_1: # %polly.loop_preheader3.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_2 Depth 2
+ xorl %ecx, %ecx # init column index j = 0
+ .align 16, 0x90
+.LBB2_2: # %polly.loop_header2.i
+ # Parent Loop BB2_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %ebx, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi
+ movq %rbx, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j]
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j]
+ movq %rdx, %rcx
+ jne .LBB2_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4.i
+ # in Loop: Header=BB2_1 Depth=1
+ incq %rbx
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB2_1
+# BB#4: # %polly.loop_preheader3.preheader
+ movl $C, %ebx # rbx = C pointer for the current outermost tile
+ movl $C, %edi # memset destination: C
+ xorl %esi, %esi # fill byte 0
+ movl $9437184, %edx # imm = 0x900000
+ callq memset
+ xorl %eax, %eax # outermost tile start = 0
+ .align 16, 0x90
+.LBB2_5: # %polly.loop_preheader17
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_15 Depth 2
+ # Child Loop BB2_8 Depth 3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ movq %rax, -56(%rbp) # 8-byte Spill
+ movq %rbx, -88(%rbp) # 8-byte Spill
+ movq %rax, %rcx
+ orq $63, %rcx # rcx = tile start | 63
+ movq %rcx, -72(%rbp) # 8-byte Spill
+ leaq -1(%rcx), %rcx # bound compared against the row index in .LBB2_12
+ movq %rcx, -48(%rbp) # 8-byte Spill
+ movq $-1, %r15 # starting value of the innermost counter; grows by 64 per second-level tile
+ movl $B, %ecx # rcx = B pointer for the current second-level tile
+ movq %rbx, -64(%rbp) # 8-byte Spill
+ xorl %r12d, %r12d # second-level tile start = 0
+ .align 16, 0x90
+.LBB2_15: # %polly.loop_preheader24
+ # Parent Loop BB2_5 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB2_8 Depth 3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ movq %rcx, -80(%rbp) # 8-byte Spill
+ movq %r12, %r13
+ orq $63, %r13 # r13 = tile start | 63
+ leaq -1(%r13), %rbx # upper bound for the innermost counter
+ xorl %r9d, %r9d # third-level tile start = 0
+ movq %rcx, %rdx # rdx = B pointer for the current third-level tile
+ .align 16, 0x90
+.LBB2_8: # %polly.loop_header23
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # => This Loop Header: Depth=3
+ # Child Loop BB2_11 Depth 4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ cmpq -72(%rbp), %rax # 8-byte Folded Reload
+ jg .LBB2_13 # skip the tile if its row range is empty
+# BB#9: # %polly.loop_header30.preheader
+ # in Loop: Header=BB2_8 Depth=3
+ movq %r9, %rax
+ orq $63, %rax # rax = third-level tile start | 63
+ cmpq %rax, %r9
+ jg .LBB2_13
+# BB#10: # in Loop: Header=BB2_8 Depth=3
+ decq %rax # bound compared against %rcx in BB#16
+ movq -64(%rbp), %r10 # 8-byte Reload
+ movq -56(%rbp), %r11 # 8-byte Reload
+ .align 16, 0x90
+.LBB2_11: # %polly.loop_header37.preheader
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # => This Loop Header: Depth=4
+ # Child Loop BB2_17 Depth 5
+ # Child Loop BB2_18 Depth 6
+ cmpq %r13, %r12
+ movq %rdx, %r14 # r14 = B row pointer for this pass
+ movq %r9, %rcx # rcx = column index into A
+ jg .LBB2_12 # uses the flags of the cmpq above
+ .align 16, 0x90
+.LBB2_17: # %polly.loop_header46.preheader
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # Parent Loop BB2_11 Depth=4
+ # => This Loop Header: Depth=5
+ # Child Loop BB2_18 Depth 6
+ leaq (%r11,%r11,2), %rsi
+ shlq $11, %rsi # rsi = %r11 * 6144, the byte offset of row %r11
+ vmovss A(%rsi,%rcx,4), %xmm0 # xmm0 = A[%r11][%rcx], reused across the inner loop
+ movq %r10, %rdi # rdi walks the C row
+ movq %r14, %r8 # r8 walks the B row
+ movq %r15, %rsi # innermost counter
+.LBB2_18: # %polly.loop_header46
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_15 Depth=2
+ # Parent Loop BB2_8 Depth=3
+ # Parent Loop BB2_11 Depth=4
+ # Parent Loop BB2_17 Depth=5
+ # => This Inner Loop Header: Depth=6
+ vmulss (%r8), %xmm0, %xmm1 # B element times the loaded A element
+ vaddss (%rdi), %xmm1, %xmm1 # add the current C element
+ vmovss %xmm1, (%rdi) # store the updated C element
+ addq $4, %rdi
+ addq $4, %r8
+ incq %rsi
+ cmpq %rbx, %rsi
+ jle .LBB2_18
+# BB#16: # %polly.loop_exit48
+ # in Loop: Header=BB2_17 Depth=5
+ addq $6144, %r14 # imm = 0x1800
+ cmpq %rax, %rcx
+ leaq 1(%rcx), %rcx # increment; leaq leaves the flags of the cmpq intact
+ jle .LBB2_17
+ .align 16, 0x90
+.LBB2_12: # %polly.loop_exit39
+ # in Loop: Header=BB2_11 Depth=4
+ addq $6144, %r10 # imm = 0x1800
+ cmpq -48(%rbp), %r11 # 8-byte Folded Reload
+ leaq 1(%r11), %r11 # next row; flags of the cmpq are preserved
+ jle .LBB2_11
+ .align 16, 0x90
+.LBB2_13: # %polly.loop_exit32
+ # in Loop: Header=BB2_8 Depth=3
+ addq $393216, %rdx # imm = 0x60000
+ cmpq $1472, %r9 # imm = 0x5C0
+ leaq 64(%r9), %r9 # advance the third-level tile by 64
+ movq -56(%rbp), %rax # 8-byte Reload
+ jl .LBB2_8
+# BB#14: # %polly.loop_exit25
+ # in Loop: Header=BB2_15 Depth=2
+ addq $256, -64(%rbp) # 8-byte Folded Spill
+ # imm = 0x100
+ movq -80(%rbp), %rcx # 8-byte Reload
+ addq $256, %rcx # imm = 0x100
+ addq $64, %r15 # innermost counter start moves with the tile
+ cmpq $1472, %r12 # imm = 0x5C0
+ leaq 64(%r12), %r12 # advance the second-level tile by 64
+ jl .LBB2_15
+# BB#6: # %polly.loop_exit18
+ # in Loop: Header=BB2_5 Depth=1
+ movq -88(%rbp), %rbx # 8-byte Reload
+ addq $393216, %rbx # imm = 0x60000
+ cmpq $1472, %rax # imm = 0x5C0
+ leaq 64(%rax), %rax # advance the outermost tile by 64
+ jl .LBB2_5
+# BB#7: # %polly.loop_exit11
+ xorl %eax, %eax # return value 0
+ addq $56, %rsp
+ popq %rbx
+ popq %r12
+ popq %r13
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp28:
+ .size main, .Ltmp28-main
+ .cfi_endproc
+
+ .type A,@object # @A
+ .comm A,9437184,16 # 9437184 bytes = 1536*1536*4, 16-byte aligned
+ .type B,@object # @B
+ .comm B,9437184,16 # same size and alignment as A
+ .type .L.str,@object # @.str
+ .section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+ .asciz "%lf " # format string passed to fprintf in print_array
+ .size .L.str, 5 # 4 characters plus the terminating NUL
+
+ .type C,@object # @C
+ .comm C,9437184,16 # same size and alignment as A and B
+
+ .section ".note.GNU-stack","",@progbits
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.exe b/polly/docs/experiments/matmul/matmul.polly.interchanged.exe
new file mode 100755
index 000000000000..240c95a7f790
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.exe
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.ll b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
new file mode 100644
index 000000000000..52fbccc7ed5c
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.ll
Binary files differ
diff --git a/polly/docs/experiments/matmul/matmul.polly.interchanged.s b/polly/docs/experiments/matmul/matmul.polly.interchanged.s
new file mode 100644
index 000000000000..a764da0b3f22
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.polly.interchanged.s
@@ -0,0 +1,286 @@
+ .file "matmul.polly.interchanged.ll"
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI0_0:
+ .quad 4602678819172646912 # double 0.5 (scale factor applied to every initial value)
+ .text
+ .globl init_array
+ .align 16, 0x90
+ .type init_array,@function
+init_array: # @init_array: stores ((i*j) % 1024 + 1) * 0.5, narrowed to float, into A[i][j] and B[i][j] for 0 <= i,j < 1536
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp2:
+ .cfi_def_cfa_offset 16
+.Ltmp3:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp4:
+ .cfi_def_cfa_register %rbp
+ xorl %r8d, %r8d # row index i = 0
+ vmovsd .LCPI0_0(%rip), %xmm0 # xmm0 = 0.5, kept live across both loops
+ .align 16, 0x90
+.LBB0_1: # %polly.loop_preheader3
+ # =>This Loop Header: Depth=1
+ # Child Loop BB0_2 Depth 2
+ xorl %ecx, %ecx # column index j = 0
+ .align 16, 0x90
+.LBB0_2: # %polly.loop_header2
+ # Parent Loop BB0_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %r8d, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi # all ones if the product is negative, else 0
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi # esi = -(product rounded toward zero to a multiple of 1024)
+ movq %r8, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1 (signed remainder)
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx # rdx = j + 1
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1 # int -> double
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1 # double -> float
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j] = value
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j] = same value
+ movq %rdx, %rcx # j = j + 1
+ jne .LBB0_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4
+ # in Loop: Header=BB0_1 Depth=1
+ incq %r8 # next row
+ cmpq $1536, %r8 # imm = 0x600
+ jne .LBB0_1
+# BB#4: # %polly.loop_exit
+ popq %rbp
+ ret
+.Ltmp5:
+ .size init_array, .Ltmp5-init_array
+ .cfi_endproc
+
+ .globl print_array
+ .align 16, 0x90
+ .type print_array,@function
+print_array: # @print_array: writes every element of C to stdout with "%lf ", a newline after each column whose index % 80 == 79 and a newline after each row
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp9:
+ .cfi_def_cfa_offset 16
+.Ltmp10:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp11:
+ .cfi_def_cfa_register %rbp
+ pushq %r15
+ pushq %r14
+ pushq %r12
+ pushq %rbx
+.Ltmp12:
+ .cfi_offset %rbx, -48
+.Ltmp13:
+ .cfi_offset %r12, -40
+.Ltmp14:
+ .cfi_offset %r14, -32
+.Ltmp15:
+ .cfi_offset %r15, -24
+ xorl %r14d, %r14d # row counter = 0
+ movl $C, %r15d # r15 = address of the current row of C
+ .align 16, 0x90
+.LBB1_1: # %for.cond1.preheader
+ # =>This Loop Header: Depth=1
+ # Child Loop BB1_2 Depth 2
+ movq stdout(%rip), %rax # stream for the first fprintf of this row
+ movq %r15, %r12 # r12 = pointer to the current element
+ xorl %ebx, %ebx # column counter j = 0
+ .align 16, 0x90
+.LBB1_2: # %for.body3
+ # Parent Loop BB1_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ vmovss (%r12), %xmm0 # load the float element
+ vcvtss2sd %xmm0, %xmm0, %xmm0 # widen to double for the variadic call
+ movq %rax, %rdi # arg 1: stream
+ movl $.L.str, %esi # arg 2: format "%lf "
+ movb $1, %al # one vector register carries a variadic argument
+ callq fprintf
+ movslq %ebx, %rax
+ imulq $1717986919, %rax, %rcx # imm = 0x66666667
+ movq %rcx, %rdx
+ shrq $63, %rdx # sign bit of the product
+ sarq $37, %rcx # 0x66666667 / 2^37 = 1/80, so this is j / 80
+ addl %edx, %ecx # adjust the quotient to round toward zero
+ imull $80, %ecx, %ecx
+ subl %ecx, %eax # eax = j % 80
+ cmpl $79, %eax
+ jne .LBB1_4
+# BB#3: # %if.then
+ # in Loop: Header=BB1_2 Depth=2
+ movq stdout(%rip), %rsi
+ movl $10, %edi # '\n'
+ callq fputc
+.LBB1_4: # %for.inc
+ # in Loop: Header=BB1_2 Depth=2
+ addq $4, %r12 # advance to the next float
+ incq %rbx
+ movq stdout(%rip), %rax # reload stdout for the next call
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB1_2
+# BB#5: # %for.end
+ # in Loop: Header=BB1_1 Depth=1
+ movl $10, %edi # '\n' terminating the row
+ movq %rax, %rsi
+ callq fputc
+ addq $6144, %r15 # imm = 0x1800
+ incq %r14
+ cmpq $1536, %r14 # imm = 0x600
+ jne .LBB1_1
+# BB#6: # %for.end12
+ popq %rbx
+ popq %r12
+ popq %r14
+ popq %r15
+ popq %rbp
+ ret
+.Ltmp16:
+ .size print_array, .Ltmp16-print_array
+ .cfi_endproc
+
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 8
+.LCPI2_0:
+ .quad 4602678819172646912 # double 0.5
+ .text
+ .globl main
+ .align 16, 0x90
+ .type main,@function
+main: # @main: initializes A and B (same code as init_array), zeroes C with memset, then accumulates C += A*B with the loop over C's columns innermost; returns 0
+ .cfi_startproc
+# BB#0: # %entry
+ pushq %rbp
+.Ltmp20:
+ .cfi_def_cfa_offset 16
+.Ltmp21:
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+.Ltmp22:
+ .cfi_def_cfa_register %rbp
+ pushq %r14
+ pushq %rbx
+.Ltmp23:
+ .cfi_offset %rbx, -32
+.Ltmp24:
+ .cfi_offset %r14, -24
+ xorl %ebx, %ebx # init row index i = 0
+ vmovsd .LCPI2_0(%rip), %xmm0 # xmm0 = 0.5
+ .align 16, 0x90
+.LBB2_1: # %polly.loop_preheader3.i
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_2 Depth 2
+ xorl %ecx, %ecx # init column index j = 0
+ .align 16, 0x90
+.LBB2_2: # %polly.loop_header2.i
+ # Parent Loop BB2_1 Depth=1
+ # => This Inner Loop Header: Depth=2
+ movl %ecx, %edx
+ imull %ebx, %edx # edx = i * j (32-bit)
+ movl %edx, %esi
+ sarl $31, %esi
+ shrl $22, %esi # bias of 1023 for negative products, 0 otherwise
+ addl %edx, %esi
+ andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
+ negl %esi
+ movq %rbx, %rax
+ shlq $11, %rax # rax = i * 2048
+ leal 1(%rdx,%rsi), %edi # edi = (i*j) % 1024 + 1
+ leaq (%rax,%rax,2), %rsi # rsi = i * 6144, the byte offset of row i
+ leaq 1(%rcx), %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ vcvtsi2sdl %edi, %xmm0, %xmm1
+ vmulsd %xmm0, %xmm1, %xmm1 # multiply by 0.5
+ vcvtsd2ss %xmm1, %xmm1, %xmm1
+ vmovss %xmm1, A(%rsi,%rcx,4) # A[i][j]
+ vmovss %xmm1, B(%rsi,%rcx,4) # B[i][j]
+ movq %rdx, %rcx
+ jne .LBB2_2 # uses the flags of the cmpq above
+# BB#3: # %polly.loop_exit4.i
+ # in Loop: Header=BB2_1 Depth=1
+ incq %rbx
+ cmpq $1536, %rbx # imm = 0x600
+ jne .LBB2_1
+# BB#4: # %polly.loop_preheader3.preheader
+ movl $C, %r14d # r14 = pointer to the current row of C
+ movl $C, %edi # memset destination: C
+ xorl %esi, %esi # fill byte 0
+ movl $9437184, %edx # imm = 0x900000
+ callq memset
+ xorl %eax, %eax # row index i = 0
+ .align 16, 0x90
+.LBB2_5: # %polly.loop_preheader17
+ # =>This Loop Header: Depth=1
+ # Child Loop BB2_10 Depth 2
+ # Child Loop BB2_8 Depth 3
+ movl $B, %ebx # rbx = pointer to the current row of B, restarted for each i
+ xorl %edx, %edx # reduction index k = 0
+ .align 16, 0x90
+.LBB2_10: # %polly.loop_preheader24
+ # Parent Loop BB2_5 Depth=1
+ # => This Loop Header: Depth=2
+ # Child Loop BB2_8 Depth 3
+ leaq (%rax,%rax,2), %rcx
+ shlq $11, %rcx # rcx = i * 6144, the byte offset of row i
+ vmovss A(%rcx,%rdx,4), %xmm0 # xmm0 = A[i][k], reused for the whole inner loop
+ movl $1536, %esi # imm = 0x600
+ movq %r14, %rdi # rdi walks row i of C
+ movq %rbx, %rcx # rcx walks row k of B
+ .align 16, 0x90
+.LBB2_8: # %polly.loop_header23
+ # Parent Loop BB2_5 Depth=1
+ # Parent Loop BB2_10 Depth=2
+ # => This Inner Loop Header: Depth=3
+ vmulss (%rcx), %xmm0, %xmm1 # A[i][k] * B[k][j]
+ vaddss (%rdi), %xmm1, %xmm1 # add the current C[i][j]
+ vmovss %xmm1, (%rdi) # store the updated C[i][j]
+ addq $4, %rdi
+ addq $4, %rcx
+ decq %rsi # counts down from 1536 to 0
+ jne .LBB2_8
+# BB#9: # %polly.loop_exit25
+ # in Loop: Header=BB2_10 Depth=2
+ addq $6144, %rbx # imm = 0x1800
+ incq %rdx
+ cmpq $1536, %rdx # imm = 0x600
+ jne .LBB2_10
+# BB#6: # %polly.loop_exit18
+ # in Loop: Header=BB2_5 Depth=1
+ addq $6144, %r14 # imm = 0x1800
+ incq %rax
+ cmpq $1536, %rax # imm = 0x600
+ jne .LBB2_5
+# BB#7: # %polly.loop_exit11
+ xorl %eax, %eax # return value 0
+ popq %rbx
+ popq %r14
+ popq %rbp
+ ret
+.Ltmp25:
+ .size main, .Ltmp25-main
+ .cfi_endproc
+
+ .type A,@object # @A
+ .comm A,9437184,16 # 9437184 bytes = 1536*1536*4, 16-byte aligned
+ .type B,@object # @B
+ .comm B,9437184,16 # same size and alignment as A
+ .type .L.str,@object # @.str
+ .section .rodata.str1.1,"aMS",@progbits,1
+.L.str:
+ .asciz "%lf " # format string passed to fprintf in print_array
+ .size .L.str, 5 # 4 characters plus the terminating NUL
+
+ .type C,@object # @C
+ .comm C,9437184,16 # same size and alignment as A and B
+
+ .section ".note.GNU-stack","",@progbits
diff --git a/polly/docs/experiments/matmul/matmul.preopt.ll b/polly/docs/experiments/matmul/matmul.preopt.ll
new file mode 100644
index 000000000000..db5366425740
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.preopt.ll
@@ -0,0 +1,171 @@
+; ModuleID = 'matmul.s'
+source_filename = "matmul.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@A = common global [1536 x [1536 x float]] zeroinitializer, align 16 ; left operand, written by @init_array and read by @main
+@B = common global [1536 x [1536 x float]] zeroinitializer, align 16 ; right operand, written by @init_array and read by @main
+@stdout = external global %struct._IO_FILE*, align 8
+@.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1 ; format string used by @print_array
+@C = common global [1536 x [1536 x float]] zeroinitializer, align 16 ; result matrix, written by @main and read by @print_array
+@.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1 ; newline string; not referenced by the functions in this module
+
+; Function Attrs: nounwind uwtable
+define void @init_array() #0 { ; stores ((i*j) % 1024 + 1) * 0.5, truncated to float, into @A[i][j] and @B[i][j] for 0 <= i,j < 1536
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry.split, %for.inc17
+ %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ] ; row index i
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] ; column index j
+ %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5
+ %1 = trunc i64 %0 to i32
+ %rem = srem i32 %1, 1024
+ %add = add nsw i32 %rem, 1
+ %conv = sitofp i32 %add to double
+ %div = fmul double %conv, 5.000000e-01 ; halves the value by multiplying with 0.5
+ %conv4 = fptrunc double %div to float
+ %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv
+ store float %conv4, float* %arrayidx6, align 4
+ %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5 ; same value sequence recomputed for @B
+ %3 = trunc i64 %2 to i32
+ %rem8 = srem i32 %3, 1024
+ %add9 = add nsw i32 %rem8, 1
+ %conv10 = sitofp i32 %add9 to double
+ %div11 = fmul double %conv10, 5.000000e-01
+ %conv12 = fptrunc double %div11 to float
+ %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv
+ store float %conv12, float* %arrayidx16, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 1536
+ br i1 %exitcond, label %for.body3, label %for.inc17
+
+for.inc17: ; preds = %for.body3
+ %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1
+ %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536
+ br i1 %exitcond7, label %for.cond1.preheader, label %for.end19
+
+for.end19: ; preds = %for.inc17
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @print_array() #0 { ; prints every element of @C to stdout with "%lf ", a newline after each column whose index % 80 == 79 and a newline after each row
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry.split, %for.end
+ %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ] ; row index
+ %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.inc
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] ; column index
+ %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ] ; stdout value loaded most recently
+ %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv
+ %2 = load float, float* %arrayidx5, align 4
+ %conv = fpext float %2 to double
+ %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2
+ %3 = trunc i64 %indvars.iv to i32
+ %rem = srem i32 %3, 80
+ %cmp6 = icmp eq i32 %rem, 79 ; true on the last column of each group of 80
+ br i1 %cmp6, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body3
+ %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4) ; 10 is '\n'
+ br label %for.inc
+
+for.inc: ; preds = %for.body3, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ %exitcond = icmp ne i64 %indvars.iv.next, 1536
+ br i1 %exitcond, label %for.body3, label %for.end
+
+for.end: ; preds = %for.inc
+ %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]
+ %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa) ; newline that ends the row
+ %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1
+ %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536
+ br i1 %exitcond8, label %for.cond1.preheader, label %for.end12
+
+for.end12: ; preds = %for.end
+ ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 { ; calls @init_array, then for each i,j sets @C[i][j] to 0 and adds @A[i][k] * @B[k][j] for k in 0..1535; returns 0
+entry:
+ br label %entry.split
+
+entry.split: ; preds = %entry
+ tail call void @init_array()
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry.split, %for.inc28
+ %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ] ; row index i
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.inc25
+ %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5, %for.inc25 ] ; column index j
+ %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4
+ store float 0.000000e+00, float* %arrayidx5, align 4 ; C[i][j] = 0 before the reduction
+ br label %for.body8
+
+for.body8: ; preds = %for.body3, %for.body8
+ %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ] ; reduction index k
+ %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4
+ %0 = load float, float* %arrayidx12, align 4 ; running sum is reloaded from memory on every iteration
+ %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv
+ %1 = load float, float* %arrayidx16, align 4 ; A[i][k]
+ %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4
+ %2 = load float, float* %arrayidx20, align 4 ; B[k][j]
+ %mul = fmul float %1, %2
+ %add = fadd float %0, %mul
+ %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4
+ store float %add, float* %arrayidx24, align 4 ; C[i][j] updated in place
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp ne i64 %indvars.iv.next, 1536
+ br i1 %exitcond, label %for.body8, label %for.inc25
+
+for.inc25: ; preds = %for.body8
+ %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
+ %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536
+ br i1 %exitcond6, label %for.body3, label %for.inc28
+
+for.inc28: ; preds = %for.inc25
+ %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+ %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536
+ br i1 %exitcond9, label %for.cond1.preheader, label %for.end30
+
+for.end30: ; preds = %for.inc28
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #2
+
+; Function Attrs: nounwind
+declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"}
diff --git a/polly/docs/experiments/matmul/matmul.s b/polly/docs/experiments/matmul/matmul.s
new file mode 100644
index 000000000000..17147be24476
--- /dev/null
+++ b/polly/docs/experiments/matmul/matmul.s
@@ -0,0 +1,269 @@
+; ModuleID = 'matmul.c'
+source_filename = "matmul.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@A = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@B = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@stdout = external global %struct._IO_FILE*, align 8
+@.str = private unnamed_addr constant [5 x i8] c"%lf \00", align 1
+@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
+@.str.1 = private unnamed_addr constant [2 x i8] c"\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define void @init_array() #0 { ; Unoptimized IR: stores (1 + (i * j) srem 1024) / 2.0 into @A[i][j] and @B[i][j] for all i, j in [0, 1536).
+entry:
+ %i = alloca i32, align 4 ; stack slot of the outer loop counter
+ %j = alloca i32, align 4 ; stack slot of the inner loop counter
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc17, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 1536
+ br i1 %cmp, label %for.body, label %for.end19
+
+for.body: ; preds = %for.cond
+ store i32 0, i32* %j, align 4
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc, %for.body
+ %1 = load i32, i32* %j, align 4
+ %cmp2 = icmp slt i32 %1, 1536
+ br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3: ; preds = %for.cond1
+ %2 = load i32, i32* %i, align 4
+ %3 = load i32, i32* %j, align 4
+ %mul = mul nsw i32 %2, %3
+ %rem = srem i32 %mul, 1024
+ %add = add nsw i32 1, %rem
+ %conv = sitofp i32 %add to double
+ %div = fdiv double %conv, 2.000000e+00
+ %conv4 = fptrunc double %div to float ; value is computed in double, then narrowed to float
+ %4 = load i32, i32* %j, align 4
+ %idxprom = sext i32 %4 to i64
+ %5 = load i32, i32* %i, align 4
+ %idxprom5 = sext i32 %5 to i64
+ %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom5
+ %arrayidx6 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+ store float %conv4, float* %arrayidx6, align 4 ; A[i][j] = value
+ %6 = load i32, i32* %i, align 4 ; the same value is recomputed from scratch for @B
+ %7 = load i32, i32* %j, align 4
+ %mul7 = mul nsw i32 %6, %7
+ %rem8 = srem i32 %mul7, 1024
+ %add9 = add nsw i32 1, %rem8
+ %conv10 = sitofp i32 %add9 to double
+ %div11 = fdiv double %conv10, 2.000000e+00
+ %conv12 = fptrunc double %div11 to float
+ %8 = load i32, i32* %j, align 4
+ %idxprom13 = sext i32 %8 to i64
+ %9 = load i32, i32* %i, align 4
+ %idxprom14 = sext i32 %9 to i64
+ %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom14
+ %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13
+ store float %conv12, float* %arrayidx16, align 4 ; B[i][j] = value
+ br label %for.inc
+
+for.inc: ; preds = %for.body3
+ %10 = load i32, i32* %j, align 4
+ %inc = add nsw i32 %10, 1
+ store i32 %inc, i32* %j, align 4
+ br label %for.cond1
+
+for.end: ; preds = %for.cond1
+ br label %for.inc17
+
+for.inc17: ; preds = %for.end
+ %11 = load i32, i32* %i, align 4
+ %inc18 = add nsw i32 %11, 1
+ store i32 %inc18, i32* %i, align 4
+ br label %for.cond
+
+for.end19: ; preds = %for.cond
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @print_array() #0 { ; Unoptimized IR: writes every @C[i][j] (i, j in [0, 1536)) to @stdout using the "%lf " format, with a newline after each j where j srem 80 == 79 and after each row.
+entry:
+ %i = alloca i32, align 4 ; stack slot of the outer loop counter
+ %j = alloca i32, align 4 ; stack slot of the inner loop counter
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc10, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 1536
+ br i1 %cmp, label %for.body, label %for.end12
+
+for.body: ; preds = %for.cond
+ store i32 0, i32* %j, align 4
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc, %for.body
+ %1 = load i32, i32* %j, align 4
+ %cmp2 = icmp slt i32 %1, 1536
+ br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3: ; preds = %for.cond1
+ %2 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ %3 = load i32, i32* %j, align 4
+ %idxprom = sext i32 %3 to i64
+ %4 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %4 to i64
+ %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4
+ %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+ %5 = load float, float* %arrayidx5, align 4 ; C[i][j]
+ %conv = fpext float %5 to double ; widened to double before being passed as the variadic argument
+ %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %2, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0), double %conv)
+ %6 = load i32, i32* %j, align 4
+ %rem = srem i32 %6, 80
+ %cmp6 = icmp eq i32 %rem, 79 ; true on every 80th column
+ br i1 %cmp6, label %if.then, label %if.end
+
+if.then: ; preds = %for.body3
+ %7 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ %call8 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %7, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0))
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body3
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %8 = load i32, i32* %j, align 4
+ %inc = add nsw i32 %8, 1
+ store i32 %inc, i32* %j, align 4
+ br label %for.cond1
+
+for.end: ; preds = %for.cond1
+ %9 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8
+ %call9 = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %9, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0))
+ br label %for.inc10
+
+for.inc10: ; preds = %for.end
+ %10 = load i32, i32* %i, align 4
+ %inc11 = add nsw i32 %10, 1
+ store i32 %inc11, i32* %i, align 4
+ br label %for.cond
+
+for.end12: ; preds = %for.cond
+ ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 { ; Unoptimized IR: calls @init_array, then for all i, j in [0, 1536) sets C[i][j] = 0 and accumulates A[i][k] * B[k][j] over k in [0, 1536); returns 0.
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ %j = alloca i32, align 4
+ %k = alloca i32, align 4
+ %t_start = alloca double, align 8 ; allocated but not otherwise referenced in this function
+ %t_end = alloca double, align 8 ; allocated but not otherwise referenced in this function
+ store i32 0, i32* %retval, align 4
+ call void @init_array()
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc28, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 1536
+ br i1 %cmp, label %for.body, label %for.end30
+
+for.body: ; preds = %for.cond
+ store i32 0, i32* %j, align 4
+ br label %for.cond1
+
+for.cond1: ; preds = %for.inc25, %for.body
+ %1 = load i32, i32* %j, align 4
+ %cmp2 = icmp slt i32 %1, 1536
+ br i1 %cmp2, label %for.body3, label %for.end27
+
+for.body3: ; preds = %for.cond1
+ %2 = load i32, i32* %j, align 4
+ %idxprom = sext i32 %2 to i64
+ %3 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %3 to i64
+ %arrayidx = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom4
+ %arrayidx5 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx, i64 0, i64 %idxprom
+ store float 0.000000e+00, float* %arrayidx5, align 4 ; C[i][j] = 0
+ store i32 0, i32* %k, align 4
+ br label %for.cond6
+
+for.cond6: ; preds = %for.inc, %for.body3
+ %4 = load i32, i32* %k, align 4
+ %cmp7 = icmp slt i32 %4, 1536
+ br i1 %cmp7, label %for.body8, label %for.end
+
+for.body8: ; preds = %for.cond6
+ %5 = load i32, i32* %j, align 4
+ %idxprom9 = sext i32 %5 to i64
+ %6 = load i32, i32* %i, align 4
+ %idxprom10 = sext i32 %6 to i64
+ %arrayidx11 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom10
+ %arrayidx12 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx11, i64 0, i64 %idxprom9
+ %7 = load float, float* %arrayidx12, align 4 ; C[i][j]
+ %8 = load i32, i32* %k, align 4
+ %idxprom13 = sext i32 %8 to i64
+ %9 = load i32, i32* %i, align 4
+ %idxprom14 = sext i32 %9 to i64
+ %arrayidx15 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @A, i64 0, i64 %idxprom14
+ %arrayidx16 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx15, i64 0, i64 %idxprom13
+ %10 = load float, float* %arrayidx16, align 4 ; A[i][k]
+ %11 = load i32, i32* %j, align 4
+ %idxprom17 = sext i32 %11 to i64
+ %12 = load i32, i32* %k, align 4
+ %idxprom18 = sext i32 %12 to i64
+ %arrayidx19 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @B, i64 0, i64 %idxprom18
+ %arrayidx20 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx19, i64 0, i64 %idxprom17
+ %13 = load float, float* %arrayidx20, align 4 ; B[k][j]
+ %mul = fmul float %10, %13
+ %add = fadd float %7, %mul
+ %14 = load i32, i32* %j, align 4
+ %idxprom21 = sext i32 %14 to i64
+ %15 = load i32, i32* %i, align 4
+ %idxprom22 = sext i32 %15 to i64
+ %arrayidx23 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x float]]* @C, i64 0, i64 %idxprom22
+ %arrayidx24 = getelementptr inbounds [1536 x float], [1536 x float]* %arrayidx23, i64 0, i64 %idxprom21
+ store float %add, float* %arrayidx24, align 4 ; C[i][j] = C[i][j] + A[i][k] * B[k][j]
+ br label %for.inc
+
+for.inc: ; preds = %for.body8
+ %16 = load i32, i32* %k, align 4
+ %inc = add nsw i32 %16, 1
+ store i32 %inc, i32* %k, align 4
+ br label %for.cond6
+
+for.end: ; preds = %for.cond6
+ br label %for.inc25
+
+for.inc25: ; preds = %for.end
+ %17 = load i32, i32* %j, align 4
+ %inc26 = add nsw i32 %17, 1
+ store i32 %inc26, i32* %j, align 4
+ br label %for.cond1
+
+for.end27: ; preds = %for.cond1
+ br label %for.inc28
+
+for.inc28: ; preds = %for.end27
+ %18 = load i32, i32* %i, align 4
+ %inc29 = add nsw i32 %18, 1
+ store i32 %inc29, i32* %i, align 4
+ br label %for.cond
+
+for.end30: ; preds = %for.cond
+ ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git 081569d9a29c7bc827b2d41f8e62891bbc895e2f) (http://llvm.org/git/llvm.git e117e506536626352e8e47f6c72cd6e2a276622c)"}
diff --git a/polly/docs/experiments/matmul/runall.sh b/polly/docs/experiments/matmul/runall.sh
new file mode 100755
index 000000000000..575b58f98246
--- /dev/null
+++ b/polly/docs/experiments/matmul/runall.sh
@@ -0,0 +1,95 @@
+#!/bin/sh -a
+
+echo "--> 1. Create LLVM-IR from C"
+clang -S -emit-llvm matmul.c -o matmul.s
+
+echo "--> 2. Prepare the LLVM-IR for Polly"
+opt -S -polly-canonicalize matmul.s > matmul.preopt.ll
+
+echo "--> 3. Show the SCoPs detected by Polly"
+opt -basicaa -polly-ast -analyze -q matmul.preopt.ll \
+ -polly-process-unprofitable
+
+echo "--> 4.1 Highlight the detected SCoPs in the CFGs of the program"
+# We only create .dot files, as -view-scops directly calls graphviz,
+# which would require user interaction to continue the script.
+# opt -basicaa -view-scops -disable-output matmul.preopt.ll
+opt -basicaa -dot-scops -disable-output matmul.preopt.ll
+
+echo "--> 4.2 Highlight the detected SCoPs in the CFGs of the program (print \
+no instructions)"
+# We only create .dot files, as -view-scops-only directly calls
+# graphviz, which would require user interaction to continue the script.
+# opt -basicaa -view-scops-only -disable-output matmul.preopt.ll
+opt -basicaa -dot-scops-only -disable-output matmul.preopt.ll
+
+echo "--> 4.3 Create .png files from the .dot files"
+for i in *.dot; do [ -e "$i" ] || continue; dot -Tpng "$i" > "$i.png"; done # glob directly instead of parsing ls output
+
+echo "--> 5. View the polyhedral representation of the SCoPs"
+opt -basicaa -polly-scops -analyze matmul.preopt.ll -polly-process-unprofitable
+
+echo "--> 6. Show the dependences for the SCoPs"
+opt -basicaa -polly-dependences -analyze matmul.preopt.ll \
+ -polly-process-unprofitable
+
+echo "--> 7. Export jscop files"
+opt -basicaa -polly-export-jscop matmul.preopt.ll -polly-process-unprofitable
+
+echo "--> 8. Import the updated jscop files and print the new SCoPs. (optional)"
+opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
+ -polly-process-unprofitable
+opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
+ -polly-import-jscop-postfix=interchanged -polly-process-unprofitable
+opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
+ -polly-import-jscop-postfix=interchanged+tiled -polly-process-unprofitable
+opt -basicaa -polly-import-jscop -polly-ast -analyze matmul.preopt.ll \
+ -polly-import-jscop-postfix=interchanged+tiled+vector \
+ -polly-process-unprofitable
+
+echo "--> 9. Codegenerate the SCoPs"
+opt -basicaa -polly-import-jscop -polly-import-jscop-postfix=interchanged \
+ -polly-codegen -polly-process-unprofitable\
+ matmul.preopt.ll | opt -O3 > matmul.polly.interchanged.ll
+opt -basicaa -polly-import-jscop \
+ -polly-import-jscop-postfix=interchanged+tiled -polly-codegen \
+ matmul.preopt.ll -polly-process-unprofitable \
+ | opt -O3 > matmul.polly.interchanged+tiled.ll
+opt -basicaa -polly-import-jscop -polly-process-unprofitable\
+ -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \
+ matmul.preopt.ll -polly-vectorizer=polly\
+ | opt -O3 > matmul.polly.interchanged+tiled+vector.ll
+opt -basicaa -polly-import-jscop -polly-process-unprofitable\
+ -polly-import-jscop-postfix=interchanged+tiled+vector -polly-codegen \
+ matmul.preopt.ll -polly-vectorizer=polly -polly-parallel\
+ | opt -O3 > matmul.polly.interchanged+tiled+vector+openmp.ll
+opt matmul.preopt.ll | opt -O3 > matmul.normalopt.ll # baseline: -O3 only, no Polly options
+
+echo "--> 10. Create the executables"
+llc matmul.polly.interchanged.ll -o matmul.polly.interchanged.s && gcc matmul.polly.interchanged.s \
+ -o matmul.polly.interchanged.exe
+llc matmul.polly.interchanged+tiled.ll -o matmul.polly.interchanged+tiled.s && gcc matmul.polly.interchanged+tiled.s \
+ -o matmul.polly.interchanged+tiled.exe
+llc matmul.polly.interchanged+tiled+vector.ll \
+ -o matmul.polly.interchanged+tiled+vector.s \
+ && gcc matmul.polly.interchanged+tiled+vector.s \
+ -o matmul.polly.interchanged+tiled+vector.exe
+llc matmul.polly.interchanged+tiled+vector+openmp.ll \
+ -o matmul.polly.interchanged+tiled+vector+openmp.s \
+ && gcc matmul.polly.interchanged+tiled+vector+openmp.s -lgomp \
+ -o matmul.polly.interchanged+tiled+vector+openmp.exe # -lgomp follows the input: the linker resolves libraries in command-line order
+llc matmul.normalopt.ll -o matmul.normalopt.s && gcc matmul.normalopt.s \
+ -o matmul.normalopt.exe
+
+echo "--> 11. Compare the runtime of the executables"
+# 'command' bypasses a shell-level time keyword so the external time utility, which accepts -f, is run.
+echo "time ./matmul.normalopt.exe"
+command time -f "%E real, %U user, %S sys" ./matmul.normalopt.exe
+echo "time ./matmul.polly.interchanged.exe"
+command time -f "%E real, %U user, %S sys" ./matmul.polly.interchanged.exe
+echo "time ./matmul.polly.interchanged+tiled.exe"
+command time -f "%E real, %U user, %S sys" ./matmul.polly.interchanged+tiled.exe
+echo "time ./matmul.polly.interchanged+tiled+vector.exe"
+command time -f "%E real, %U user, %S sys" ./matmul.polly.interchanged+tiled+vector.exe
+echo "time ./matmul.polly.interchanged+tiled+vector+openmp.exe"
+command time -f "%E real, %U user, %S sys" ./matmul.polly.interchanged+tiled+vector+openmp.exe
diff --git a/polly/docs/experiments/matmul/scops.init_array.dot b/polly/docs/experiments/matmul/scops.init_array.dot
new file mode 100644
index 000000000000..3b9d6c9c5865
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.init_array.dot
@@ -0,0 +1,39 @@
+digraph "Scop Graph for 'init_array' function" {
+ label="Scop Graph for 'init_array' function";
+
+ Node0x5b5b5a0 [shape=record,label="{entry:\l br label %entry.split\l}"];
+ Node0x5b5b5a0 -> Node0x5b5de30;
+ Node0x5b5de30 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"];
+ Node0x5b5de30 -> Node0x5b5de50;
+ Node0x5b5de50 [shape=record,label="{for.cond1.preheader: \l %indvars.iv5 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next6, %for.inc17 ]\l br label %for.body3\l}"];
+ Node0x5b5de50 -> Node0x5b5b570;
+ Node0x5b5b570 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.body3 ]\l %0 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l %1 = trunc i64 %0 to i32\l %rem = srem i32 %1, 1024\l %add = add nsw i32 %rem, 1\l %conv = sitofp i32 %add to double\l %div = fmul double %conv, 5.000000e-01\l %conv4 = fptrunc double %div to float\l %arrayidx6 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @A, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l store float %conv4, float* %arrayidx6, align 4\l %2 = mul nuw nsw i64 %indvars.iv, %indvars.iv5\l %3 = trunc i64 %2 to i32\l %rem8 = srem i32 %3, 1024\l %add9 = add nsw i32 %rem8, 1\l %conv10 = sitofp i32 %add9 to double\l %div11 = fmul double %conv10, 5.000000e-01\l %conv12 = fptrunc double %div11 to float\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv5, i64 %indvars.iv\l store float %conv12, float* %arrayidx16, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.inc17\l}"];
+ Node0x5b5b570 -> Node0x5b5b570[constraint=false];
+ Node0x5b5b570 -> Node0x5b5df30;
+ Node0x5b5df30 [shape=record,label="{for.inc17: \l %indvars.iv.next6 = add nuw nsw i64 %indvars.iv5, 1\l %exitcond7 = icmp ne i64 %indvars.iv.next6, 1536\l br i1 %exitcond7, label %for.cond1.preheader, label %for.end19\l}"];
+ Node0x5b5df30 -> Node0x5b5de50[constraint=false];
+ Node0x5b5df30 -> Node0x5b5df90;
+ Node0x5b5df90 [shape=record,label="{for.end19: \l ret void\l}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5b4bdd0 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5b4bf50 {
+ label = "Region can not profitably be optimized!";
+ style = solid;
+ color = 6
+ subgraph cluster_0x5b4c0d0 {
+ label = "";
+ style = solid;
+ color = 5
+ Node0x5b5b570;
+ }
+ Node0x5b5de50;
+ Node0x5b5df30;
+ }
+ Node0x5b5b5a0;
+ Node0x5b5de30;
+ Node0x5b5df90;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scops.init_array.dot.png b/polly/docs/experiments/matmul/scops.init_array.dot.png
new file mode 100644
index 000000000000..48a9f38946a9
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.init_array.dot.png
Binary files differ
diff --git a/polly/docs/experiments/matmul/scops.main.dot b/polly/docs/experiments/matmul/scops.main.dot
new file mode 100644
index 000000000000..e4abe8fbec88
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.main.dot
@@ -0,0 +1,50 @@
+digraph "Scop Graph for 'main' function" {
+ label="Scop Graph for 'main' function";
+
+ Node0x5b5c850 [shape=record,label="{entry:\l br label %entry.split\l}"];
+ Node0x5b5c850 -> Node0x5b5a440;
+ Node0x5b5a440 [shape=record,label="{entry.split: \l tail call void @init_array()\l br label %for.cond1.preheader\l}"];
+ Node0x5b5a440 -> Node0x5b38cd0;
+ Node0x5b38cd0 [shape=record,label="{for.cond1.preheader: \l %indvars.iv7 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next8, %for.inc28 ]\l br label %for.body3\l}"];
+ Node0x5b38cd0 -> Node0x5b4bd30;
+ Node0x5b4bd30 [shape=record,label="{for.body3: \l %indvars.iv4 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next5,\l... %for.inc25 ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l store float 0.000000e+00, float* %arrayidx5, align 4\l br label %for.body8\l}"];
+ Node0x5b4bd30 -> Node0x5b38c50;
+ Node0x5b38c50 [shape=record,label="{for.body8: \l %indvars.iv = phi i64 [ 0, %for.body3 ], [ %indvars.iv.next, %for.body8 ]\l %arrayidx12 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l %0 = load float, float* %arrayidx12, align 4\l %arrayidx16 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @A, i64 0, i64 %indvars.iv7, i64 %indvars.iv\l %1 = load float, float* %arrayidx16, align 4\l %arrayidx20 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv4\l %2 = load float, float* %arrayidx20, align 4\l %mul = fmul float %1, %2\l %add = fadd float %0, %mul\l %arrayidx24 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536\l... x float]]* @C, i64 0, i64 %indvars.iv7, i64 %indvars.iv4\l store float %add, float* %arrayidx24, align 4\l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body8, label %for.inc25\l}"];
+ Node0x5b38c50 -> Node0x5b38c50[constraint=false];
+ Node0x5b38c50 -> Node0x5b5a290;
+ Node0x5b5a290 [shape=record,label="{for.inc25: \l %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1\l %exitcond6 = icmp ne i64 %indvars.iv.next5, 1536\l br i1 %exitcond6, label %for.body3, label %for.inc28\l}"];
+ Node0x5b5a290 -> Node0x5b4bd30[constraint=false];
+ Node0x5b5a290 -> Node0x5b5a340;
+ Node0x5b5a340 [shape=record,label="{for.inc28: \l %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1\l %exitcond9 = icmp ne i64 %indvars.iv.next8, 1536\l br i1 %exitcond9, label %for.cond1.preheader, label %for.end30\l}"];
+ Node0x5b5a340 -> Node0x5b38cd0[constraint=false];
+ Node0x5b5a340 -> Node0x5b5a3a0;
+ Node0x5b5a3a0 [shape=record,label="{for.end30: \l ret i32 0\l}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5b5c970 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5b5c5a0 {
+ label = "";
+ style = filled;
+ color = 3 subgraph cluster_0x5b5c9f0 {
+ label = "";
+ style = solid;
+ color = 5
+ subgraph cluster_0x5b5c110 {
+ label = "";
+ style = solid;
+ color = 7
+ Node0x5b38c50;
+ }
+ Node0x5b4bd30;
+ Node0x5b5a290;
+ }
+ Node0x5b38cd0;
+ Node0x5b5a340;
+ }
+ Node0x5b5c850;
+ Node0x5b5a440;
+ Node0x5b5a3a0;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scops.main.dot.png b/polly/docs/experiments/matmul/scops.main.dot.png
new file mode 100644
index 000000000000..4e73701a08d7
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.main.dot.png
Binary files differ
diff --git a/polly/docs/experiments/matmul/scops.print_array.dot b/polly/docs/experiments/matmul/scops.print_array.dot
new file mode 100644
index 000000000000..748ccb170cd0
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.print_array.dot
@@ -0,0 +1,51 @@
+digraph "Scop Graph for 'print_array' function" {
+ label="Scop Graph for 'print_array' function";
+
+ Node0x5b5ee00 [shape=record,label="{entry:\l br label %entry.split\l}"];
+ Node0x5b5ee00 -> Node0x5b5ee50;
+ Node0x5b5ee50 [shape=record,label="{entry.split: \l br label %for.cond1.preheader\l}"];
+ Node0x5b5ee50 -> Node0x5b5ee70;
+ Node0x5b5ee70 [shape=record,label="{for.cond1.preheader: \l %indvars.iv6 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next7, %for.end ]\l %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l br label %for.body3\l}"];
+ Node0x5b5ee70 -> Node0x5b5ee20;
+ Node0x5b5ee20 [shape=record,label="{for.body3: \l %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next,\l... %for.inc ]\l %1 = phi %struct._IO_FILE* [ %0, %for.cond1.preheader ], [ %5, %for.inc ]\l %arrayidx5 = getelementptr inbounds [1536 x [1536 x float]], [1536 x [1536 x\l... float]]* @C, i64 0, i64 %indvars.iv6, i64 %indvars.iv\l %2 = load float, float* %arrayidx5, align 4\l %conv = fpext float %2 to double\l %call = tail call i32 (%struct._IO_FILE*, i8*, ...)\l... @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x\l... i8]* @.str, i64 0, i64 0), double %conv) #2\l %3 = trunc i64 %indvars.iv to i32\l %rem = srem i32 %3, 80\l %cmp6 = icmp eq i32 %rem, 79\l br i1 %cmp6, label %if.then, label %for.inc\l}"];
+ Node0x5b5ee20 -> Node0x5b60d10;
+ Node0x5b5ee20 -> Node0x5b60d70;
+ Node0x5b60d10 [shape=record,label="{if.then: \l %4 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %fputc3 = tail call i32 @fputc(i32 10, %struct._IO_FILE* %4)\l br label %for.inc\l}"];
+ Node0x5b60d10 -> Node0x5b60d70;
+ Node0x5b60d70 [shape=record,label="{for.inc: \l %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\l %5 = load %struct._IO_FILE*, %struct._IO_FILE** @stdout, align 8\l %exitcond = icmp ne i64 %indvars.iv.next, 1536\l br i1 %exitcond, label %for.body3, label %for.end\l}"];
+ Node0x5b60d70 -> Node0x5b5ee20[constraint=false];
+ Node0x5b60d70 -> Node0x5b60e10;
+ Node0x5b60e10 [shape=record,label="{for.end: \l %.lcssa = phi %struct._IO_FILE* [ %5, %for.inc ]\l %fputc = tail call i32 @fputc(i32 10, %struct._IO_FILE* %.lcssa)\l %indvars.iv.next7 = add nuw nsw i64 %indvars.iv6, 1\l %exitcond8 = icmp ne i64 %indvars.iv.next7, 1536\l br i1 %exitcond8, label %for.cond1.preheader, label %for.end12\l}"];
+ Node0x5b60e10 -> Node0x5b5ee70[constraint=false];
+ Node0x5b60e10 -> Node0x5b60e70;
+ Node0x5b60e70 [shape=record,label="{for.end12: \l ret void\l}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5b349a0 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5b5c2c0 {
+ label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
+ style = solid;
+ color = 6
+ subgraph cluster_0x5b5c240 {
+ label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
+ style = solid;
+ color = 5
+ subgraph cluster_0x5b34a20 {
+ label = "Region can not profitably be optimized!";
+ style = solid;
+ color = 7
+ Node0x5b5ee20;
+ Node0x5b60d10;
+ }
+ Node0x5b60d70;
+ }
+ Node0x5b5ee70;
+ Node0x5b60e10;
+ }
+ Node0x5b5ee00;
+ Node0x5b5ee50;
+ Node0x5b60e70;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scops.print_array.dot.png b/polly/docs/experiments/matmul/scops.print_array.dot.png
new file mode 100644
index 000000000000..e3b973b131ab
--- /dev/null
+++ b/polly/docs/experiments/matmul/scops.print_array.dot.png
Binary files differ
diff --git a/polly/docs/experiments/matmul/scopsonly.init_array.dot b/polly/docs/experiments/matmul/scopsonly.init_array.dot
new file mode 100644
index 000000000000..3d2092b21c93
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot
@@ -0,0 +1,39 @@
+digraph "Scop Graph for 'init_array' function" {
+ label="Scop Graph for 'init_array' function";
+
+ Node0x5ae2570 [shape=record,label="{entry}"];
+ Node0x5ae2570 -> Node0x5ae4e90;
+ Node0x5ae4e90 [shape=record,label="{entry.split}"];
+ Node0x5ae4e90 -> Node0x5ae4f50;
+ Node0x5ae4f50 [shape=record,label="{for.cond1.preheader}"];
+ Node0x5ae4f50 -> Node0x5ae50e0;
+ Node0x5ae50e0 [shape=record,label="{for.body3}"];
+ Node0x5ae50e0 -> Node0x5ae50e0[constraint=false];
+ Node0x5ae50e0 -> Node0x5ae5100;
+ Node0x5ae5100 [shape=record,label="{for.inc17}"];
+ Node0x5ae5100 -> Node0x5ae4f50[constraint=false];
+ Node0x5ae5100 -> Node0x5ae4ff0;
+ Node0x5ae4ff0 [shape=record,label="{for.end19}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5ad2dd0 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5ad2f50 {
+ label = "Region can not profitably be optimized!";
+ style = solid;
+ color = 6
+ subgraph cluster_0x5ad30d0 {
+ label = "";
+ style = solid;
+ color = 5
+ Node0x5ae50e0;
+ }
+ Node0x5ae4f50;
+ Node0x5ae5100;
+ }
+ Node0x5ae2570;
+ Node0x5ae4e90;
+ Node0x5ae4ff0;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scopsonly.init_array.dot.png b/polly/docs/experiments/matmul/scopsonly.init_array.dot.png
new file mode 100644
index 000000000000..f101d4d30815
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.init_array.dot.png
Binary files differ
diff --git a/polly/docs/experiments/matmul/scopsonly.main.dot b/polly/docs/experiments/matmul/scopsonly.main.dot
new file mode 100644
index 000000000000..c2d60c7ded64
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.main.dot
@@ -0,0 +1,50 @@
+digraph "Scop Graph for 'main' function" {
+ label="Scop Graph for 'main' function";
+
+ Node0x5abfcf0 [shape=record,label="{entry}"];
+ Node0x5abfcf0 -> Node0x5ade060;
+ Node0x5ade060 [shape=record,label="{entry.split}"];
+ Node0x5ade060 -> Node0x5ade0e0;
+ Node0x5ade0e0 [shape=record,label="{for.cond1.preheader}"];
+ Node0x5ade0e0 -> Node0x5ade100;
+ Node0x5ade100 [shape=record,label="{for.body3}"];
+ Node0x5ade100 -> Node0x5ae0020;
+ Node0x5ae0020 [shape=record,label="{for.body8}"];
+ Node0x5ae0020 -> Node0x5ae0020[constraint=false];
+ Node0x5ae0020 -> Node0x5ae0080;
+ Node0x5ae0080 [shape=record,label="{for.inc25}"];
+ Node0x5ae0080 -> Node0x5ade100[constraint=false];
+ Node0x5ae0080 -> Node0x5adfef0;
+ Node0x5adfef0 [shape=record,label="{for.inc28}"];
+ Node0x5adfef0 -> Node0x5ade0e0[constraint=false];
+ Node0x5adfef0 -> Node0x5adff50;
+ Node0x5adff50 [shape=record,label="{for.end30}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5ad2c80 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5ad2e50 {
+ label = "";
+ style = filled;
+ color = 3 subgraph cluster_0x5ad2d00 {
+ label = "";
+ style = solid;
+ color = 5
+ subgraph cluster_0x5ad2dd0 {
+ label = "";
+ style = solid;
+ color = 7
+ Node0x5ae0020;
+ }
+ Node0x5ade100;
+ Node0x5ae0080;
+ }
+ Node0x5ade0e0;
+ Node0x5adfef0;
+ }
+ Node0x5abfcf0;
+ Node0x5ade060;
+ Node0x5adff50;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scopsonly.main.dot.png b/polly/docs/experiments/matmul/scopsonly.main.dot.png
new file mode 100644
index 000000000000..32634243888d
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.main.dot.png
Binary files differ
diff --git a/polly/docs/experiments/matmul/scopsonly.print_array.dot b/polly/docs/experiments/matmul/scopsonly.print_array.dot
new file mode 100644
index 000000000000..0f7de45e8772
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot
@@ -0,0 +1,51 @@
+digraph "Scop Graph for 'print_array' function" {
+ label="Scop Graph for 'print_array' function";
+
+ Node0x5ae5e30 [shape=record,label="{entry}"];
+ Node0x5ae5e30 -> Node0x5ae5f50;
+ Node0x5ae5f50 [shape=record,label="{entry.split}"];
+ Node0x5ae5f50 -> Node0x5ae7d90;
+ Node0x5ae7d90 [shape=record,label="{for.cond1.preheader}"];
+ Node0x5ae7d90 -> Node0x5ae7f20;
+ Node0x5ae7f20 [shape=record,label="{for.body3}"];
+ Node0x5ae7f20 -> Node0x5ae7f40;
+ Node0x5ae7f20 -> Node0x5ae7f60;
+ Node0x5ae7f40 [shape=record,label="{if.then}"];
+ Node0x5ae7f40 -> Node0x5ae7f60;
+ Node0x5ae7f60 [shape=record,label="{for.inc}"];
+ Node0x5ae7f60 -> Node0x5ae7f20[constraint=false];
+ Node0x5ae7f60 -> Node0x5ae7e30;
+ Node0x5ae7e30 [shape=record,label="{for.end}"];
+ Node0x5ae7e30 -> Node0x5ae7d90[constraint=false];
+ Node0x5ae7e30 -> Node0x5ae8110;
+ Node0x5ae8110 [shape=record,label="{for.end12}"];
+ colorscheme = "paired12"
+ subgraph cluster_0x5abb9a0 {
+ label = "";
+ style = solid;
+ color = 1
+ subgraph cluster_0x5ae32c0 {
+ label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
+ style = solid;
+ color = 6
+ subgraph cluster_0x5ae3240 {
+ label = "Call instruction: %call = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %1, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), double %conv) #2";
+ style = solid;
+ color = 5
+ subgraph cluster_0x5abba20 {
+ label = "Region can not profitably be optimized!";
+ style = solid;
+ color = 7
+ Node0x5ae7f20;
+ Node0x5ae7f40;
+ }
+ Node0x5ae7f60;
+ }
+ Node0x5ae7d90;
+ Node0x5ae7e30;
+ }
+ Node0x5ae5e30;
+ Node0x5ae5f50;
+ Node0x5ae8110;
+ }
+}
diff --git a/polly/docs/experiments/matmul/scopsonly.print_array.dot.png b/polly/docs/experiments/matmul/scopsonly.print_array.dot.png
new file mode 100644
index 000000000000..b0d4b45aace4
--- /dev/null
+++ b/polly/docs/experiments/matmul/scopsonly.print_array.dot.png
Binary files differ