diff options
author | dorit <dorit@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-08-19 12:02:48 +0000 |
---|---|---|
committer | dorit <dorit@138bc75d-0d04-0410-961f-82ee72b054a4> | 2007-08-19 12:02:48 +0000 |
commit | b0eb8c663b1bca6c460b0a6754fd8c49ca018266 (patch) | |
tree | 25bb935b47a86dcee54460eecb1a8c69809ec5da | |
parent | 221e9a92bd54d3f572f14697a066205ee80ec187 (diff) | |
download | gcc-b0eb8c663b1bca6c460b0a6754fd8c49ca018266.tar.gz |
* tree-data-refs.c (split_constant_offset): Expose.
* tree-data-refs.h (split_constant_offset): Add declaration.
* tree-vectorizer.h (dr_alignment_support): Renamed
dr_unaligned_software_pipeline to dr_explicit_realign_optimized.
Added a new value dr_explicit_realign.
(_stmt_vec_info): Added new fields: dr_base_address, dr_init,
dr_offset, dr_step, and dr_aligned_to, along with new access
functions for these fields: STMT_VINFO_DR_BASE_ADDRESS,
STMT_VINFO_DR_INIT, STMT_VINFO_DR_OFFSET, STMT_VINFO_DR_STEP, and
STMT_VINFO_DR_ALIGNED_TO.
* tree-vectorizer.c (vect_supportable_dr_alignment): Add
documentation.
In case of outer-loop vectorization with non-fixed misalignment - use
the dr_explicit_realign scheme instead of the optimized realignment
scheme.
(new_stmt_vec_info): Initialize new fields.
* tree-vect-analyze.c (vect_compute_data_ref_alignment): Handle the
'nested_in_vect_loop' case. Change verbosity level.
(vect_analyze_data_ref_access): Handle the 'nested_in_vect_loop' case.
Don't fail on zero step in the outer-loop for loads.
(vect_analyze_data_refs): Call split_constant_offset to calculate base,
offset and init relative to the outer-loop.
* tree-vect-transform.c (vect_create_data_ref_ptr): Replace the unused
BSI function argument with a new function argument - at_loop.
Simplify the condition that determines STEP. Takes additional argument
INV_P. Support outer-loop vectorization (handle the nested_in_vect_loop
case), including zero step in the outer-loop. Call
vect_create_addr_base_for_vector_ref with additional argument.
(vect_create_addr_base_for_vector_ref): Takes additional argument LOOP.
Updated function documentation. Handle the 'nested_in_vect_loop' case.
Fixed and simplified calculation of step.
(vectorizable_store): Call vect_create_data_ref_ptr with loop instead
of bsi, and with additional argument. Call bump_vector_ptr with
additional argument. Fix typos. Handle the 'nested_in_vect_loop' case.
(vect_setup_realignment): Takes additional arguments INIT_ADDR and
DR_ALIGNMENT_SUPPORT. Returns another value AT_LOOP. Handle the case
when the realignment setup needs to take place inside the loop. Support
the dr_explicit_realign scheme. Allow generating the optimized
realignment scheme for outer-loop vectorization. Added documentation.
(vectorizable_load): Support the dr_explicit_realign scheme. Handle the
'nested_in_vect_loop' case, including loads that are invariant in the
outer-loop and the realignment schemes. Handle the case when the
realignment setup needs to take place inside the loop. Call
vect_setup_realignment with additional arguments. Call
vect_create_data_ref_ptr with additional argument and with loop instead
of bsi. Fix 80-column overflow. Fix typos. Rename PHI_STMT to PHI.
(vect_gen_niters_for_prolog_loop): Call
vect_create_addr_base_for_vector_ref with additional arguments.
(vect_create_cond_for_align_checks): Likewise.
(bump_vector_ptr): Updated to support the new dr_explicit_realign
scheme: takes additional argument bump; argument ptr_incr is now
optional; updated documentation.
(vect_init_vector): Takes additional argument (bsi). Use it, if
available, to insert the vector initialization.
(get_initial_def_for_induction): Pass additional argument in call to
vect_init_vector.
(vect_get_vec_def_for_operand): Likewise.
(vect_setup_realignment): Likewise.
(vectorizable_load): Likewise.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@127624 138bc75d-0d04-0410-961f-82ee72b054a4
40 files changed, 2499 insertions, 212 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9470bc3fd99..057037b6d5e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,71 @@ 2007-08-19 Dorit Nuzman <dorit@il.ibm.com> + * tree-data-refs.c (split_constant_offset): Expose. + * tree-data-refs.h (split_constant_offset): Add declaration. + + * tree-vectorizer.h (dr_alignment_support): Renamed + dr_unaligned_software_pipeline to dr_explicit_realign_optimized. + Added a new value dr_explicit_realign. + (_stmt_vec_info): Added new fields: dr_base_address, dr_init, + dr_offset, dr_step, and dr_aligned_to, along with new access + functions for these fields: STMT_VINFO_DR_BASE_ADDRESS, + STMT_VINFO_DR_INIT, STMT_VINFO_DR_OFFSET, STMT_VINFO_DR_STEP, and + STMT_VINFO_DR_ALIGNED_TO. + + * tree-vectorizer.c (vect_supportable_dr_alignment): Add + documentation. + In case of outer-loop vectorization with non-fixed misalignment - use + the dr_explicit_realign scheme instead of the optimized realignment + scheme. + (new_stmt_vec_info): Initialize new fields. + + * tree-vect-analyze.c (vect_compute_data_ref_alignment): Handle the + 'nested_in_vect_loop' case. Change verbosity level. + (vect_analyze_data_ref_access): Handle the 'nested_in_vect_loop' case. + Don't fail on zero step in the outer-loop for loads. + (vect_analyze_data_refs): Call split_constant_offset to calculate base, + offset and init relative to the outer-loop. + + * tree-vect-transform.c (vect_create_data_ref_ptr): Replace the unused + BSI function argument with a new function argument - at_loop. + Simplify the condition that determines STEP. Takes additional argument + INV_P. Support outer-loop vectorization (handle the nested_in_vect_loop + case), including zero step in the outer-loop. Call + vect_create_addr_base_for_vector_ref with additional argument. + (vect_create_addr_base_for_vector_ref): Takes additional argument LOOP. + Updated function documentation. Handle the 'nested_in_vect_loop' case. + Fixed and simplified calculation of step. 
+ (vectorizable_store): Call vect_create_data_ref_ptr with loop instead + of bsi, and with additional argument. Call bump_vector_ptr with + additional argument. Fix typos. Handle the 'nested_in_vect_loop' case. + (vect_setup_realignment): Takes additional arguments INIT_ADDR and + DR_ALIGNMENT_SUPPORT. Returns another value AT_LOOP. Handle the case + when the realignment setup needs to take place inside the loop. Support + the dr_explicit_realign scheme. Allow generating the optimized + realignment scheme for outer-loop vectorization. Added documentation. + (vectorizable_load): Support the dr_explicit_realign scheme. Handle the + 'nested_in_vect_loop' case, including loads that are invariant in the + outer-loop and the realignment schemes. Handle the case when the + realignment setup needs to take place inside the loop. Call + vect_setup_realignment with additional arguments. Call + vect_create_data_ref_ptr with additional argument and with loop instead + of bsi. Fix 80-column overflow. Fix typos. Rename PHI_STMT to PHI. + (vect_gen_niters_for_prolog_loop): Call + vect_create_addr_base_for_vector_ref with additional arguments. + (vect_create_cond_for_align_checks): Likewise. + (bump_vector_ptr): Updated to support the new dr_explicit_realign + scheme: takes additional argument bump; argument ptr_incr is now + optional; updated documentation. + (vect_init_vector): Takes additional argument (bsi). Use it, if + available, to insert the vector initialization. + (get_initial_def_for_induction): Pass additional argument in call to + vect_init_vector. + (vect_get_vec_def_for_operand): Likewise. + (vect_setup_realignment): Likewise. + (vectorizable_load): Likewise. + +2007-08-19 Dorit Nuzman <dorit@il.ibm.com> + * tree-vectorizer.h (vect_is_simple_reduction): Takes a loop_vec_info as argument instead of struct loop. (nested_in_vect_loop_p): New function. 
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 1a34a13b709..0ef6f864005 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,40 @@ 2007-08-19 Dorit Nuzman <dorit@il.ibm.com> + * gcc.dg/vect/vect-117.c: Change inner-loop bound to + unknown (so that outer-loop won't get analyzed). + * gcc.dg/vect/vect-outer-1a.c: New test. + * gcc.dg/vect/vect-outer-1b.c: New test. + * gcc.dg/vect/vect-outer-1.c: New test. + * gcc.dg/vect/vect-outer-2a.c: New test. + * gcc.dg/vect/vect-outer-2b.c: New test. + * gcc.dg/vect/vect-outer-2c.c: New test. + * gcc.dg/vect/vect-outer-2.c: New test. + * gcc.dg/vect/vect-outer-3a.c: New test. + * gcc.dg/vect/vect-outer-3b.c: New test. + * gcc.dg/vect/vect-outer-3c.c: New test. + * gcc.dg/vect/vect-outer-3.c: New test. + * gcc.dg/vect/vect-outer-4a.c: New test. + * gcc.dg/vect/vect-outer-4b.c: New test. + * gcc.dg/vect/vect-outer-4c.c: New test. + * gcc.dg/vect/vect-outer-4d.c: New test. + * gcc.dg/vect/vect-outer-4e.c: New test. + * gcc.dg/vect/vect-outer-4f.c: New test. + * gcc.dg/vect/vect-outer-4g.c: New test. + * gcc.dg/vect/no-section-anchors-vect-outer-4h.c: New test. + * gcc.dg/vect/vect-outer-4i.c: New test. + * gcc.dg/vect/vect-outer-4j.c: New test. + * gcc.dg/vect/vect-outer-4k.c: New test. + * gcc.dg/vect/vect-outer-4l.c: New test. + * gcc.dg/vect/vect-outer-4m.c: New test. + * gcc.dg/vect/vect-outer-4.c: New test. + * gcc.dg/vect/vect-outer-5.c: New test. + * gcc.dg/vect/vect-outer-6.c: New test. + * gcc.dg/vect/vect-outer-fir.c: New test. + * gcc.dg/vect/vect-outer-fir-lb.c: New test. + * gcc.dg/vect/costmodel/ppc/costmodel-vect-outer-fir.c: New test. + +2007-08-19 Dorit Nuzman <dorit@il.ibm.com> + * gcc.dg/vect/vect.exp: Compile tests with -fno-tree-scev-cprop and -fno-tree-reassoc. * gcc.dg/vect/no-tree-scev-cprop-vect-iv-1.c: Moved to... 
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-outer-fir.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-outer-fir.c new file mode 100644 index 00000000000..97163e6c46b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-outer-fir.c @@ -0,0 +1,75 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "../../tree-vect.h" + +#define N 40 +#define M 128 +float in[N+M]; +float coeff[M]; +float out[N]; +float fir_out[N]; + +/* Should be vectorized. Fixed misalignment in the inner-loop. */ +/* Currently not vectorized because we get too many BBs in the inner-loop, + because the compiler doesn't realize that the inner-loop executes at + least once (cause k<4), and so there's no need to create a guard code + to skip the inner-loop in case it doesn't execute. */ +void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + for (j = k; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i] += diff; + } + } + +/* Vectorized. Changing misalignment in the inner-loop. 
*/ +void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } +} + + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c new file mode 100644 index 00000000000..ab923814d32 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/no-section-anchors-vect-outer-4h.c @@ -0,0 +1,47 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + + +#define N 40 +#define M 128 +unsigned short a[M][N]; +unsigned int out[N]; + +/* Outer-loop vectorization. 
*/ + +void +foo (){ + int i,j; + unsigned int diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j++) { + a[j][i] = 4; + } + out[i]=5; + } +} + +int main (void) +{ + int i, j; + check_vect (); + + foo (); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j++) { + if (a[j][i] != 4) + abort (); + } + if (out[i] != 5) + abort (); + } + + return 0; +} + + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-117.c b/gcc/testsuite/gcc.dg/vect/vect-117.c index b9ad93c55e9..2180e22ac7c 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-117.c +++ b/gcc/testsuite/gcc.dg/vect/vect-117.c @@ -20,7 +20,7 @@ static int c[N][N] = {{ 1, 2, 3, 4, 5}, volatile int foo; -int main1 (int A[N][N]) +int main1 (int A[N][N], int n) { int i,j; @@ -28,7 +28,7 @@ int main1 (int A[N][N]) /* vectorizable */ for (i = 1; i < N; i++) { - for (j = 0; j < N; j++) + for (j = 0; j < n; j++) { A[i][j] = A[i-1][j] + A[i][j]; } @@ -42,7 +42,7 @@ int main (void) int i,j; foo = 0; - main1 (a); + main1 (a, N); /* check results: */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-1.c b/gcc/testsuite/gcc.dg/vect/vect-outer-1.c new file mode 100644 index 00000000000..79a2ba41a36 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#define N 40 +signed short image[N][N] __attribute__ ((__aligned__(16))); +signed short block[N][N] __attribute__ ((__aligned__(16))); +signed short out[N] __attribute__ ((__aligned__(16))); + +/* Can't do outer-loop vectorization because of non-consecutive access. 
*/ + +void +foo (){ + int i,j; + int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=8) { + diff += (image[i][j] - block[i][j]); + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-1a.c b/gcc/testsuite/gcc.dg/vect/vect-outer-1a.c new file mode 100644 index 00000000000..2d6eb06d0c8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-1a.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ + +#define N 40 +signed short image[N][N] __attribute__ ((__aligned__(16))); +signed short block[N][N] __attribute__ ((__aligned__(16))); + +/* Can't do outer-loop vectorization because of non-consecutive access. + Currently fails to vectorize because the reduction pattern is not + recognized. */ + +int +foo (){ + int i,j; + int diff = 0; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j+=8) { + diff += (image[i][j] - block[i][j]); + } + } + return diff; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* FORNOW */ +/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "unexpected pattern" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-1b.c b/gcc/testsuite/gcc.dg/vect/vect-outer-1b.c new file mode 100644 index 00000000000..e093d0ea365 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-1b.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#define N 40 +signed short image[N][N]; +signed short block[N][N]; +signed short out[N]; + +/* Outer-loop cannot get vectorized because of non-consecutive access. 
*/ + +void +foo (){ + int i,j; + int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=4) { + diff += (image[i][j] - block[i][j]); + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "strided access in outer loop" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-2.c b/gcc/testsuite/gcc.dg/vect/vect-outer-2.c new file mode 100644 index 00000000000..caf38d7ef61 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-2.c @@ -0,0 +1,40 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N] __attribute__ ((__aligned__(16))); +float out[N]; + +/* Outer-loop vectorization. */ + +void +foo (){ + int i,j; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[j][i] = j+i; + } + } +} + +int main (void) +{ + check_vect (); + int i, j; + + foo (); + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + if (image[j][i] != j+i) + abort (); + } + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-2a.c b/gcc/testsuite/gcc.dg/vect/vect-outer-2a.c new file mode 100644 index 00000000000..4925c0826ce --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-2a.c @@ -0,0 +1,41 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N][N] __attribute__ ((__aligned__(16))); + +void +foo (){ + int i,j,k; + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[k][j][i] = j+i+k; + } + } + } +} + +int main (void) +{ + check_vect (); + int i, j, k; + + foo (); + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + if 
(image[k][j][i] != j+i+k) + abort (); + } + } + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-2b.c b/gcc/testsuite/gcc.dg/vect/vect-outer-2b.c new file mode 100644 index 00000000000..65c64fc7524 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-2b.c @@ -0,0 +1,41 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[2*N][N][N] __attribute__ ((__aligned__(16))); + +void +foo (){ + int i,j,k; + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[k+i][j][i] = j+i+k; + } + } + } +} + +int main (void) +{ + check_vect (); + int i, j, k; + + foo (); + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + if (image[k+i][j][i] != j+i+k) + abort (); + } + } + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "strided access in outer loop." 
1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-2c.c b/gcc/testsuite/gcc.dg/vect/vect-outer-2c.c new file mode 100644 index 00000000000..c7242590667 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-2c.c @@ -0,0 +1,41 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[2*N][2*N][N] __attribute__ ((__aligned__(16))); + +void +foo (){ + int i,j,k; + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j+=2) { + image[k][j][i] = j+i+k; + } + } + } +} + +int main (void) +{ + check_vect (); + int i, j, k; + + foo (); + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < N; j+=2) { + if (image[k][j][i] != j+i+k) + abort (); + } + } + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-2d.c b/gcc/testsuite/gcc.dg/vect/vect-outer-2d.c new file mode 100644 index 00000000000..f6f9c359096 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-2d.c @@ -0,0 +1,41 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N][N+1] __attribute__ ((__aligned__(16))); + +void +foo (){ + int i,j,k; + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < i+1; j++) { + image[k][j][i] = j+i+k; + } + } + } +} + +int main (void) +{ + check_vect (); + int i, j, k; + + foo (); + + for (k=0; k<N; k++) { + for (i = 0; i < N; i++) { + for (j = 0; j < i+1; j++) { + if (image[k][j][i] != j+i+k) + abort (); + } + } + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-3.c b/gcc/testsuite/gcc.dg/vect/vect-outer-3.c new file mode 100644 
index 00000000000..6f41b826e4e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-3.c @@ -0,0 +1,52 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N] __attribute__ ((__aligned__(16))); +float out[N]; + +/* Outer-loop vectorization. */ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-3a.c b/gcc/testsuite/gcc.dg/vect/vect-outer-3a.c new file mode 100644 index 00000000000..a5fd23df51e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-3a.c @@ -0,0 +1,53 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N+1] __attribute__ ((__aligned__(16))); +float out[N]; + +/* Outer-loop vectorization with misaligned accesses in the inner-loop. 
*/ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { scan-tree-dump-times "step doesn't divide the vector-size" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-3b.c b/gcc/testsuite/gcc.dg/vect/vect-outer-3b.c new file mode 100644 index 00000000000..67edb603245 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-3b.c @@ -0,0 +1,53 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N] __attribute__ ((__aligned__(16))); +float out[N]; + +/* Outer-loop vectorization with non-consecutive access. Not vectorized yet. 
*/ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N/2; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][2*i]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N/2; i++) { + diff = 0; + for (j = 0; j < N; j++) { + diff += image[j][2*i]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "strided access in outer loop" 2 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-3c.c b/gcc/testsuite/gcc.dg/vect/vect-outer-3c.c new file mode 100644 index 00000000000..1e28777c165 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-3c.c @@ -0,0 +1,52 @@ +/* { dg-require-effective-target vect_float } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +float image[N][N+1] __attribute__ ((__aligned__(16))); +float out[N]; + +/* Outer-loop vectorization. 
*/ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=4) { + diff += image[j][i]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + image[i][j]=i+j; + } + } + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < N; j+=4) { + diff += image[j][i]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4.c new file mode 100644 index 00000000000..3b8f911cb73 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4.c @@ -0,0 +1,55 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +float in[N+M]; +float coeff[M]; +float out[N]; + +/* Outer-loop vectorization. */ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "zero step in outer loop." 
1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4a.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4a.c new file mode 100644 index 00000000000..8fd1a03db14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4a.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +signed short in[N+M]; +signed short coeff[M]; +signed short out[N]; + +/* Outer-loop vectorization. + Currently not vectorized because of multiple-data-types in the inner-loop. */ + +void +foo (){ + int i,j; + int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]*coeff[j]; + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* FORNOW. not vectorized until we support 0-stride accesses like coeff[j]. should be: + { scan-tree-dump-not "multiple types in nested loop." "vect" { xfail *-*-* } } } */ + +/* { dg-final { scan-tree-dump-times "zero step in outer loop." 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4b.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4b.c new file mode 100644 index 00000000000..ba2f7b4d0df --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4b.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +signed short in[N+M]; +signed short coeff[M]; +int out[N]; + +/* Outer-loop vectorization. + Currently not vectorized because of multiple-data-types in the inner-loop. */ + +void +foo (){ + int i,j; + int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]*coeff[j]; + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* FORNOW. not vectorized until we support 0-stride accesses like coeff[j]. should be: + { scan-tree-dump-not "multiple types in nested loop." 
"vect" { xfail *-*-* } } } */ + +/* { dg-final { scan-tree-dump-times "zero step in outer loop." 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4c.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4c.c new file mode 100644 index 00000000000..eb6b30f41ed --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4c.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned short coeff[M]; +unsigned int out[N]; + +/* Outer-loop vectorization. */ + +void +foo (){ + int i,j; + unsigned short diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]*coeff[j]; + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { target vect_short_mult } } } */ +/* { dg-final { scan-tree-dump-times "zero step in outer loop." 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4d.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4d.c new file mode 100644 index 00000000000..444a332b38b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4d.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +float in[N+M]; +float out[N]; + +/* Outer-loop vectorization. 
*/ + +void +foo (){ + int i,j; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]; + } + out[i]=diff; + } +} + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < N; i++) + in[i] = i; + + foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=4) { + diff += in[j+i]; + } + if (out[i] != diff) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c new file mode 100644 index 00000000000..243cc1af7ea --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4e.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +unsigned int in[N+M]; +unsigned short out[N]; + +/* Outer-loop vectorization. */ + +void +foo (){ + int i,j; + unsigned int diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + out[i]=(unsigned short)diff; + } + + return; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c new file mode 100644 index 00000000000..ebd0ef38f70 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4f.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned int out[N]; +unsigned char arr[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned int +foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; +} + +unsigned int +bar (int i, unsigned int diff, unsigned short *in) +{ + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; +} + +int main (void) +{ + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c new file mode 100644 index 00000000000..ebd0ef38f70 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4g.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned int out[N]; +unsigned char arr[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned int +foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; +} + +unsigned int +bar (int i, unsigned int diff, unsigned short *in) +{ + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; +} + +int main (void) +{ + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4i.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4i.c new file mode 100644 index 00000000000..bc43c5bc6d5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4i.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +unsigned char in[N+M]; +unsigned short out[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned short +foo (){ + int i,j; + unsigned short diff; + unsigned short s=0; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4j.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4j.c new file mode 100644 index 00000000000..7e1b7ec81ee --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4j.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#define N 40 +#define M 128 +unsigned char in[N+M]; +unsigned short out[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. */ + +void +foo (){ + int i,j; + unsigned short diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + out[i]=diff; + } +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c new file mode 100644 index 00000000000..ebd0ef38f70 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4k.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned int out[N]; +unsigned char arr[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned int +foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; +} + +unsigned int +bar (int i, unsigned int diff, unsigned short *in) +{ + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; +} + +int main (void) +{ + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c new file mode 100644 index 00000000000..ebd0ef38f70 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4l.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned int out[N]; +unsigned char arr[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned int +foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=diff; + } + return s; +} + +unsigned int +bar (int i, unsigned int diff, unsigned short *in) +{ + int j; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + return diff; +} + +int main (void) +{ + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + arr[i] = 3; + diff = 0; + diff = bar (i, diff, in); + s += diff; + } + + if (s != sum) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: not allowed" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-4m.c b/gcc/testsuite/gcc.dg/vect/vect-outer-4m.c new file mode 100644 index 00000000000..f85ddbfd467 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-4m.c @@ -0,0 +1,58 @@ +/* { dg-require-effective-target vect_int } */ +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +unsigned short in[N+M]; +unsigned int out[N]; + +/* Outer-loop vectorization. */ +/* Not vectorized due to multiple-types in the inner-loop. 
*/ + +unsigned int +foo (){ + int i,j; + unsigned int diff; + unsigned int s=0; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s+=((unsigned short)diff>>3); + } + return s; +} + +int main (void) +{ + int i, j; + unsigned int diff; + unsigned int s=0,sum=0; + + check_vect (); + + for (i = 0; i < N+M; i++) { + in[i] = i; + } + + sum=foo (); + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j+=8) { + diff += in[j+i]; + } + s += ((unsigned short)diff>>3); + } + + if (s != sum) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-5.c b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c new file mode 100644 index 00000000000..f998cb21b7c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-5.c @@ -0,0 +1,83 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <signal.h> +#include "tree-vect.h" + +#define N 64 +#define MAX 42 + +extern void abort(void); + +int main1 () +{ + float A[N] __attribute__ ((__aligned__(16))); + float B[N] __attribute__ ((__aligned__(16))); + float C[N] __attribute__ ((__aligned__(16))); + float D[N] __attribute__ ((__aligned__(16))); + float s; + + int i, j; + + for (i = 0; i < N; i++) + { + A[i] = i; + B[i] = i; + C[i] = i; + D[i] = i; + } + + /* Outer-loop 1: Vectorizable with respect to dependence distance. */ + for (i = 0; i < N-20; i++) + { + s = 0; + for (j=0; j<N; j+=4) + s += C[j]; + A[i] = A[i+20] + s; + } + + /* check results: */ + for (i = 0; i < N-20; i++) + { + s = 0; + for (j=0; j<N; j+=4) + s += C[j]; + if (A[i] != D[i+20] + s) + abort (); + } + + /* Outer-loop 2: Not vectorizable because of dependence distance. 
*/ + for (i = 0; i < 4; i++) + { + s = 0; + for (j=0; j<N; j+=4) + s += C[j]; + B[i] = B[i+3] + s; + } + + /* check results: */ + for (i = 0; i < 4; i++) + { + s = 0; + for (j=0; j<N; j+=4) + s += C[j]; + if (B[i] != D[i+3] + s) + abort (); + } + + return 0; +} + +int main () +{ + check_vect (); + return main1(); +} + +/* NOTE: We temporarily xfail the following check until versioning for + aliasing is fixed to avoid versioning when the dependence distance + is known. */ +/* { dg-final { scan-tree-dump-times "not vectorized: possible dependence between data-refs" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "zero step in outer loop." 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-6.c b/gcc/testsuite/gcc.dg/vect/vect-outer-6.c new file mode 100644 index 00000000000..f4adfaa06fc --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-6.c @@ -0,0 +1,65 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <signal.h> +#include "tree-vect.h" + +#define N 64 +#define MAX 42 + +float A[N] __attribute__ ((__aligned__(16))); +float B[N] __attribute__ ((__aligned__(16))); +float C[N] __attribute__ ((__aligned__(16))); +float D[N] __attribute__ ((__aligned__(16))); +extern void abort(void); + +int main1 () +{ + float s; + + int i, j; + + for (i = 0; i < 8; i++) + { + s = 0; + for (j=0; j<8; j+=4) + s += C[j]; + A[i] = s; + } + + return 0; +} + +int main () +{ + int i,j; + float s; + + check_vect (); + + for (i = 0; i < N; i++) + { + A[i] = i; + B[i] = i; + C[i] = i; + D[i] = i; + } + + main1(); + + /* check results: */ + for (i = 0; i < 8; i++) + { + s = 0; + for (j=0; j<8; j+=4) + s += C[j]; + if (A[i] != s) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "zero step in outer 
loop." 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-fir-lb.c b/gcc/testsuite/gcc.dg/vect/vect-outer-fir-lb.c new file mode 100644 index 00000000000..768b1043261 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-fir-lb.c @@ -0,0 +1,80 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 64 +float in[N+M]; +float coeff[M]; +float out[N]; +float fir_out[N]; + +/* Should be vectorized. Fixed misalignment in the inner-loop. */ +/* Currently not vectorized because the loop-count for the inner-loop + has a maybe_zero component. Will be fixed when we incorporate the + "cond_expr in rhs" patch. */ +void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + j = k; + + do { + diff += in[j+i]*coeff[j]; + j+=4; + } while (j < M); + + out[i] += diff; + } + } + +} + +/* Vectorized. Changing misalignment in the inner-loop. 
*/ +void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } +} + + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-fir.c b/gcc/testsuite/gcc.dg/vect/vect-outer-fir.c new file mode 100644 index 00000000000..0b03dc9a848 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-outer-fir.c @@ -0,0 +1,77 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 40 +#define M 128 +float in[N+M]; +float coeff[M]; +float out[N]; +float fir_out[N]; + +/* Should be vectorized. Fixed misalignment in the inner-loop. */ +/* Currently not vectorized because we get too many BBs in the inner-loop, + because the compiler doesn't realize that the inner-loop executes at + least once (because k<4), and so there's no need to create a guard code + to skip the inner-loop in case it doesn't execute. */ +void foo (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + out[i] = 0; + } + + for (k = 0; k < 4; k++) { + for (i = 0; i < N; i++) { + diff = 0; + for (j = k; j < M; j+=4) { + diff += in[j+i]*coeff[j]; + } + out[i] += diff; + } + } + +} + +/* Vectorized. Changing misalignment in the inner-loop. 
*/ +void fir (){ + int i,j,k; + float diff; + + for (i = 0; i < N; i++) { + diff = 0; + for (j = 0; j < M; j++) { + diff += in[j+i]*coeff[j]; + } + fir_out[i] = diff; + } +} + + +int main (void) +{ + check_vect (); + int i, j; + float diff; + + for (i = 0; i < M; i++) + coeff[i] = i; + for (i = 0; i < N+M; i++) + in[i] = i; + + foo (); + fir (); + + for (i = 0; i < N; i++) { + if (out[i] != fir_out[i]) + abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 2 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index 4991937d2ec..48a50ddfb10 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -489,7 +489,7 @@ dump_ddrs (FILE *file, VEC (ddr_p, heap) *ddrs) /* Expresses EXP as VAR + OFF, where off is a constant. The type of OFF will be ssizetype. */ -static void +void split_constant_offset (tree exp, tree *var, tree *off) { tree type = TREE_TYPE (exp), otype; diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index 99f8b23f138..2ae58266db6 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -388,4 +388,7 @@ index_in_loop_nest (int var, VEC (loop_p, heap) *loop_nest) /* In lambda-code.c */ bool lambda_transform_legal_p (lambda_trans_matrix, int, VEC (ddr_p, heap) *); +/* In tree-data-ref.c */ +void split_constant_offset (tree , tree *, tree *); + #endif /* GCC_TREE_DATA_REF_H */ diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index 5fb54621359..35e38d01471 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -1279,6 +1279,8 @@ vect_compute_data_ref_alignment (struct data_reference *dr) { tree stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree ref = 
DR_REF (dr); tree vectype; tree base, base_addr; @@ -1295,13 +1297,42 @@ vect_compute_data_ref_alignment (struct data_reference *dr) misalign = DR_INIT (dr); aligned_to = DR_ALIGNED_TO (dr); base_addr = DR_BASE_ADDRESS (dr); + + /* In case the dataref is in an inner-loop of the loop that is being + vectorized (LOOP), we use the base and misalignment information + relative to the outer-loop (LOOP). This is ok only if the misalignment + stays the same throughout the execution of the inner-loop, which is why + we have to check that the stride of the dataref in the inner-loop evenly + divides by the vector size. */ + if (nested_in_vect_loop_p (loop, stmt)) + { + tree step = DR_STEP (dr); + HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); + + if (dr_step % UNITS_PER_SIMD_WORD == 0) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "inner step divides the vector-size."); + misalign = STMT_VINFO_DR_INIT (stmt_info); + aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info); + base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info); + } + else + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "inner step doesn't divide the vector-size."); + misalign = NULL_TREE; + } + } + base = build_fold_indirect_ref (base_addr); vectype = STMT_VINFO_VECTYPE (stmt_info); alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT); - if (tree_int_cst_compare (aligned_to, alignment) < 0) + if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0) + || !misalign) { - if (vect_print_dump_info (REPORT_DETAILS)) + if (vect_print_dump_info (REPORT_ALIGNMENT)) { fprintf (vect_dump, "Unknown alignment for access: "); print_generic_expr (vect_dump, base, TDF_SLIM); @@ -1980,20 +2011,39 @@ static bool vect_analyze_data_ref_access (struct data_reference *dr) { tree step = DR_STEP (dr); - HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); tree scalar_type = TREE_TYPE (DR_REF (dr)); HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 
tree stmt = DR_STMT (dr); - /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the - interleaving group (including gaps). */ - HOST_WIDE_INT stride = dr_step / type_size; + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); + HOST_WIDE_INT stride; + + /* Don't allow invariant accesses. */ + if (dr_step == 0) + return false; - if (!step) + if (nested_in_vect_loop_p (loop, stmt)) { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "bad data-ref access"); - return false; + /* For the rest of the analysis we use the outer-loop step. */ + step = STMT_VINFO_DR_STEP (stmt_info); + dr_step = TREE_INT_CST_LOW (step); + + if (dr_step == 0) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "zero step in outer loop."); + if (DR_IS_READ (dr)) + return true; + else + return false; + } } + + /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the + interleaving group (including gaps). */ + stride = dr_step / type_size; /* Consecutive? */ if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))) @@ -2003,6 +2053,13 @@ vect_analyze_data_ref_access (struct data_reference *dr) return true; } + if (nested_in_vect_loop_p (loop, stmt)) + { + if (vect_print_dump_info (REPORT_ALIGNMENT)) + fprintf (vect_dump, "strided access in outer loop."); + return false; + } + /* Not consecutive access is possible only if it is a part of interleaving. 
*/ if (!DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt))) { @@ -2231,6 +2288,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo) tree stmt; stmt_vec_info stmt_info; basic_block bb; + tree base, offset, init; if (!dr || !DR_REF (dr)) { @@ -2238,36 +2296,13 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo) fprintf (vect_dump, "not vectorized: unhandled data-ref "); return false; } - - /* Update DR field in stmt_vec_info struct. */ + stmt = DR_STMT (dr); stmt_info = vinfo_for_stmt (stmt); - /* If outer-loop vectorization: we don't yet support datarefs - in the innermost loop. */ - bb = bb_for_stmt (stmt); - if (bb->loop_father != LOOP_VINFO_LOOP (loop_vinfo)) - { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) - fprintf (vect_dump, "not vectorized: data-ref in nested loop"); - return false; - } - - if (STMT_VINFO_DATA_REF (stmt_info)) - { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) - { - fprintf (vect_dump, - "not vectorized: more than one data ref in stmt: "); - print_generic_expr (vect_dump, stmt, TDF_SLIM); - } - return false; - } - STMT_VINFO_DATA_REF (stmt_info) = dr; - /* Check that analysis of the data-ref succeeded. */ if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) - || !DR_STEP (dr)) + || !DR_STEP (dr)) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) { @@ -2294,7 +2329,127 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo) } return false; } - + + base = unshare_expr (DR_BASE_ADDRESS (dr)); + offset = unshare_expr (DR_OFFSET (dr)); + init = unshare_expr (DR_INIT (dr)); + + /* Update DR field in stmt_vec_info struct. */ + bb = bb_for_stmt (stmt); + + /* If the dataref is in an inner-loop of the loop that is considered for + vectorization, we also want to analyze the access relative to + the outer-loop (DR contains information only relative to the + inner-most enclosing loop). We do that by building a reference to the + first location accessed by the inner-loop, and analyze it relative to + the outer-loop. 
*/ + if (nested_in_vect_loop_p (loop, stmt)) + { + tree outer_step, outer_base, outer_init; + HOST_WIDE_INT pbitsize, pbitpos; + tree poffset; + enum machine_mode pmode; + int punsignedp, pvolatilep; + affine_iv base_iv, offset_iv; + tree dinit; + + /* Build a reference to the first location accessed by the + inner-loop: *(BASE+INIT). (The first location is actually + BASE+INIT+OFFSET, but we add OFFSET separately later. */ + tree inner_base = build_fold_indirect_ref + (fold_build2 (PLUS_EXPR, TREE_TYPE (base), base, init)); + + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (dump_file, "analyze in outer-loop: "); + print_generic_expr (dump_file, inner_base, TDF_SLIM); + } + + outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, + &poffset, &pmode, &punsignedp, &pvolatilep, false); + gcc_assert (outer_base != NULL_TREE); + + if (pbitpos % BITS_PER_UNIT != 0) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (dump_file, "failed: bit offset alignment.\n"); + return false; + } + + outer_base = build_fold_addr_expr (outer_base); + if (!simple_iv (loop, stmt, outer_base, &base_iv, false)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (dump_file, "failed: evolution of base is not affine.\n"); + return false; + } + + if (offset) + { + if (poffset) + poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, poffset); + else + poffset = offset; + } + + if (!poffset) + { + offset_iv.base = ssize_int (0); + offset_iv.step = ssize_int (0); + } + else if (!simple_iv (loop, stmt, poffset, &offset_iv, false)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (dump_file, "evolution of offset is not affine.\n"); + return false; + } + + outer_init = ssize_int (pbitpos / BITS_PER_UNIT); + split_constant_offset (base_iv.base, &base_iv.base, &dinit); + outer_init = size_binop (PLUS_EXPR, outer_init, dinit); + split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); + outer_init = size_binop (PLUS_EXPR, outer_init, 
dinit); + + outer_step = size_binop (PLUS_EXPR, + fold_convert (ssizetype, base_iv.step), + fold_convert (ssizetype, offset_iv.step)); + + STMT_VINFO_DR_STEP (stmt_info) = outer_step; + /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ + STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; + STMT_VINFO_DR_INIT (stmt_info) = outer_init; + STMT_VINFO_DR_OFFSET (stmt_info) = + fold_convert (ssizetype, offset_iv.base); + STMT_VINFO_DR_ALIGNED_TO (stmt_info) = + size_int (highest_pow2_factor (offset_iv.base)); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\touter base_address: "); + print_generic_expr (dump_file, STMT_VINFO_DR_BASE_ADDRESS (stmt_info), TDF_SLIM); + fprintf (dump_file, "\n\touter offset from base address: "); + print_generic_expr (dump_file, STMT_VINFO_DR_OFFSET (stmt_info), TDF_SLIM); + fprintf (dump_file, "\n\touter constant offset from base address: "); + print_generic_expr (dump_file, STMT_VINFO_DR_INIT (stmt_info), TDF_SLIM); + fprintf (dump_file, "\n\touter step: "); + print_generic_expr (dump_file, STMT_VINFO_DR_STEP (stmt_info), TDF_SLIM); + fprintf (dump_file, "\n\touter aligned to: "); + print_generic_expr (dump_file, STMT_VINFO_DR_ALIGNED_TO (stmt_info), TDF_SLIM); + } + } + + if (STMT_VINFO_DATA_REF (stmt_info)) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + { + fprintf (vect_dump, + "not vectorized: more than one data ref in stmt: "); + print_generic_expr (vect_dump, stmt, TDF_SLIM); + } + return false; + } + STMT_VINFO_DATA_REF (stmt_info) = dr; + /* Set vectype for STMT. */ scalar_type = TREE_TYPE (DR_REF (dr)); STMT_VINFO_VECTYPE (stmt_info) = diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 6e88fa97634..7c5b1b20b4d 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -49,14 +49,14 @@ along with GCC; see the file COPYING3. 
If not see static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr - (tree, block_stmt_iterator *, tree, tree *, tree *, bool, tree); -static tree vect_create_addr_base_for_vector_ref (tree, tree *, tree); -static tree vect_setup_realignment (tree, block_stmt_iterator *, tree *); + (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *); +static tree vect_create_addr_base_for_vector_ref + (tree, tree *, tree, struct loop *); static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *); static tree vect_get_vec_def_for_operand (tree, tree, tree *); -static tree vect_init_vector (tree, tree, tree); +static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *); static void vect_finish_stmt_generation - (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); + (tree stmt, tree vec_stmt, block_stmt_iterator *); static bool vect_is_simple_cond (tree, loop_vec_info); static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); @@ -371,6 +371,8 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, enum machine_mode mode; tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1); int op_type = TREE_CODE_LENGTH (TREE_CODE (operation)); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); /* Cost of reduction op inside loop. */ STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; @@ -393,30 +395,33 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, We have a reduction operator that will reduce the vector in one statement. Also requires scalar extract. 
*/ - if (reduc_code < NUM_TREE_CODES) - outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; - else + if (!nested_in_vect_loop_p (loop, orig_stmt)) { - int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - tree bitsize = - TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0))); - int element_bitsize = tree_low_cst (bitsize, 1); - int nelements = vec_size_in_bits / element_bitsize; - - optab = optab_for_tree_code (code, vectype); - - /* We have a whole vector shift available. */ - if (VECTOR_MODE_P (mode) - && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing - && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) - /* Final reduction via vector shifts and the reduction operator. Also - requires scalar extract. */ - outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST - + TARG_VEC_TO_SCALAR_COST); - else - /* Use extracts and reduction op for final reduction. For N elements, - we have N extracts and N-1 reduction ops. */ - outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); + if (reduc_code < NUM_TREE_CODES) + outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; + else + { + int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + tree bitsize = + TYPE_SIZE (TREE_TYPE ( GIMPLE_STMT_OPERAND (orig_stmt, 0))); + int element_bitsize = tree_low_cst (bitsize, 1); + int nelements = vec_size_in_bits / element_bitsize; + + optab = optab_for_tree_code (code, vectype); + + /* We have a whole vector shift available. */ + if (VECTOR_MODE_P (mode) + && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing + && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) + /* Final reduction via vector shifts and the reduction operator. Also + requires scalar extract. */ + outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST + + TARG_VEC_TO_SCALAR_COST); + else + /* Use extracts and reduction op for final reduction. 
For N elements, + we have N extracts and N-1 reduction ops. */ + outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); + } } STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; @@ -609,7 +614,19 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies) break; } - case dr_unaligned_software_pipeline: + case dr_explicit_realign: + { + inner_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST); + + /* FIXME: If the misalignment remains fixed across the iterations of + the containing loop, the following cost should be added to the + outside costs. */ + if (targetm.vectorize.builtin_mask_for_load) + inner_cost += TARG_VEC_STMT_COST; + + break; + } + case dr_explicit_realign_optimized: { int outer_cost = 0; @@ -706,6 +723,19 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) STMT: The statement containing the data reference. NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. OFFSET: Optional. If supplied, it is be added to the initial address. + LOOP: Specify relative to which loop-nest should the address be computed. + For example, when the dataref is in an inner-loop nested in an + outer-loop that is now being vectorized, LOOP can be either the + outer-loop, or the inner-loop. The first memory location accessed + by the following dataref ('in' points to short): + + for (i=0; i<N; i++) + for (j=0; j<M; j++) + s += in[i+j] + + is as follows: + if LOOP=i_loop: &in (relative to i_loop) + if LOOP=j_loop: &in+i*2B (relative to j_loop) Output: 1. 
Return an SSA_NAME whose value is the address of the memory location of @@ -718,14 +748,15 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) static tree vect_create_addr_base_for_vector_ref (tree stmt, tree *new_stmt_list, - tree offset) + tree offset, + struct loop *loop) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree data_ref_base_expr = unshare_expr (DR_BASE_ADDRESS (dr)); - tree base_name = build_fold_indirect_ref (data_ref_base_expr); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); + tree base_name; tree data_ref_base_var; - tree data_ref_base; tree new_base_stmt; tree vec_stmt; tree addr_base, addr_expr; @@ -733,12 +764,26 @@ vect_create_addr_base_for_vector_ref (tree stmt, tree base_offset = unshare_expr (DR_OFFSET (dr)); tree init = unshare_expr (DR_INIT (dr)); tree vect_ptr_type, addr_expr2; - - + tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); + + gcc_assert (loop); + if (loop != containing_loop) + { + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + + gcc_assert (nested_in_vect_loop_p (loop, stmt)); + + data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); + base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); + init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); + } + /* Create data_ref_base */ - data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base_expr), "batmp"); + base_name = build_fold_indirect_ref (data_ref_base); + data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp"); add_referenced_var (data_ref_base_var); - data_ref_base = force_gimple_operand (data_ref_base_expr, &new_base_stmt, + data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt, true, data_ref_base_var); append_to_statement_list_force(new_base_stmt, new_stmt_list); 
@@ -753,16 +798,6 @@ vect_create_addr_base_for_vector_ref (tree stmt, if (offset) { tree tmp = create_tmp_var (sizetype, "offset"); - tree step; - - /* For interleaved access step we divide STEP by the size of the - interleaving group. */ - if (DR_GROUP_SIZE (stmt_info)) - step = fold_build2 (TRUNC_DIV_EXPR, TREE_TYPE (offset), DR_STEP (dr), - build_int_cst (TREE_TYPE (offset), - DR_GROUP_SIZE (stmt_info))); - else - step = DR_STEP (dr); add_referenced_var (tmp); offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step); @@ -773,8 +808,8 @@ vect_create_addr_base_for_vector_ref (tree stmt, } /* base + base_offset */ - addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base, - base_offset); + addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), + data_ref_base, base_offset); vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); @@ -811,7 +846,7 @@ vect_create_addr_base_for_vector_ref (tree stmt, 1. STMT: a stmt that references memory. Expected to be of the form GIMPLE_MODIFY_STMT <name, data-ref> or GIMPLE_MODIFY_STMT <data-ref, name>. - 2. BSI: block_stmt_iterator where new stmts can be added. + 2. AT_LOOP: the loop where the vector memref is to be created. 3. OFFSET (optional): an offset to be added to the initial address accessed by the data-ref in STMT. 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain @@ -838,18 +873,22 @@ vect_create_addr_base_for_vector_ref (tree stmt, Return the increment stmt that updates the pointer in PTR_INCR. - 3. Return the pointer. */ + 3. Set INV_P to true if the access pattern of the data reference in the + vectorized loop is invariant. Set it to false otherwise. + + 4. Return the pointer. 
*/ static tree -vect_create_data_ref_ptr (tree stmt, - block_stmt_iterator *bsi ATTRIBUTE_UNUSED, +vect_create_data_ref_ptr (tree stmt, struct loop *at_loop, tree offset, tree *initial_address, tree *ptr_incr, - bool only_init, tree type) + bool only_init, tree type, bool *inv_p) { tree base_name; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vect_ptr_type; tree vect_ptr; @@ -857,11 +896,31 @@ vect_create_data_ref_ptr (tree stmt, tree new_temp; tree vec_stmt; tree new_stmt_list = NULL_TREE; - edge pe = loop_preheader_edge (loop); + edge pe; basic_block new_bb; tree vect_ptr_init; struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + tree vptr; + block_stmt_iterator incr_bsi; + bool insert_after; + tree indx_before_incr, indx_after_incr; + tree incr; + tree step; + + /* Check the step (evolution) of the load in LOOP, and record + whether it's invariant. */ + if (nested_in_vect_loop) + step = STMT_VINFO_DR_STEP (stmt_info); + else + step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); + + if (tree_int_cst_compare (step, size_zero_node) == 0) + *inv_p = true; + else + *inv_p = false; + /* Create an expression for the first address accessed by this load + in LOOP. */ base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr))); if (vect_print_dump_info (REPORT_DETAILS)) @@ -904,12 +963,44 @@ vect_create_data_ref_ptr (tree stmt, var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); + /** Note: If the dataref is in an inner-loop nested in LOOP, and we are + vectorizing LOOP (i.e. outer-loop vectorization), we need to create two + def-use update cycles for the pointer: One relative to the outer-loop + (LOOP), which is what steps (3) and (4) below do. 
The other is relative + to the inner-loop (which is the inner-most loop containing the dataref), + and this is done by step (5) below. + + When vectorizing inner-most loops, the vectorized loop (LOOP) is also the + inner-most loop, and so steps (3),(4) work the same, and step (5) is + redundant. Steps (3),(4) create the following: + + vp0 = &base_addr; + LOOP: vp1 = phi(vp0,vp2) + ... + ... + vp2 = vp1 + step + goto LOOP + + If there is an inner-loop nested in loop, then step (5) will also be + applied, and an additional update in the inner-loop will be created: + + vp0 = &base_addr; + LOOP: vp1 = phi(vp0,vp2) + ... + inner: vp3 = phi(vp1,vp4) + vp4 = vp3 + inner_step + if () goto inner + ... + vp2 = vp1 + step + if () goto LOOP */ + /** (3) Calculate the initial address the vector-pointer, and set the vector-pointer to point to it before the loop: **/ /* Create: (&(base[init_val+offset]) in the loop preheader. */ + new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, - offset); + offset, loop); pe = loop_preheader_edge (loop); new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list); gcc_assert (!new_bb); @@ -924,25 +1015,31 @@ vect_create_data_ref_ptr (tree stmt, gcc_assert (!new_bb); - /** (4) Handle the updating of the vector-pointer inside the loop: **/ + /** (4) Handle the updating of the vector-pointer inside the loop. + This is needed when ONLY_INIT is false, and also when AT_LOOP + is the inner-loop nested in LOOP (during outer-loop vectorization). + **/ - if (only_init) /* No update in loop is required. */ + if (only_init && at_loop == loop) /* No update in loop is required. */ { /* Copy the points-to information if it exists. */ if (DR_PTR_INFO (dr)) duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); - return vect_ptr_init; + vptr = vect_ptr_init; } else { - block_stmt_iterator incr_bsi; - bool insert_after; - tree indx_before_incr, indx_after_incr; - tree incr; + /* The step of the vector pointer is the Vector Size. 
*/ + tree step = TYPE_SIZE_UNIT (vectype); + /* One exception to the above is when the scalar step of the load in + LOOP is zero. In this case the step here is also zero. */ + if (*inv_p) + step = size_zero_node; standard_iv_increment_position (loop, &incr_bsi, &insert_after); + create_iv (vect_ptr_init, - fold_convert (vect_ptr_type, TYPE_SIZE_UNIT (vectype)), + fold_convert (vect_ptr_type, step), NULL_TREE, loop, &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr); incr = bsi_stmt (incr_bsi); @@ -960,15 +1057,51 @@ vect_create_data_ref_ptr (tree stmt, if (ptr_incr) *ptr_incr = incr; - return indx_before_incr; + vptr = indx_before_incr; + } + + if (!nested_in_vect_loop || only_init) + return vptr; + + + /** (5) Handle the updating of the vector-pointer inside the inner-loop + nested in LOOP, if exists: **/ + + gcc_assert (nested_in_vect_loop); + if (!only_init) + { + standard_iv_increment_position (containing_loop, &incr_bsi, + &insert_after); + create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE, + containing_loop, &incr_bsi, insert_after, &indx_before_incr, + &indx_after_incr); + incr = bsi_stmt (incr_bsi); + set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo)); + + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + { + duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); + duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); + } + merge_alias_info (vect_ptr_init, indx_before_incr); + merge_alias_info (vect_ptr_init, indx_after_incr); + if (ptr_incr) + *ptr_incr = incr; + + return indx_before_incr; } + else + gcc_unreachable (); } /* Function bump_vector_ptr - Increment a pointer (to a vector type) by vector-size. Connect the new - increment stmt to the existing def-use update-chain of the pointer. + Increment a pointer (to a vector type) by vector-size. If requested, + i.e. 
if PTR-INCR is given, then also connect the new increment stmt + to the existing def-use update-chain of the pointer, by modifying + the PTR_INCR as illustrated below: The pointer def-use update-chain before this function: DATAREF_PTR = phi (p_0, p_2) @@ -978,18 +1111,20 @@ vect_create_data_ref_ptr (tree stmt, The pointer def-use update-chain after this function: DATAREF_PTR = phi (p_0, p_2) .... - NEW_DATAREF_PTR = DATAREF_PTR + vector_size + NEW_DATAREF_PTR = DATAREF_PTR + BUMP .... PTR_INCR: p_2 = NEW_DATAREF_PTR + step Input: DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated in the loop. - PTR_INCR - the stmt that updates the pointer in each iteration of the loop. - The increment amount across iterations is also expected to be - vector_size. + PTR_INCR - optional. The stmt that updates the pointer in each iteration of + the loop. The increment amount across iterations is expected + to be vector_size. BSI - location where the new update stmt is to be placed. STMT - the original scalar memory-access stmt that is being vectorized. + BUMP - optional. The offset by which to bump the pointer. If not given, + the offset is assumed to be vector_size. Output: Return NEW_DATAREF_PTR as illustrated above. 
@@ -997,7 +1132,7 @@ vect_create_data_ref_ptr (tree stmt, static tree bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, - tree stmt) + tree stmt, tree bump) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); @@ -1010,6 +1145,9 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, use_operand_p use_p; tree new_dataref_ptr; + if (bump) + update = bump; + incr_stmt = build_gimple_modify_stmt (ptr_var, build2 (POINTER_PLUS_EXPR, vptr_type, dataref_ptr, update)); @@ -1017,6 +1155,14 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr; vect_finish_stmt_generation (stmt, incr_stmt, bsi); + /* Copy the points-to information if it exists. */ + if (DR_PTR_INFO (dr)) + duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); + merge_alias_info (new_dataref_ptr, dataref_ptr); + + if (!ptr_incr) + return new_dataref_ptr; + /* Update the vector-pointer's cross-iteration increment. */ FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) { @@ -1028,11 +1174,6 @@ bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, gcc_assert (tree_int_cst_compare (use, update) == 0); } - /* Copy the points-to information if it exists. */ - if (DR_PTR_INFO (dr)) - duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); - merge_alias_info (new_dataref_ptr, dataref_ptr); - return new_dataref_ptr; } @@ -1067,15 +1208,16 @@ vect_create_destination_var (tree scalar_dest, tree vectype) /* Function vect_init_vector. Insert a new stmt (INIT_STMT) that initializes a new vector variable with - the vector elements of VECTOR_VAR. Return the DEF of INIT_STMT. It will be - used in the vectorization of STMT. */ + the vector elements of VECTOR_VAR. Place the initialization at BSI if it + is not NULL. Otherwise, place the initialization at the loop preheader. 
+ Return the DEF of INIT_STMT. + It will be used in the vectorization of STMT. */ static tree -vect_init_vector (tree stmt, tree vector_var, tree vector_type) +vect_init_vector (tree stmt, tree vector_var, tree vector_type, + block_stmt_iterator *bsi) { stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree new_var; tree init_stmt; tree vec_oprnd; @@ -1083,19 +1225,25 @@ vect_init_vector (tree stmt, tree vector_var, tree vector_type) tree new_temp; basic_block new_bb; - if (nested_in_vect_loop_p (loop, stmt)) - loop = loop->inner; - new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_"); add_referenced_var (new_var); - init_stmt = build_gimple_modify_stmt (new_var, vector_var); new_temp = make_ssa_name (new_var, init_stmt); GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp; - pe = loop_preheader_edge (loop); - new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); - gcc_assert (!new_bb); + if (bsi) + vect_finish_stmt_generation (stmt, init_stmt, bsi); + else + { + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + + if (nested_in_vect_loop_p (loop, stmt)) + loop = loop->inner; + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); + gcc_assert (!new_bb); + } if (vect_print_dump_info (REPORT_DETAILS)) { @@ -1233,7 +1381,7 @@ get_initial_def_for_induction (tree iv_phi) } /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ vec = build_constructor_from_list (vectype, nreverse (t)); - vec_init = vect_init_vector (iv_phi, vec, vectype); + vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); } @@ -1254,7 +1402,7 @@ get_initial_def_for_induction (tree iv_phi) for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); - vec_step = 
vect_init_vector (iv_phi, vec, vectype); + vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); /* Create the following def-use cycle: @@ -1310,7 +1458,7 @@ get_initial_def_for_induction (tree iv_phi) for (i = 0; i < nunits; i++) t = tree_cons (NULL_TREE, unshare_expr (new_name), t); vec = build_constructor_from_list (vectype, t); - vec_step = vect_init_vector (iv_phi, vec, vectype); + vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); vec_def = induc_def; prev_stmt_vinfo = vinfo_for_stmt (induction_phi); @@ -1447,7 +1595,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); vec_cst = build_vector (vector_type, t); - return vect_init_vector (stmt, vec_cst, vector_type); + return vect_init_vector (stmt, vec_cst, vector_type, NULL); } /* Case 2: operand is defined outside the loop - loop invariant. */ @@ -1468,8 +1616,7 @@ vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) /* FIXME: use build_constructor directly. */ vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); vec_inv = build_constructor_from_list (vector_type, t); - - return vect_init_vector (stmt, vec_inv, vector_type); + return vect_init_vector (stmt, vec_inv, vector_type, NULL); } /* Case 3: operand is defined inside the loop. */ @@ -4112,7 +4259,7 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum machine_mode vec_mode; tree dummy; - enum dr_alignment_support alignment_support_cheme; + enum dr_alignment_support alignment_support_scheme; tree def, def_stmt; enum vect_def_type dt; stmt_vec_info prev_stmt_info = NULL; @@ -4124,7 +4271,10 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) bool strided_store = false; unsigned int group_size, i; VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; + bool inv_p; + gcc_assert (ncopies >= 1); + /* FORNOW. 
This restriction should be relaxed. */ if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) { @@ -4198,6 +4348,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; + /* FORNOW */ + gcc_assert (!nested_in_vect_loop_p (loop, stmt)); + /* We vectorize all the stmts of the interleaving group when we reach the last stmt in the group. */ if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) @@ -4220,9 +4373,9 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) dr_chain = VEC_alloc (tree, heap, group_size); oprnds = VEC_alloc (tree, heap, group_size); - alignment_support_cheme = vect_supportable_dr_alignment (first_dr); - gcc_assert (alignment_support_cheme); - gcc_assert (alignment_support_cheme == dr_aligned); /* FORNOW */ + alignment_support_scheme = vect_supportable_dr_alignment (first_dr); + gcc_assert (alignment_support_scheme); + gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */ /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate @@ -4292,9 +4445,10 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) VEC_quick_push(tree, oprnds, vec_oprnd); next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); } - dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, NULL_TREE, + dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, &dummy, &ptr_incr, false, - TREE_TYPE (vec_oprnd)); + TREE_TYPE (vec_oprnd), &inv_p); + gcc_assert (!inv_p); } else { @@ -4312,7 +4466,8 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) VEC_replace(tree, dr_chain, i, vec_oprnd); VEC_replace(tree, oprnds, i, vec_oprnd); } - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_store) @@ -4348,7 +4503,8 @@ 
vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!next_stmt) break; /* Bump the vector pointer. */ - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } } @@ -4359,14 +4515,17 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Function vect_setup_realignment This function is called when vectorizing an unaligned load using - the dr_unaligned_software_pipeline scheme. + the dr_explicit_realign[_optimized] scheme. This function generates the following code at the loop prolog: p = initial_addr; - msq_init = *(floor(p)); # prolog load + x msq_init = *(floor(p)); # prolog load realignment_token = call target_builtin; loop: - msq = phi (msq_init, ---) + x msq = phi (msq_init, ---) + + The stmts marked with x are generated only for the case of + dr_explicit_realign_optimized. The code above sets up a new (vector) pointer, pointing to the first location accessed by STMT, and a "floor-aligned" load using that pointer. @@ -4375,19 +4534,29 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) whose arguments are the result of the prolog-load (created by this function) and the result of a load that takes place in the loop (to be created by the caller to this function). + + For the case of dr_explicit_realign_optimized: The caller to this function uses the phi-result (msq) to create the realignment code inside the loop, and sets up the missing phi argument, as follows: - loop: msq = phi (msq_init, lsq) lsq = *(floor(p')); # load in loop result = realign_load (msq, lsq, realignment_token); + For the case of dr_explicit_realign: + loop: + msq = *(floor(p)); # load in loop + p' = p + (VS-1); + lsq = *(floor(p')); # load in loop + result = realign_load (msq, lsq, realignment_token); + Input: STMT - (scalar) load stmt to be vectorized. This load accesses a memory location that may be unaligned. 
BSI - place where new code is to be inserted. + ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes + is used. Output: REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load @@ -4396,43 +4565,144 @@ vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) static tree vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, - tree *realignment_token) + tree *realignment_token, + enum dr_alignment_support alignment_support_scheme, + tree init_addr, + struct loop **at_loop) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - edge pe = loop_preheader_edge (loop); + edge pe; tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); tree vec_dest; - tree init_addr; tree inc; tree ptr; tree data_ref; tree new_stmt; basic_block new_bb; - tree msq_init; + tree msq_init = NULL_TREE; tree new_temp; tree phi_stmt; - tree msq; + tree msq = NULL_TREE; + tree stmts = NULL_TREE; + bool inv_p; + bool compute_in_loop = false; + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + struct loop *loop_for_initial_load; + + gcc_assert (alignment_support_scheme == dr_explicit_realign + || alignment_support_scheme == dr_explicit_realign_optimized); + + /* We need to generate three things: + 1. the misalignment computation + 2. the extra vector load (for the optimized realignment scheme). + 3. the phi node for the two vectors from which the realignment is + done (for the optimized realignment scheme). + */ + + /* 1. Determine where to generate the misalignment computation. + + If INIT_ADDR is NULL_TREE, this indicates that the misalignment + calculation will be generated by this function, outside the loop (in the + preheader). 
Otherwise, INIT_ADDR had already been computed for us by the + caller, inside the loop. + + Background: If the misalignment remains fixed throughout the iterations of + the loop, then both realignment schemes are applicable, and also the + misalignment computation can be done outside LOOP. This is because we are + vectorizing LOOP, and so the memory accesses in LOOP advance in steps that + are a multiple of VS (the Vector Size), and therefore the misalignment in + different vectorized LOOP iterations is always the same. + The problem arises only if the memory access is in an inner-loop nested + inside LOOP, which is now being vectorized using outer-loop vectorization. + This is the only case when the misalignment of the memory access may not + remain fixed throughout the iterations of the inner-loop (as explained in + detail in vect_supportable_dr_alignment). In this case, not only is the + optimized realignment scheme not applicable, but also the misalignment + computation (and generation of the realignment token that is passed to + REALIGN_LOAD) have to be done inside the loop. + + In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode + or not, which in turn determines if the misalignment is computed inside + the inner-loop, or outside LOOP. */ + + if (init_addr != NULL_TREE) + { + compute_in_loop = true; + gcc_assert (alignment_support_scheme == dr_explicit_realign); + } + + + /* 2. Determine where to generate the extra vector load. + + For the optimized realignment scheme, instead of generating two vector + loads in each iteration, we generate a single extra vector load in the + preheader of the loop, and in each iteration reuse the result of the + vector load from the previous iteration. 
In case the memory access is in + an inner-loop nested inside LOOP, which is now being vectorized using + outer-loop vectorization, we need to determine whether this initial vector + load should be generated at the preheader of the inner-loop, or can be + generated at the preheader of LOOP. If the memory access has no evolution + in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has + to be generated inside LOOP (in the preheader of the inner-loop). */ - /* 1. Create msq_init = *(floor(p1)) in the loop preheader */ - vec_dest = vect_create_destination_var (scalar_dest, vectype); - ptr = vect_create_data_ref_ptr (stmt, bsi, NULL_TREE, &init_addr, &inc, true, - NULL_TREE); - data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); - new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); - new_temp = make_ssa_name (vec_dest, new_stmt); - GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); - msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); + if (nested_in_vect_loop) + { + tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); + bool invariant_in_outerloop = + (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); + loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); + } + else + loop_for_initial_load = loop; + if (at_loop) + *at_loop = loop_for_initial_load; + + /* 3. For the case of the optimized realignment, create the first vector + load at the loop preheader. 
*/ + + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + /* Create msq_init = *(floor(p1)) in the loop preheader */ + + gcc_assert (!compute_in_loop); + pe = loop_preheader_edge (loop_for_initial_load); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE, + &init_addr, &inc, true, NULL_TREE, &inv_p); + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); + new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); + } + + /* 4. Create realignment token using a target builtin, if available. + It is done either inside the containing loop, or before LOOP (as + determined above). */ - /* 2. Create permutation mask, if required, in loop preheader. */ if (targetm.vectorize.builtin_mask_for_load) { tree builtin_decl; + /* Compute INIT_ADDR - the initial address accessed by this memref. */ + if (compute_in_loop) + gcc_assert (init_addr); /* already computed by the caller. */ + else + { + /* Generate the INIT_ADDR computation outside LOOP. 
*/ + init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, + NULL_TREE, loop); + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + builtin_decl = targetm.vectorize.builtin_mask_for_load (); new_stmt = build_call_expr (builtin_decl, 1, init_addr); vec_dest = vect_create_destination_var (scalar_dest, @@ -4440,8 +4710,17 @@ vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; - new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); + + if (compute_in_loop) + bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT); + else + { + /* Generate the misalignment computation outside LOOP. */ + pe = loop_preheader_edge (loop); + new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); + gcc_assert (!new_bb); + } + *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0); /* The result of the CALL_EXPR to this builtin is determined from @@ -4452,12 +4731,21 @@ vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, gcc_assert (TREE_READONLY (builtin_decl)); } - /* 3. Create msq = phi <msq_init, lsq> in loop */ + if (alignment_support_scheme == dr_explicit_realign) + return msq; + + gcc_assert (!compute_in_loop); + gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); + + + /* 5. 
Create msq = phi <msq_init, lsq> in loop */ + + pe = loop_preheader_edge (containing_loop); vec_dest = vect_create_destination_var (scalar_dest, vectype); msq = make_ssa_name (vec_dest, NULL_TREE); - phi_stmt = create_phi_node (msq, loop->header); + phi_stmt = create_phi_node (msq, containing_loop->header); SSA_NAME_DEF_STMT (msq) = phi_stmt; - add_phi_arg (phi_stmt, msq_init, loop_preheader_edge (loop)); + add_phi_arg (phi_stmt, msq_init, pe); return msq; } @@ -4747,13 +5035,15 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) stmt_vec_info prev_stmt_info; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; + bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree new_temp; int mode; tree new_stmt = NULL_TREE; tree dummy; - enum dr_alignment_support alignment_support_cheme; + enum dr_alignment_support alignment_support_scheme; tree dataref_ptr = NULL_TREE; tree ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); @@ -4762,14 +5052,19 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; - tree phi_stmt = NULL_TREE; + tree phi = NULL_TREE; VEC(tree,heap) *dr_chain = NULL; bool strided_load = false; tree first_stmt; + tree scalar_type; + bool inv_p; + bool compute_in_loop = false; + struct loop *at_loop; gcc_assert (ncopies >= 1); + /* FORNOW. This restriction should be relaxed. 
*/ - if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) + if (nested_in_vect_loop && ncopies > 1) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "multiple types in nested loop."); @@ -4807,6 +5102,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_DATA_REF (stmt_info)) return false; + scalar_type = TREE_TYPE (DR_REF (dr)); mode = (int) TYPE_MODE (vectype); /* FORNOW. In some cases can vectorize even if data-type not supported @@ -4822,6 +5118,8 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (DR_GROUP_FIRST_DR (stmt_info)) { strided_load = true; + /* FORNOW */ + gcc_assert (! nested_in_vect_loop); /* Check if interleaving is supported. */ if (!vect_strided_load_supported (vectype)) @@ -4860,9 +5158,8 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) group_size = 1; } - alignment_support_cheme = vect_supportable_dr_alignment (first_dr); - gcc_assert (alignment_support_cheme); - + alignment_support_scheme = vect_supportable_dr_alignment (first_dr); + gcc_assert (alignment_support_scheme); /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate @@ -4944,7 +5241,7 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } Otherwise, the data reference is potentially unaligned on a target that - does not support unaligned accesses (dr_unaligned_software_pipeline) - + does not support unaligned accesses (dr_explicit_realign_optimized) - then generate the following code, in which the data in each iteration is obtained by two vector loads, one from the previous iteration, and one from the current iteration: @@ -4961,27 +5258,52 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) msq = lsq; } */ - if (alignment_support_cheme == dr_unaligned_software_pipeline) + /* If the misalignment remains the same throughout the execution of the + 
loop, we can create the init_addr and permutation mask at the loop + preheader. Otherwise, it needs to be created inside the loop. + This can only occur when vectorizing memory accesses in the inner-loop + nested within an outer-loop that is being vectorized. */ + + if (nested_in_vect_loop_p (loop, stmt) + && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0)) + { + gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized); + compute_in_loop = true; + } + + if ((alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign) + && !compute_in_loop) { - msq = vect_setup_realignment (first_stmt, bsi, &realignment_token); - phi_stmt = SSA_NAME_DEF_STMT (msq); - offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + msq = vect_setup_realignment (first_stmt, bsi, &realignment_token, + alignment_support_scheme, NULL_TREE, + &at_loop); + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + phi = SSA_NAME_DEF_STMT (msq); + offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + } } + else + at_loop = loop; prev_stmt_info = NULL; for (j = 0; j < ncopies; j++) { /* 1. Create the vector pointer update chain. */ if (j == 0) - dataref_ptr = vect_create_data_ref_ptr (first_stmt, bsi, offset, &dummy, - &ptr_incr, false, NULL_TREE); + dataref_ptr = vect_create_data_ref_ptr (first_stmt, + at_loop, offset, + &dummy, &ptr_incr, false, + NULL_TREE, &inv_p); else - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); for (i = 0; i < group_size; i++) { /* 2. Create the vector-load in the loop. */ - switch (alignment_support_cheme) + switch (alignment_support_scheme) { case dr_aligned: gcc_assert (aligned_access_p (first_dr)); @@ -4992,14 +5314,39 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) int mis = DR_MISALIGNMENT (first_dr); tree tmis = (mis == -1 ? 
size_zero_node : size_int (mis)); - gcc_assert (!aligned_access_p (first_dr)); tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); data_ref = build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); break; } - case dr_unaligned_software_pipeline: - gcc_assert (!aligned_access_p (first_dr)); + case dr_explicit_realign: + { + tree ptr, bump; + tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + + if (compute_in_loop) + msq = vect_setup_realignment (first_stmt, bsi, + &realignment_token, + dr_explicit_realign, + dataref_ptr, NULL); + + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + copy_virtual_operands (new_stmt, stmt); + mark_symbols_for_renaming (new_stmt); + msq = new_temp; + + bump = size_binop (MULT_EXPR, vs_minus_1, + TYPE_SIZE_UNIT (scalar_type)); + ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump); + data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); + break; + } + case dr_explicit_realign_optimized: data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); break; default: @@ -5012,29 +5359,70 @@ vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) vect_finish_stmt_generation (stmt, new_stmt, bsi); mark_symbols_for_renaming (new_stmt); - /* 3. Handle explicit realignment if necessary/supported. */ - if (alignment_support_cheme == dr_unaligned_software_pipeline) + /* 3. Handle explicit realignment if necessary/supported. 
Create in + loop: vec_dest = realign_load (msq, lsq, realignment_token) */ + if (alignment_support_scheme == dr_explicit_realign_optimized + || alignment_support_scheme == dr_explicit_realign) { - /* Create in loop: - <vec_dest = realign_load (msq, lsq, realignment_token)> */ lsq = GIMPLE_STMT_OPERAND (new_stmt, 0); if (!realignment_token) realignment_token = dataref_ptr; vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_stmt = - build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, realignment_token); + new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, + realignment_token); new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); new_temp = make_ssa_name (vec_dest, new_stmt); GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, new_stmt, bsi); - if (i == group_size - 1 && j == ncopies - 1) - add_phi_arg (phi_stmt, lsq, loop_latch_edge (loop)); - msq = lsq; + + if (alignment_support_scheme == dr_explicit_realign_optimized) + { + if (i == group_size - 1 && j == ncopies - 1) + add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); + msq = lsq; + } + } + + /* 4. Handle invariant-load. */ + if (inv_p) + { + gcc_assert (!strided_load); + gcc_assert (nested_in_vect_loop_p (loop, stmt)); + if (j == 0) + { + int k; + tree t = NULL_TREE; + tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type); + + /* CHECKME: bitpos depends on endianness? */ + bitpos = bitsize_zero_node; + vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, + bitsize, bitpos); + BIT_FIELD_REF_UNSIGNED (vec_inv) = + TYPE_UNSIGNED (scalar_type); + vec_dest = + vect_create_destination_var (scalar_dest, NULL_TREE); + new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + vect_finish_stmt_generation (stmt, new_stmt, bsi); + + for (k = nunits - 1; k >= 0; --k) + t = tree_cons (NULL_TREE, new_temp, t); + /* FIXME: use build_constructor directly. 
*/ + vec_inv = build_constructor_from_list (vectype, t); + new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi); + new_stmt = SSA_NAME_DEF_STMT (new_temp); + } + else + gcc_unreachable (); /* FORNOW. */ } + if (strided_load) VEC_quick_push (tree, dr_chain, new_temp); if (i < group_size - 1) - dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt); + dataref_ptr = + bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); } if (strided_load) @@ -5805,8 +6193,8 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) else { tree new_stmts = NULL_TREE; - tree start_addr = - vect_create_addr_base_for_vector_ref (dr_stmt, &new_stmts, NULL_TREE); + tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, + &new_stmts, NULL_TREE, loop); tree ptr_type = TREE_TYPE (start_addr); tree size = TYPE_SIZE (ptr_type); tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); @@ -5979,6 +6367,7 @@ static tree vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, tree *cond_expr_stmt_list) { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); VEC(tree,heap) *may_misalign_stmts = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); tree ref_stmt, tmp; @@ -6014,8 +6403,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, /* create: addr_tmp = (int)(address_of_first_vector) */ addr_base = vect_create_addr_base_for_vector_ref (ref_stmt, - &new_stmt_list, - NULL_TREE); + &new_stmt_list, NULL_TREE, loop); if (new_stmt_list != NULL_TREE) append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list); @@ -6087,7 +6475,7 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) { tree segment_length; - if (vect_supportable_dr_alignment (dr) == dr_unaligned_software_pipeline) + if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized) { tree vector_size = build_int_cst (integer_type_node, @@ -6100,8 +6488,6 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) 
fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr), vect_factor), vector_size)); - - } else { @@ -6139,6 +6525,7 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr, tree * cond_expr_stmt_list) { + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); VEC (ddr_p, heap) * may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); tree vect_factor = @@ -6167,10 +6554,10 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree addr_base_a = vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list, - NULL_TREE); + NULL_TREE, loop); tree addr_base_b = vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list, - NULL_TREE); + NULL_TREE, loop); tree segment_length_a = vect_vfa_segment_size (DDR_A (ddr), vect_factor); tree segment_length_b = vect_vfa_segment_size (DDR_B (ddr), vect_factor); diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 20c867c708b..372334dddd9 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1345,6 +1345,13 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo) STMT_VINFO_IN_PATTERN_P (res) = false; STMT_VINFO_RELATED_STMT (res) = NULL; STMT_VINFO_DATA_REF (res) = NULL; + + STMT_VINFO_DR_BASE_ADDRESS (res) = NULL; + STMT_VINFO_DR_OFFSET (res) = NULL; + STMT_VINFO_DR_INIT (res) = NULL; + STMT_VINFO_DR_STEP (res) = NULL; + STMT_VINFO_DR_ALIGNED_TO (res) = NULL; + if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt))) STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type; else @@ -1655,21 +1662,103 @@ get_vectype_for_scalar_type (tree scalar_type) enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *dr) { - tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); + tree stmt = DR_STMT (dr); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); enum machine_mode mode = (int) TYPE_MODE (vectype); + struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO 
(stmt_info)); + bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt); + bool invariant_in_outerloop = false; if (aligned_access_p (dr)) return dr_aligned; + if (nested_in_vect_loop) + { + tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); + invariant_in_outerloop = + (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); + } + /* Possibly unaligned access. */ + + /* We can choose between using the implicit realignment scheme (generating + a misaligned_move stmt) and the explicit realignment scheme (generating + aligned loads with a REALIGN_LOAD). There are two variants to the explicit + realignment scheme: optimized, and unoptimized. + We can optimize the realignment only if the step between consecutive + vector loads is equal to the vector size. Since the vector memory + accesses advance in steps of VS (Vector Size) in the vectorized loop, it + is guaranteed that the misalignment amount remains the same throughout the + execution of the vectorized loop. Therefore, we can create the + "realignment token" (the permutation mask that is passed to REALIGN_LOAD) + at the loop preheader. + + However, in the case of outer-loop vectorization, when vectorizing a + memory access in the inner-loop nested within the LOOP that is now being + vectorized, while it is guaranteed that the misalignment of the + vectorized memory access will remain the same in different outer-loop + iterations, it is *not* guaranteed that it will remain the same throughout + the execution of the inner-loop. This is because the inner-loop advances + with the original scalar step (and not in steps of VS). If the inner-loop + step happens to be a multiple of VS, then the misalignment remains fixed + and we can use the optimized realignment scheme. 
For example: + + for (i=0; i<N; i++) + for (j=0; j<M; j++) + s += a[i+j]; + + When vectorizing the i-loop in the above example, the step between + consecutive vector loads is 1, and so the misalignment does not remain + fixed across the execution of the inner-loop, and the realignment cannot + be optimized (as illustrated in the following pseudo vectorized loop): + + for (i=0; i<N; i+=4) + for (j=0; j<M; j++){ + vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} + // when j is {0,1,2,3,4,5,6,7,...} respectively. + // (assuming that we start from an aligned address). + } + + We therefore have to use the unoptimized realignment scheme: + + for (i=0; i<N; i+=4) + for (j=k; j<M; j+=4) + vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming + // that the misalignment of the initial address is + // 0). + + The loop can then be vectorized as follows: + + for (k=0; k<4; k++){ + rt = get_realignment_token (&vp[k]); + for (i=0; i<N; i+=4){ + v1 = vp[i+k]; + for (j=k; j<M; j+=4){ + v2 = vp[i+j+VS-1]; + va = REALIGN_LOAD <v1,v2,rt>; + vs += va; + v1 = v2; + } + } + } */ + if (DR_IS_READ (dr)) { - if (optab_handler (vec_realign_load_optab, mode)->insn_code != CODE_FOR_nothing + if (optab_handler (vec_realign_load_optab, mode)->insn_code != + CODE_FOR_nothing && (!targetm.vectorize.builtin_mask_for_load || targetm.vectorize.builtin_mask_for_load ())) - return dr_unaligned_software_pipeline; + { + if (nested_in_vect_loop + && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD) + return dr_explicit_realign; + else + return dr_explicit_realign_optimized; + } - if (optab_handler (movmisalign_optab, mode)->insn_code != CODE_FOR_nothing) + if (optab_handler (movmisalign_optab, mode)->insn_code != + CODE_FOR_nothing) /* Can't software pipeline the loads, but can at least do them. 
*/ return dr_unaligned_supported; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e9f208af25c..ad7ccea4b23 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -53,7 +53,8 @@ enum operation_type { enum dr_alignment_support { dr_unaligned_unsupported, dr_unaligned_supported, - dr_unaligned_software_pipeline, + dr_explicit_realign, + dr_explicit_realign_optimized, dr_aligned }; @@ -249,9 +250,18 @@ typedef struct _stmt_vec_info { data-ref (array/pointer/struct access). A GIMPLE stmt is expected to have at most one such data-ref. **/ - /* Information about the data-ref (access function, etc). */ + /* Information about the data-ref (access function, etc), + relative to the inner-most containing loop. */ struct data_reference *data_ref_info; + /* Information about the data-ref relative to this loop + nest (the loop that is being considered for vectorization). */ + tree dr_base_address; + tree dr_init; + tree dr_offset; + tree dr_step; + tree dr_aligned_to; + /* Stmt is part of some pattern (computation idiom) */ bool in_pattern_p; @@ -310,6 +320,13 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info + +#define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address +#define STMT_VINFO_DR_INIT(S) (S)->dr_init +#define STMT_VINFO_DR_OFFSET(S) (S)->dr_offset +#define STMT_VINFO_DR_STEP(S) (S)->dr_step +#define STMT_VINFO_DR_ALIGNED_TO(S) (S)->dr_aligned_to + #define STMT_VINFO_IN_PATTERN_P(S) (S)->in_pattern_p #define STMT_VINFO_RELATED_STMT(S) (S)->related_stmt #define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs |