29 files changed, 1176 insertions, 196 deletions
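Before the hunks themselves, a quick orientation: the heart of the patch is a widening of the vectorizer cost hook. targetm.vectorize.builtin_vectorization_cost now receives, besides the statement kind, the vector type and the misalignment in bytes (-1 when it is unknown at compile time), and enum vect_cost_for_stmt gains an unaligned_store entry. The standalone sketch below is not GCC code; it condenses that contract using the VSX V4SI costs from the rs6000 hunk further down (2 for double-word-aligned accesses, 22/23 for word-aligned or unknown misalignment):

#include <stdio.h>

/* Illustrative stand-in for the widened hook: KIND plays the role of
   enum vect_cost_for_stmt, NELEMENTS stands in for the vector type,
   MISALIGN is in bytes with -1 meaning "unknown at compile time".  */
enum cost_kind_sketch { unaligned_load, unaligned_store, other_stmt };

static int
cost_sketch (enum cost_kind_sketch kind, int nelements, int misalign)
{
  if (kind == unaligned_load || kind == unaligned_store)
    {
      if (nelements == 2 || misalign == 8)
        return 2;   /* double-word aligned: almost as cheap as aligned */
      if (misalign == -1 || misalign == 4 || misalign == 12)
        return kind == unaligned_load ? 22 : 23;  /* word aligned or unknown */
    }
  return 1;   /* everything else costs one unit, as in the default hook */
}

int
main (void)
{
  /* A 4-element vector: double-word aligned vs. unknown misalignment.  */
  printf ("%d %d\n", cost_sketch (unaligned_load, 4, 8),
          cost_sketch (unaligned_store, 4, -1));   /* prints: 2 23 */
  return 0;
}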
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a5e5c164477..c69bc6ce812 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,74 @@ +2010-07-04 Ira Rosen <irar@il.ibm.com> + Revital Eres <eres@il.ibm.com> + + * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): + Document new arguments. + * doc/tm.texi: Regenerate. + * targhooks.c (default_builtin_vectorization_cost): Add new arguments. + Handle unaligned store. + * targhooks.h (default_builtin_vectorization_cost): Add new arguments. + * target.def (builtin_vectorization_cost): Add new arguments. + * target.h (enum vect_cost_for_stmt): Add unaligned_store. + * tree-vect-loop-manip.c (vect_gen_niters_for_prolog_loop): Take number + of iterations of prolog loop directly from LOOP_PEELING_FOR_ALIGNMENT. + (vect_vfa_segment_size): Fix indentation. + * tree-vectorizer.h (struct _vect_peel_info): New. + (struct _vect_peel_extended_info): New. + (struct _loop_vec_info): Add new field for peeling hash table and a + macro for its access. + (VECT_MAX_COST): Define. + (vect_get_load_cost): Declare. + (vect_get_store_cost, vect_get_known_peeling_cost, + vect_get_single_scalar_iteraion_cost): Likewise. + (vect_supportable_dr_alignment): Add new argument. + * tree-vect-loop.c (new_loop_vec_info): Initialize peeling hash table + field. + (destroy_loop_vec_info): Free peeling hash table. + (vect_analyze_loop_form): Update call to builtin_vectorization_cost. + (vect_analyze_loop): Move vect_enhance_data_refs_alignment before + vect_analyze_slp. Fix indentation. + (vect_get_single_scalar_iteraion_cost): New function. + (vect_get_known_peeling_cost): Likewise. + (vect_estimate_min_profitable_iters): Rename byte_misalign to npeel. + Call vect_get_single_scalar_iteraion_cost instead of cost_for_stmt per + statement. Move outside cost calculation inside unknown peeling case. + Call vect_get_known_peeling_cost for known amount of peeling. + * tree-vect-data-refs.c (vect_compute_data_ref_alignment): Add data + reference to the print message of forced alignment. + (vect_verify_datarefs_alignment): Update call to + vect_supportable_dr_alignment. + (vect_get_data_access_cost): New function. + (vect_peeling_hash, vect_peeling_hash_eq, vect_peeling_hash_insert, + vect_peeling_hash_get_most_frequent, vect_peeling_hash_get_lowest_cost, + vect_peeling_hash_choose_best_peeling): Likewise. + (vect_enhance_data_refs_alignment): Fix documentation. Use hash table + to store all the accesses in the loop and find best possible access to + align using peeling for known alignment case. For unknown alignment + check if stores are preferred or if peeling is worthwhile. + (vect_find_same_alignment_drs): Analyze pairs of loads too. + (vect_supportable_dr_alignment): Add new argument and check aligned + accesses according to it. + * tree-vect-stmts.c (vect_get_stmt_cost): New function. + (cost_for_stmt): Call vect_get_stmt_cost. + (vect_model_simple_cost): Likewise. + (vect_model_store_cost): Call vect_get_stmt_cost. Call + vect_get_store_cost to calculate the cost of the statement. + (vect_get_store_cost): New function. + (vect_model_load_cost): Call vect_get_stmt_cost. Call + vect_get_load_cost to calculate the cost of the statement. + (vect_get_load_cost): New function. + (vectorizable_store): Update call to vect_supportable_dr_alignment. + (vectorizable_load): Likewise. + * config/spu/spu.c (spu_builtin_vectorization_cost): Add new + arguments. + * config/i386/i386.c (ix86_builtin_vectorization_cost): Add new + arguments. Handle unaligned store. 
+ * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): New. + (rs6000_builtin_support_vector_misalignment): Return true for word and + double word alignments for VSX. + * tree-vect-slp.c (vect_build_slp_tree): Update calls to + vect_supportable_dr_alignment and builtin_vectorization_cost. + 2010-07-03 John David Anglin <dave.anglin@nrc-cnrc.gc.ca> PR target/44597 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ec2cdd38d83..711fc166ad4 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -29376,7 +29376,9 @@ static const struct attribute_spec ix86_attribute_table[] = /* Implement targetm.vectorize.builtin_vectorization_cost. */ static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost) +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype ATTRIBUTE_UNUSED, + int misalign ATTRIBUTE_UNUSED) { switch (type_of_cost) { @@ -29405,6 +29407,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost) return ix86_cost->scalar_to_vec_cost; case unaligned_load: + case unaligned_store: return ix86_cost->vec_unalign_load_cost; case cond_branch_taken: diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index a98b4ddc864..45bc2306182 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1075,6 +1075,8 @@ static bool rs6000_builtin_support_vector_misalignment (enum machine_mode, const_tree, int, bool); +static int rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt, + tree, int); static void def_builtin (int, const char *, tree, int); static bool rs6000_vector_alignment_reachable (const_tree, bool); @@ -1467,6 +1469,9 @@ static const struct attribute_spec rs6000_attribute_table[] = rs6000_builtin_support_vector_misalignment #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ + rs6000_builtin_vectorization_cost #undef TARGET_INIT_BUILTINS #define TARGET_INIT_BUILTINS rs6000_init_builtins @@ -3333,12 +3338,19 @@ rs6000_builtin_support_vector_misalignment (enum machine_mode mode, if (misalignment == -1) { - /* misalignment factor is unknown at compile time but we know + /* Misalignment factor is unknown at compile time but we know it's word aligned. */ if (rs6000_vector_alignment_reachable (type, is_packed)) - return true; + { + int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type)); + + if (element_size == 64 || element_size == 32) + return true; + } + return false; } + /* VSX supports word-aligned vector. */ if (misalignment % 4 == 0) return true; @@ -3404,6 +3416,106 @@ rs6000_builtin_vec_perm (tree type, tree *mask_element_type) return d; } + +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +rs6000_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int misalign) +{ + unsigned elements; + + switch (type_of_cost) + { + case scalar_stmt: + case scalar_load: + case scalar_store: + case vector_stmt: + case vector_load: + case vector_store: + case vec_to_scalar: + case scalar_to_vec: + case cond_branch_not_taken: + case vec_perm: + return 1; + + case cond_branch_taken: + return 3; + + case unaligned_load: + if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) + { + elements = TYPE_VECTOR_SUBPARTS (vectype); + if (elements == 2) + /* Double word aligned. 
*/ + return 2; + + if (elements == 4) + { + switch (misalign) + { + case 8: + /* Double word aligned. */ + return 2; + + case -1: + /* Unknown misalignment. */ + case 4: + case 12: + /* Word aligned. */ + return 22; + + default: + gcc_unreachable (); + } + } + } + + if (TARGET_ALTIVEC) + /* Misaligned loads are not supported. */ + gcc_unreachable (); + + return 2; + + case unaligned_store: + if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN) + { + elements = TYPE_VECTOR_SUBPARTS (vectype); + if (elements == 2) + /* Double word aligned. */ + return 2; + + if (elements == 4) + { + switch (misalign) + { + case 8: + /* Double word aligned. */ + return 2; + + case -1: + /* Unknown misalignment. */ + case 4: + case 12: + /* Word aligned. */ + return 23; + + default: + gcc_unreachable (); + } + } + } + + if (TARGET_ALTIVEC) + /* Misaligned stores are not supported. */ + gcc_unreachable (); + + return 2; + + default: + gcc_unreachable (); + } +} + /* Handle generic options of the form -mfoo=yes/no. NAME is the option name. VALUE is the option value. diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c index 4b7f9162395..3d4f5870545 100644 --- a/gcc/config/spu/spu.c +++ b/gcc/config/spu/spu.c @@ -209,7 +209,7 @@ static rtx spu_addr_space_legitimize_address (rtx, rtx, enum machine_mode, static tree spu_builtin_mul_widen_even (tree); static tree spu_builtin_mul_widen_odd (tree); static tree spu_builtin_mask_for_load (void); -static int spu_builtin_vectorization_cost (enum vect_cost_for_stmt); +static int spu_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int); static bool spu_vector_alignment_reachable (const_tree, bool); static tree spu_builtin_vec_perm (tree, tree *); static enum machine_mode spu_addr_space_pointer_mode (addr_space_t); @@ -6694,7 +6694,9 @@ spu_builtin_mask_for_load (void) /* Implement targetm.vectorize.builtin_vectorization_cost. */ static int -spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost) +spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype ATTRIBUTE_UNUSED, + int misalign ATTRIBUTE_UNUSED) { switch (type_of_cost) { diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index d6bc604b7ee..17b582f77b2 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5706,8 +5706,10 @@ preserved (e.g.@: used only by a reduction computation). Otherwise, the @code{widen_mult_hi/lo} idioms will be used. @end deftypefn -@deftypefn {Target Hook} int TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST (enum vect_cost_for_stmt @var{type_of_cost}) +@deftypefn {Target Hook} int TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST (enum vect_cost_for_stmt @var{type_of_cost}, tree @var{vectype}, int @var{misalign}) Returns cost of different scalar or vector statements for vectorization cost model. +For vector memory operations the cost may depend on type (@var{vectype}) and +misalignment value (@var{misalign}). @end deftypefn @deftypefn {Target Hook} bool TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE (const_tree @var{type}, bool @var{is_packed}) diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 083d56ffd3b..e79341d5db5 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -5708,6 +5708,8 @@ preserved (e.g.@: used only by a reduction computation). Otherwise, the @hook TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST Returns cost of different scalar or vector statements for vectorization cost model. +For vector memory operations the cost may depend on type (@var{vectype}) and +misalignment value (@var{misalign}). 
@end deftypefn @hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE diff --git a/gcc/target.def b/gcc/target.def index 8bcf877359c..627092537f1 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -789,11 +789,13 @@ DEFHOOK "", tree, (tree x), NULL) -/* Cost of different vector/scalar statements in vectorization cost model. */ +/* Cost of different vector/scalar statements in vectorization cost + model. In case of misaligned vector loads and stores the cost depends + on the data type and misalignment value. */ DEFHOOK (builtin_vectorization_cost, "", - int, (enum vect_cost_for_stmt type_of_cost), + int, (enum vect_cost_for_stmt type_of_cost, tree vectype, int misalign), default_builtin_vectorization_cost) /* Return true if vector alignment is reachable (by peeling N diff --git a/gcc/target.h b/gcc/target.h index 18d160dbeb2..99dd1eea998 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -119,6 +119,7 @@ enum vect_cost_for_stmt vector_stmt, vector_load, unaligned_load, + unaligned_store, vector_store, vec_to_scalar, scalar_to_vec, diff --git a/gcc/targhooks.c b/gcc/targhooks.c index 9271db829bf..1a49f0c1a5e 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -479,7 +479,9 @@ default_builtin_vectorized_conversion (unsigned int code ATTRIBUTE_UNUSED, /* Default vectorizer cost model values. */ int -default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost) +default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype ATTRIBUTE_UNUSED, + int misalign ATTRIBUTE_UNUSED) { switch (type_of_cost) { @@ -496,6 +498,7 @@ default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost) return 1; case unaligned_load: + case unaligned_store: return 2; case cond_branch_taken: diff --git a/gcc/targhooks.h b/gcc/targhooks.h index f491dbd4d29..eb4b547aad0 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -77,7 +77,7 @@ extern tree default_builtin_vectorized_function (tree, tree, tree); extern tree default_builtin_vectorized_conversion (unsigned int, tree, tree); -extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt); +extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int); extern tree default_builtin_reciprocal (unsigned int, bool, bool); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index a815dd13da5..5ae6c6ebe4f 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,20 @@ +2010-07-04 Ira Rosen <irar@il.ibm.com> + Revital Eres <eres@il.ibm.com> + + * gcc.dg/vect/vect-42.c: Don't expect peeling on targets that support + misaligned stores. + * gcc.dg/vect/vect-60.c, gcc.dg/vect/vect-56.c, gcc.dg/vect/vect-93.c, + gcc.dg/vect/vect-96.c: Likewise. + * gcc.dg/vect/vect-109.c: Expect vectorization only on targets + that support misaligned stores. Change the number of expected + misaligned accesses. + * gcc.dg/vect/vect-peel-1.c: New test. + * gcc.dg/vect/vect-peel-2.c, gcc.dg/vect/vect-peel-3.c, + gcc.dg/vect/vect-peel-4.c: Likewise. + * gcc.dg/vect/vect-multitypes-1.c: Change the test to make it + vectorizable on all targets that support realignment. + * gcc.dg/vect/vect-multitypes-4.c: Likewise. + 2010-07-03 H.J. 
Lu <hongjiu.lu@intel.com> PR c/44806 diff --git a/gcc/testsuite/gcc.dg/vect/vect-109.c b/gcc/testsuite/gcc.dg/vect/vect-109.c index 393909312bd..ddba2635bff 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-109.c +++ b/gcc/testsuite/gcc.dg/vect/vect-109.c @@ -72,8 +72,8 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_hw_misalign } } } */ /* { dg-final { scan-tree-dump-times "not vectorized: unsupported unaligned store" 2 "vect" { xfail vect_hw_misalign } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 10 "vect" { target vect_hw_misalign } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 3 "vect" { target vect_hw_misalign } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-42.c b/gcc/testsuite/gcc.dg/vect/vect-42.c index 3ba1c6f7fde..fa832008698 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-42.c +++ b/gcc/testsuite/gcc.dg/vect/vect-42.c @@ -65,6 +65,7 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 3 "vect" { target vect_no_align } } } */ /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { { ! vector_alignment_reachable } && { ! vect_hw_misalign } } } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || { ! vector_alignment_reachable } } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || { ! vector_alignment_reachable } } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || { { ! vector_alignment_reachable } || vect_hw_misalign } } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 3 "vect" { target vect_hw_misalign } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || { { ! 
vector_alignment_reachable } || vect_hw_misalign } } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-56.c b/gcc/testsuite/gcc.dg/vect/vect-56.c index 7b7da123591..1555d41df6f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-56.c +++ b/gcc/testsuite/gcc.dg/vect/vect-56.c @@ -68,6 +68,8 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail { vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target { vect_hw_misalign } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-60.c b/gcc/testsuite/gcc.dg/vect/vect-60.c index cbdf63db123..ba8ffe65400 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-60.c +++ b/gcc/testsuite/gcc.dg/vect/vect-60.c @@ -69,6 +69,8 @@ int main (void) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_align } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { xfail { vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target { vect_hw_misalign } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-93.c b/gcc/testsuite/gcc.dg/vect/vect-93.c index 85666d9e699..dfb98cfd541 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-93.c +++ b/gcc/testsuite/gcc.dg/vect/vect-93.c @@ -72,7 +72,7 @@ int main (void) /* main && main1 together: */ /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 2 "vect" { target powerpc*-*-* i?86-*-* x86_64-*-* } } } */ /* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { target { vect_no_align && {! vector_alignment_reachable} } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 3 "vect" { xfail { { vect_no_align } || {! vector_alignment_reachable} } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 3 "vect" { xfail { { vect_no_align } || { { ! 
vector_alignment_reachable} || vect_hw_misalign } } } } } */ /* in main1: */ /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target !powerpc*-*-* !i?86-*-* !x86_64-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-96.c b/gcc/testsuite/gcc.dg/vect/vect-96.c index f392169f770..c7dea6123a8 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-96.c +++ b/gcc/testsuite/gcc.dg/vect/vect-96.c @@ -44,6 +44,6 @@ int main (void) /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target { {! vect_no_align} && vector_alignment_reachable } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { { vect_no_align } || {! vector_alignment_reachable} } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { { vect_no_align } || { { ! vector_alignment_reachable} || vect_hw_misalign } } } } } */ /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning." 1 "vect" { target { vect_no_align || { {! vector_alignment_reachable} && {! vect_hw_misalign} } } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c index e8fe027f5f3..7981c4a475f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-1.c @@ -27,13 +27,13 @@ __attribute__ ((noinline)) int main1 (int n) for (i = 0; i < n; i++) { sa[i+7] = sb[i]; - ia[i+3] = ib[i]; + ia[i+3] = ib[i+1]; } /* check results: */ for (i = 0; i < n; i++) { - if (sa[i+7] != sb[i] || ia[i+3] != ib[i]) + if (sa[i+7] != sb[i] || ia[i+3] != ib[i+1]) abort (); } @@ -44,7 +44,9 @@ __attribute__ ((noinline)) int main1 (int n) access for peeling, and therefore will examine the option of using a peeling factor = (V-3)%V = 1 for V=2,4. This will not align the access 'sa[i+3]' (for which we need to - peel 5 iterations), so the loop can not be vectorized. */ + peel 5 iterations). However, 'ia[i+3]' also gets aligned if we peel 5 + iterations, so the loop is vectorizable on all targets that support + unaligned loads. */ __attribute__ ((noinline)) int main2 (int n) { @@ -55,13 +57,13 @@ __attribute__ ((noinline)) int main2 (int n) for (i = 0; i < n; i++) { ia[i+3] = ib[i]; - sa[i+3] = sb[i]; + sa[i+3] = sb[i+1]; } /* check results: */ for (i = 0; i < n; i++) { - if (sa[i+3] != sb[i] || ia[i+3] != ib[i]) + if (sa[i+3] != sb[i+1] || ia[i+3] != ib[i]) abort (); } @@ -78,11 +80,8 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail {! vect_hw_misalign} } } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail {! 
vect_hw_misalign} } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_align } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail { vect_no_align } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c index 274fb025319..3a83491065f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c +++ b/gcc/testsuite/gcc.dg/vect/vect-multitypes-4.c @@ -20,7 +20,9 @@ unsigned int ib[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45, access for peeling, and therefore will examine the option of using a peeling factor = VF-7%VF. This will result in a peeling factor 1, which will also align the access to 'ia[i+3]', and the loop could be - vectorized on all targets that support unaligned loads. */ + vectorized on all targets that support unaligned loads. + Without cost model on targets that support misaligned stores, no peeling + will be applied since we want to keep the four loads aligned. */ __attribute__ ((noinline)) int main1 (int n) @@ -50,7 +52,11 @@ int main1 (int n) using a peeling factor = VF-3%VF. This will result in a peeling factor 1 if VF=4,2. This will not align the access to 'sa[i+3]', for which we need to peel 5,1 iterations for VF=4,2 respectively, so the loop can not - be vectorized. */ + be vectorized. However, 'ia[i+3]' also gets aligned if we peel 5 + iterations, so the loop is vectorizable on all targets that support + unaligned loads. + Without cost model on targets that support misaligned stores, no peeling + will be applied since we want to keep the four loads aligned. */ __attribute__ ((noinline)) int main2 (int n) @@ -85,11 +91,10 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail {! vect_hw_misalign} } } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail {! 
vect_hw_misalign} } } } */ -/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_align } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" { target { vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 8 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 4 "vect" { target { vect_hw_misalign } } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-1.c b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c new file mode 100644 index 00000000000..ae7746389d3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-1.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +int ib[N+5]; + +__attribute__ ((noinline)) +int main1 () +{ + int i; + int ia[N+1]; + + /* All the accesses are misaligned. With cost model disabled, we count the + number of aligned accesses for each peeling option, and in this case + we align the two loads if possible (i.e., if misaligned stores are + supported). */ + for (i = 1; i <= N; i++) + { + ia[i] = ib[i+2] + ib[i+6]; + } + + /* check results: */ + for (i = 1; i <= N; i++) + { + if (ia[i] != ib[i+2] + ib[i+6]) + abort (); + } + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N+5; i++) + ib[i] = i; + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target vect_hw_misalign } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { xfail vect_no_align } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-2.c b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c new file mode 100644 index 00000000000..ee7b8dbe62e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-2.c @@ -0,0 +1,52 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +/* unaligned store. */ + +int ib[N+5]; + +__attribute__ ((noinline)) +int main1 () +{ + int i; + int ia[N+1]; + + /* The store is aligned and the loads are misaligned with the same + misalignment. Cost model is disabled. If misaligned stores are supported, + we peel according to the loads to align them. 
*/ + for (i = 0; i <= N; i++) + { + ia[i] = ib[i+2] + ib[i+6]; + } + + /* check results: */ + for (i = 1; i <= N; i++) + { + if (ia[i] != ib[i+2] + ib[i+6]) + abort (); + } + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N+5; i++) + ib[i] = i; + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_align } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" { target vect_hw_misalign } } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" { xfail { vect_no_align || vect_hw_misalign } } } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" { target vect_hw_misalign } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-3.c b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c new file mode 100644 index 00000000000..80f03c8d8af --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-3.c @@ -0,0 +1,55 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 128 +#define RES 21888 + +/* unaligned store. */ + +int ib[N+10]; +int ia[N+10]; +int ic[N+10]; + +__attribute__ ((noinline)) +int main1 () +{ + int i, suma = 0, sumb = 0, sumc = 0; + + /* ib and ic have same misalignment, we peel to align them. */ + for (i = 1; i <= N; i++) + { + suma += ia[i]; + sumb += ib[i+6]; + sumc += ic[i+2]; + } + + /* check results: */ + if (suma + sumb + sumc != RES) + abort (); + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N+10; i++) + { + ib[i] = i; + ic[i] = i+2; + ia[i] = i/2; + } + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-peel-4.c b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c new file mode 100644 index 00000000000..971d02334ac --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-peel-4.c @@ -0,0 +1,47 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 128 + +int ib[N+5]; + +__attribute__ ((noinline)) +int main1 () +{ + int i; + int ia[N+1]; + + /* Don't peel keeping one load and the store aligned. */ + for (i = 0; i <= N; i++) + { + ia[i] = ib[i] + ib[i+6]; + } + + /* check results: */ + for (i = 1; i <= N; i++) + { + if (ia[i] != ib[i] + ib[i+6]) + abort (); + } + + return 0; +} + +int main (void) +{ + int i; + + check_vect (); + + for (i = 0; i < N+5; i++) + ib[i] = i; + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "Alignment of access forced using peeling" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index cbefc1f01c4..cf9fab221a0 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -810,7 +810,11 @@ vect_compute_data_ref_alignment (struct data_reference *dr) NOTE: This is the only change to the code we make during the analysis phase, before deciding to vectorize the loop. 
*/ if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "force alignment"); + { + fprintf (vect_dump, "force alignment of "); + print_generic_expr (vect_dump, ref, TDF_SLIM); + } + DECL_ALIGN (base) = TYPE_ALIGN (vectype); DECL_USER_ALIGN (base) = 1; } @@ -967,7 +971,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) || !STMT_VINFO_VECTORIZABLE (stmt_info)) continue; - supportable_dr_alignment = vect_supportable_dr_alignment (dr); + supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); if (!supportable_dr_alignment) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) @@ -1061,6 +1065,189 @@ vector_alignment_reachable_p (struct data_reference *dr) return true; } + +/* Calculate the cost of the memory access represented by DR. */ + +static void +vect_get_data_access_cost (struct data_reference *dr, + unsigned int *inside_cost, + unsigned int *outside_cost) +{ + gimple stmt = DR_STMT (dr); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + int ncopies = vf / nunits; + bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); + + if (!supportable_dr_alignment) + *inside_cost = VECT_MAX_COST; + else + { + if (DR_IS_READ (dr)) + vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost); + else + vect_get_store_cost (dr, ncopies, inside_cost); + } + + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "vect_get_data_access_cost: inside_cost = %d, " + "outside_cost = %d.", *inside_cost, *outside_cost); +} + + +static hashval_t +vect_peeling_hash (const void *elem) +{ + const struct _vect_peel_info *peel_info; + + peel_info = (const struct _vect_peel_info *) elem; + return (hashval_t) peel_info->npeel; +} + + +static int +vect_peeling_hash_eq (const void *elem1, const void *elem2) +{ + const struct _vect_peel_info *a, *b; + + a = (const struct _vect_peel_info *) elem1; + b = (const struct _vect_peel_info *) elem2; + return (a->npeel == b->npeel); +} + + +/* Insert DR into peeling hash table with NPEEL as key. */ + +static void +vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr, + int npeel) +{ + struct _vect_peel_info elem, *slot; + void **new_slot; + bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); + + elem.npeel = npeel; + slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo), + &elem); + if (slot) + slot->count++; + else + { + slot = XNEW (struct _vect_peel_info); + slot->npeel = npeel; + slot->dr = dr; + slot->count = 1; + new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot, + INSERT); + *new_slot = slot; + } + + if (!supportable_dr_alignment && !flag_vect_cost_model) + slot->count += VECT_MAX_COST; +} + + +/* Traverse peeling hash table to find peeling option that aligns maximum + number of data accesses. */ + +static int +vect_peeling_hash_get_most_frequent (void **slot, void *data) +{ + vect_peel_info elem = (vect_peel_info) *slot; + vect_peel_extended_info max = (vect_peel_extended_info) data; + + if (elem->count > max->peel_info.count) + { + max->peel_info.npeel = elem->npeel; + max->peel_info.count = elem->count; + max->peel_info.dr = elem->dr; + } + + return 1; +} + + +/* Traverse peeling hash table and calculate cost for each peeling option. Find + one with the lowest cost. 
*/ + +static int +vect_peeling_hash_get_lowest_cost (void **slot, void *data) +{ + vect_peel_info elem = (vect_peel_info) *slot; + vect_peel_extended_info min = (vect_peel_extended_info) data; + int save_misalignment, dummy; + unsigned int inside_cost = 0, outside_cost = 0, i; + gimple stmt = DR_STMT (elem->dr); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); + struct data_reference *dr; + + for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) + { + stmt = DR_STMT (dr); + stmt_info = vinfo_for_stmt (stmt); + /* For interleaving, only the alignment of the first access + matters. */ + if (STMT_VINFO_STRIDED_ACCESS (stmt_info) + && DR_GROUP_FIRST_DR (stmt_info) != stmt) + continue; + + save_misalignment = DR_MISALIGNMENT (dr); + vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); + vect_get_data_access_cost (dr, &inside_cost, &outside_cost); + SET_DR_MISALIGNMENT (dr, save_misalignment); + } + + outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel, &dummy, + vect_get_single_scalar_iteraion_cost (loop_vinfo)); + + if (inside_cost < min->inside_cost + || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) + { + min->inside_cost = inside_cost; + min->outside_cost = outside_cost; + min->peel_info.dr = elem->dr; + min->peel_info.npeel = elem->npeel; + } + + return 1; +} + + +/* Choose best peeling option by traversing peeling hash table and either + choosing an option with the lowest cost (if cost model is enabled) or the + option that aligns as many accesses as possible. */ + +static struct data_reference * +vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo, + unsigned int *npeel) +{ + struct _vect_peel_extended_info res; + + res.peel_info.dr = NULL; + + if (flag_vect_cost_model) + { + res.inside_cost = INT_MAX; + res.outside_cost = INT_MAX; + htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo), + vect_peeling_hash_get_lowest_cost, &res); + } + else + { + res.peel_info.count = 0; + htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo), + vect_peeling_hash_get_most_frequent, &res); + } + + *npeel = res.peel_info.npeel; + return res.peel_info.dr; +} + + /* Function vect_enhance_data_refs_alignment This pass will use loop versioning and loop peeling in order to enhance @@ -1158,15 +1345,21 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); enum dr_alignment_support supportable_dr_alignment; - struct data_reference *dr0 = NULL; + struct data_reference *dr0 = NULL, *first_store = NULL; struct data_reference *dr; - unsigned int i; + unsigned int i, j; bool do_peeling = false; bool do_versioning = false; bool stat; gimple stmt; stmt_vec_info stmt_info; int vect_versioning_for_alias_required; + unsigned int npeel = 0; + bool all_misalignments_unknown = true; + unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + unsigned possible_npeel_number = 1; + tree vectype; + unsigned int nelements, mis, same_align_drs_max = 0; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "=== vect_enhance_data_refs_alignment ==="); @@ -1201,12 +1394,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) - How many accesses will become unaligned due to the peeling, and the cost of misaligned accesses. 
- The cost of peeling (the extra runtime checks, the increase - in code size). - - The scheme we use FORNOW: peel to force the alignment of the first - unsupported misaligned access in the loop. - - TODO: Use a cost model. */ + in code size). */ for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) { @@ -1219,15 +1407,108 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) && DR_GROUP_FIRST_DR (stmt_info) != stmt) continue; - if (!DR_IS_READ (dr) && !aligned_access_p (dr)) + supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); + do_peeling = vector_alignment_reachable_p (dr); + if (do_peeling) { - do_peeling = vector_alignment_reachable_p (dr); - if (do_peeling) - dr0 = dr; - if (!do_peeling && vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "vector alignment may not be reachable"); - break; - } + if (known_alignment_for_access_p (dr)) + { + unsigned int npeel_tmp; + + /* Save info about DR in the hash table. */ + if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo)) + LOOP_VINFO_PEELING_HTAB (loop_vinfo) = + htab_create (1, vect_peeling_hash, + vect_peeling_hash_eq, free); + + vectype = STMT_VINFO_VECTYPE (stmt_info); + nelements = TYPE_VECTOR_SUBPARTS (vectype); + mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE ( + TREE_TYPE (DR_REF (dr)))); + npeel_tmp = (nelements - mis) % vf; + + /* For multiple types, it is possible that the bigger type access + will have more than one peeling option. E.g., a loop with two + types: one of size (vector size / 4), and the other one of + size (vector size / 8). Vectorization factor will be 8. If both + accesses are misaligned by 3, the first one needs one scalar + iteration to be aligned, and the second one needs 5. But the + first one will be aligned also by peeling 5 scalar + iterations, and in that case both accesses will be aligned. + Hence, except for the immediate peeling amount, we also want + to try to add the full vector size, while we don't exceed the + vectorization factor. + We do this automatically for the cost model, since we calculate cost + for every peeling option. */ + if (!flag_vect_cost_model) + possible_npeel_number = vf / nelements; + + /* Handle the aligned case. We may decide to align some other + access, making DR unaligned. */ + if (DR_MISALIGNMENT (dr) == 0) + { + npeel_tmp = 0; + if (!flag_vect_cost_model) + possible_npeel_number++; + } + + for (j = 0; j < possible_npeel_number; j++) + { + gcc_assert (npeel_tmp <= vf); + vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp); + npeel_tmp += nelements; + } + + all_misalignments_unknown = false; + /* The data-ref that was chosen for the case that all the + misalignments are unknown is not relevant anymore, since we + have a data-ref with known alignment. */ + dr0 = NULL; + } + else + { + /* If we don't know all the misalignment values, we prefer + peeling for the data-ref that has the maximum number of data-refs + with the same alignment, unless the target prefers to align + stores over loads. */ + if (all_misalignments_unknown) + { + if (same_align_drs_max < VEC_length (dr_p, + STMT_VINFO_SAME_ALIGN_REFS (stmt_info)) + || !dr0) + { + same_align_drs_max = VEC_length (dr_p, + STMT_VINFO_SAME_ALIGN_REFS (stmt_info)); + dr0 = dr; + } + + if (!first_store && !DR_IS_READ (dr)) + first_store = dr; + } + + /* If there are both known and unknown misaligned accesses in the + loop, we choose the peeling amount according to the known + accesses. 
*/ + + + if (!supportable_dr_alignment) + { + dr0 = dr; + if (!first_store && !DR_IS_READ (dr)) + first_store = dr; + } + } + } + else + { + if (!aligned_access_p (dr)) + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "vector alignment may not be reachable"); + + break; + } + } } vect_versioning_for_alias_required @@ -1242,24 +1523,112 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))) do_peeling = false; + if (do_peeling && all_misalignments_unknown + && vect_supportable_dr_alignment (dr0, false)) + { + + /* Check if the target requires to prefer stores over loads, i.e., if + misaligned stores are more expensive than misaligned loads (taking + drs with same alignment into account). */ + if (first_store && DR_IS_READ (dr0)) + { + unsigned int load_inside_cost = 0, load_outside_cost = 0; + unsigned int store_inside_cost = 0, store_outside_cost = 0; + unsigned int load_inside_penalty = 0, load_outside_penalty = 0; + unsigned int store_inside_penalty = 0, store_outside_penalty = 0; + + vect_get_data_access_cost (dr0, &load_inside_cost, + &load_outside_cost); + vect_get_data_access_cost (first_store, &store_inside_cost, + &store_outside_cost); + + /* Calculate the penalty for leaving FIRST_STORE unaligned (by + aligning the load DR0). */ + load_inside_penalty = store_inside_cost; + load_outside_penalty = store_outside_cost; + for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS + (vinfo_for_stmt (DR_STMT (first_store))), + i, dr); + i++) + if (DR_IS_READ (dr)) + { + load_inside_penalty += load_inside_cost; + load_outside_penalty += load_outside_cost; + } + else + { + load_inside_penalty += store_inside_cost; + load_outside_penalty += store_outside_cost; + } + + /* Calculate the penalty for leaving DR0 unaligned (by + aligning the FIRST_STORE). */ + store_inside_penalty = load_inside_cost; + store_outside_penalty = load_outside_cost; + for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS + (vinfo_for_stmt (DR_STMT (dr0))), + i, dr); + i++) + if (DR_IS_READ (dr)) + { + store_inside_penalty += load_inside_cost; + store_outside_penalty += load_outside_cost; + } + else + { + store_inside_penalty += store_inside_cost; + store_outside_penalty += store_outside_cost; + } + + if (load_inside_penalty > store_inside_penalty + || (load_inside_penalty == store_inside_penalty + && load_outside_penalty > store_outside_penalty)) + dr0 = first_store; + } + + /* In case there are only loads with different unknown misalignments, use + peeling only if it may help to align other accesses in the loop. */ + if (!first_store && !VEC_length (dr_p, STMT_VINFO_SAME_ALIGN_REFS + (vinfo_for_stmt (DR_STMT (dr0)))) + && vect_supportable_dr_alignment (dr0, false) + != dr_unaligned_supported) + do_peeling = false; + } + + if (do_peeling && !dr0) + { + /* Peeling is possible, but there is no data access that is not supported + unless aligned. So we try to choose the best possible peeling. */ + + /* We should get here only if there are drs with known misalignment. */ + gcc_assert (!all_misalignments_unknown); + + /* Choose the best peeling from the hash table. 
*/ + dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel); + if (!dr0 || !npeel) + do_peeling = false; + } + if (do_peeling) { - int mis; - int npeel = 0; - gimple stmt = DR_STMT (dr0); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - int nelements = TYPE_VECTOR_SUBPARTS (vectype); + stmt = DR_STMT (dr0); + stmt_info = vinfo_for_stmt (stmt); + vectype = STMT_VINFO_VECTYPE (stmt_info); + nelements = TYPE_VECTOR_SUBPARTS (vectype); if (known_alignment_for_access_p (dr0)) { - /* Since it's known at compile time, compute the number of iterations - in the peeled loop (the peeling factor) for use in updating - DR_MISALIGNMENT values. The peeling factor is the vectorization - factor minus the misalignment as an element count. */ - mis = DR_MISALIGNMENT (dr0); - mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); - npeel = nelements - mis; + if (!npeel) + { + /* Since it's known at compile time, compute the number of + iterations in the peeled loop (the peeling factor) for use in + updating DR_MISALIGNMENT values. The peeling factor is the + vectorization factor minus the misalignment as an element + count. */ + mis = DR_MISALIGNMENT (dr0); + mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); + npeel = nelements - mis; + } /* For interleaved data access every iteration accesses all the members of the group, therefore we divide the number of iterations @@ -1290,7 +1659,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) save_misalignment = DR_MISALIGNMENT (dr); vect_update_misalignment_for_peel (dr, dr0, npeel); - supportable_dr_alignment = vect_supportable_dr_alignment (dr); + supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); SET_DR_MISALIGNMENT (dr, save_misalignment); if (!supportable_dr_alignment) @@ -1300,6 +1669,15 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) } } + if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) + { + stat = vect_verify_datarefs_alignment (loop_vinfo, NULL); + if (!stat) + do_peeling = false; + else + return stat; + } + if (do_peeling) { /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 
@@ -1314,7 +1692,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) vect_update_misalignment_for_peel (dr, dr0, npeel); LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; - LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0); + if (npeel) + LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; + else + LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0); SET_DR_MISALIGNMENT (dr0, 0); if (vect_print_dump_info (REPORT_ALIGNMENT)) fprintf (vect_dump, "Alignment of access forced using peeling."); @@ -1358,7 +1739,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) && DR_GROUP_FIRST_DR (stmt_info) != stmt)) continue; - supportable_dr_alignment = vect_supportable_dr_alignment (dr); + supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); if (!supportable_dr_alignment) { @@ -1467,7 +1848,7 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr, if (DDR_ARE_DEPENDENT (ddr) == chrec_known) return; - if ((DR_IS_READ (dra) && DR_IS_READ (drb)) || dra == drb) + if (dra == drb) return; if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) @@ -3558,13 +3939,16 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment) return (alignment <= MAX_STACK_ALIGNMENT); } -/* Function vect_supportable_dr_alignment - Return whether the data reference DR is supported with respect to its +/* Return whether the data reference DR is supported with respect to its + alignment. + If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even + if it is aligned, i.e., check if it is possible to vectorize it with different + alignment. */ enum dr_alignment_support -vect_supportable_dr_alignment (struct data_reference *dr) +vect_supportable_dr_alignment (struct data_reference *dr, + bool check_aligned_accesses) { gimple stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); @@ -3574,7 +3958,7 @@ vect_supportable_dr_alignment (struct data_reference *dr) struct loop *vect_loop = NULL; bool nested_in_vect_loop = false; - if (aligned_access_p (dr)) + if (aligned_access_p (dr) && !check_aligned_accesses) return dr_aligned; if (!loop_vinfo) diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index f8922a2308a..38546cf1748 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -1976,25 +1976,18 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters, tree vectype = STMT_VINFO_VECTYPE (stmt_info); int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT; tree niters_type = TREE_TYPE (loop_niters); - int step = 1; - int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); int nelements = TYPE_VECTOR_SUBPARTS (vectype); - if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) - step = DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_DR (stmt_info))); - pe = loop_preheader_edge (loop); if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) { - int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); - int elem_misalign = byte_misalign / element_size; + int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "known alignment = %d.", byte_misalign); + fprintf (vect_dump, "known peeling = %d.", npeel); - iters = build_int_cst (niters_type, - (((nelements - elem_misalign) & (nelements - 1)) / step)); + iters = build_int_cst (niters_type, npeel); } else { @@ -2017,7 +2010,8 @@ vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters, /* Create: byte_misalign = addr & (vectype_size - 1) */ byte_misalign = 
- fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1); + fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), + vectype_size_minus_1); /* Create: elem_misalign = byte_misalign / element_size */ elem_misalign = @@ -2323,7 +2317,8 @@ vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) tree segment_length = fold_build2 (MULT_EXPR, integer_type_node, DR_STEP (dr), vect_factor); - if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized) + if (vect_supportable_dr_alignment (dr, false) + == dr_explicit_realign_optimized) { tree vector_size = TYPE_SIZE_UNIT (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 3b387169408..ef481735518 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -755,6 +755,7 @@ new_loop_vec_info (struct loop *loop) LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10); LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; + LOOP_VINFO_PEELING_HTAB (res) = NULL; return res; } @@ -845,6 +846,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo)); + if (LOOP_VINFO_PEELING_HTAB (loop_vinfo)) + htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo)); + free (loop_vinfo); loop->aux = NULL; } @@ -1122,7 +1126,11 @@ vect_analyze_loop_form (struct loop *loop) static inline int vect_get_cost (enum vect_cost_for_stmt type_of_cost) { - return targetm.vectorize.builtin_vectorization_cost (type_of_cost); + tree dummy_type = NULL; + int dummy = 0; + + return targetm.vectorize.builtin_vectorization_cost (type_of_cost, + dummy_type, dummy); } @@ -1498,17 +1506,6 @@ vect_analyze_loop (struct loop *loop) return NULL; } - /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ - ok = vect_analyze_slp (loop_vinfo, NULL); - if (ok) - { - /* Decide which possible SLP instances to SLP. */ - vect_make_slp_decision (loop_vinfo); - - /* Find stmts that need to be both vectorized and SLPed. */ - vect_detect_hybrid_slp (loop_vinfo); - } - /* This pass will decide on using loop versioning and/or loop peeling in order to enhance the alignment of data references in the loop. */ @@ -1516,11 +1513,22 @@ vect_analyze_loop (struct loop *loop) if (!ok) { if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "bad data alignment."); + fprintf (vect_dump, "bad data alignment."); destroy_loop_vec_info (loop_vinfo, true); return NULL; } + /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ + ok = vect_analyze_slp (loop_vinfo, NULL); + if (ok) + { + /* Decide which possible SLP instances to SLP. */ + vect_make_slp_decision (loop_vinfo); + + /* Find stmts that need to be both vectorized and SLPed. */ + vect_detect_hybrid_slp (loop_vinfo); + } + /* Scan all the operations in the loop and make sure they are vectorizable. */ @@ -2004,6 +2012,94 @@ vect_force_simple_reduction (loop_vec_info loop_info, gimple phi, double_reduc, true); } +/* Calculate the cost of one scalar iteration of the loop. */ +int +vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); + int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0; + int innerloop_iters, i, stmt_cost; + + /* Count statements in scalar loop. 
Using this as scalar cost for a single + iteration for now. + + TODO: Add outer loop support. + + TODO: Consider assigning different costs to different scalar + statements. */ + + /* FORNOW. */ + if (loop->inner) + innerloop_iters = 50; /* FIXME */ + + for (i = 0; i < nbbs; i++) + { + gimple_stmt_iterator si; + basic_block bb = bbs[i]; + + if (bb->loop_father == loop->inner) + factor = innerloop_iters; + else + factor = 1; + + for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) + { + gimple stmt = gsi_stmt (si); + + if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) + continue; + + if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))) + { + if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))) + stmt_cost = vect_get_cost (scalar_load); + else + stmt_cost = vect_get_cost (scalar_store); + } + else + stmt_cost = vect_get_cost (scalar_stmt); + + scalar_single_iter_cost += stmt_cost * factor; + } + } + return scalar_single_iter_cost; +} + +/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ +int +vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, + int *peel_iters_epilogue, + int scalar_single_iter_cost) +{ + int peel_guard_costs = 0; + int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { + *peel_iters_epilogue = vf/2; + if (vect_print_dump_info (REPORT_COST)) + fprintf (vect_dump, "cost model: " + "epilogue peel iters set to vf/2 because " + "loop iterations are unknown ."); + + /* If peeled iterations are known but number of scalar loop + iterations are unknown, count a taken branch per peeled loop. */ + peel_guard_costs = 2 * vect_get_cost (cond_branch_taken); + } + else + { + int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); + peel_iters_prologue = niters < peel_iters_prologue ? + niters : peel_iters_prologue; + *peel_iters_epilogue = (niters - peel_iters_prologue) % vf; + } + + return (peel_iters_prologue * scalar_single_iter_cost) + + (*peel_iters_epilogue * scalar_single_iter_cost) + + peel_guard_costs; +} + /* Function vect_estimate_min_profitable_iters Return the number of iterations required for the vector version of the @@ -2028,7 +2124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; - int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); + int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); int peel_guard_costs = 0; int innerloop_iters = 0, factor; VEC (slp_instance, heap) *slp_instances; @@ -2099,7 +2195,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) && (!STMT_VINFO_LIVE_P (stmt_info) || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) continue; - scalar_single_iter_cost += cost_for_stmt (stmt) * factor; vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; /* FIXME: for stmts in the inner-loop in outer-loop vectorization, some of the "outside" costs are generated inside the outer-loop. */ @@ -2107,6 +2202,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) } } + scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo); + /* Add additional cost for the peeled instructions in prologue and epilogue loop. @@ -2116,7 +2213,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) TODO: Build an expression that represents peel_iters for prologue and epilogue to be used in a run-time test. 
@@ -2028,7 +2124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
-  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
   int peel_guard_costs = 0;
   int innerloop_iters = 0, factor;
   VEC (slp_instance, heap) *slp_instances;
@@ -2099,7 +2195,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
               && (!STMT_VINFO_LIVE_P (stmt_info)
                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
             continue;
-          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
              some of the "outside" costs are generated inside the outer-loop.  */
@@ -2107,6 +2202,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
         }
     }
 
+  scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
+
   /* Add additional cost for the peeled instructions in prologue and epilogue
      loop.
@@ -2116,7 +2213,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (byte_misalign < 0)
+  if (npeel < 0)
     {
       peel_iters_prologue = vf/2;
       if (vect_print_dump_info (REPORT_COST))
@@ -2137,46 +2234,18 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
          not known. Hence guards remain the same.  */
       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
                                 + vect_get_cost (cond_branch_not_taken));
+      vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
+                          + (peel_iters_epilogue * scalar_single_iter_cost)
+                          + peel_guard_costs;
     }
   else
     {
-      if (byte_misalign)
-        {
-          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
-          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
-          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
-          int nelements = TYPE_VECTOR_SUBPARTS (vectype);
-
-          peel_iters_prologue = nelements - (byte_misalign / element_size);
-        }
-      else
-        peel_iters_prologue = 0;
-
-      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-        {
-          peel_iters_epilogue = vf/2;
-          if (vect_print_dump_info (REPORT_COST))
-            fprintf (vect_dump, "cost model: "
-                     "epilogue peel iters set to vf/2 because "
-                     "loop iterations are unknown .");
-
-          /* If peeled iterations are known but number of scalar loop
-             iterations are unknown, count a taken branch per peeled loop.  */
-          peel_guard_costs += 2 * vect_get_cost (cond_branch_taken);
-        }
-      else
-        {
-          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
-          peel_iters_prologue = niters < peel_iters_prologue ?
-                                niters : peel_iters_prologue;
-          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
-        }
+      peel_iters_prologue = npeel;
+      vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
+                                    peel_iters_prologue, &peel_iters_epilogue,
+                                    scalar_single_iter_cost);
     }
 
-  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
-                      + (peel_iters_epilogue * scalar_single_iter_cost)
-                      + peel_guard_costs;
-
   /* FORNOW: The scalar outside cost is incremented in one of the
      following ways:
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 5f753a26935..1ae3a652bd9 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -560,7 +560,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       if (first_load == stmt)
         {
           first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-          if (vect_supportable_dr_alignment (first_dr)
+          if (vect_supportable_dr_alignment (first_dr, false)
              == dr_unaligned_unsupported)
            {
              if (vect_print_dump_info (REPORT_SLP))
@@ -646,7 +646,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
        {
          VEC_safe_push (slp_tree, heap, *loads, *node);
          *inside_cost
-            += targetm.vectorize.builtin_vectorization_cost (vec_perm)
+            += targetm.vectorize.builtin_vectorization_cost (vec_perm, NULL, 0)
               * group_size;
        }
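Note that both cost wrappers (vect_get_cost above and vect_get_stmt_cost in the tree-vect-stmts.c hunk that follows) pass a NULL type and zero misalignment, so only call sites that know the data reference supply real values. Here is a standalone model of a target cost function using the two new parameters; the enum mirrors vect_cost_for_stmt, and every cost value is invented, not the targhooks.c or i386.c numbers.

    #include <stdio.h>

    enum cost_kind { scalar_stmt_k, vector_stmt_k, vector_load_k,
                     vector_store_k, unaligned_load_k, unaligned_store_k };

    /* A target can now grade the penalty by the known misalignment,
       e.g. cheaper when the access stays within one aligned chunk.  */
    static int
    model_vectorization_cost (enum cost_kind kind, int misalign)
    {
      switch (kind)
        {
        case unaligned_load_k:
        case unaligned_store_k:
          return misalign % 8 == 0 ? 2 : 4;
        default:
          return 1;
        }
    }

    int
    main (void)
    {
      printf ("misalign 8: %d\n", model_vectorization_cost (unaligned_store_k, 8));
      printf ("misalign 3: %d\n", model_vectorization_cost (unaligned_store_k, 3));
      return 0;
    }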
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c95fe7d273e..89e7c4b0ebd 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -545,6 +545,18 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
 }
 
+/* Get the cost of a statement by calling the target cost builtin.  */
+
+static inline
+int vect_get_stmt_cost (enum vect_cost_for_stmt type_of_cost)
+{
+  tree dummy_type = NULL;
+  int dummy = 0;
+
+  return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
+                                                       dummy_type, dummy);
+}
+
 int
 cost_for_stmt (gimple stmt)
 {
@@ -553,9 +565,9 @@ cost_for_stmt (gimple stmt)
   switch (STMT_VINFO_TYPE (stmt_info))
     {
     case load_vec_info_type:
-      return targetm.vectorize.builtin_vectorization_cost (scalar_load);
+      return vect_get_stmt_cost (scalar_load);
     case store_vec_info_type:
-      return targetm.vectorize.builtin_vectorization_cost (scalar_store);
+      return vect_get_stmt_cost (scalar_store);
     case op_vec_info_type:
     case condition_vec_info_type:
     case assignment_vec_info_type:
@@ -565,7 +577,7 @@ cost_for_stmt (gimple stmt)
     case type_demotion_vec_info_type:
     case type_conversion_vec_info_type:
     case call_vec_info_type:
-      return targetm.vectorize.builtin_vectorization_cost (scalar_stmt);
+      return vect_get_stmt_cost (scalar_stmt);
     case undef_vec_info_type:
     default:
      gcc_unreachable ();
@@ -589,15 +601,13 @@ vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
   if (PURE_SLP_STMT (stmt_info))
     return;
 
-  inside_cost = ncopies
-    * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+  inside_cost = ncopies * vect_get_stmt_cost (vector_stmt);
 
   /* FORNOW: Assuming maximum 2 args per stmts.  */
   for (i = 0; i < 2; i++)
     {
       if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
-        outside_cost
-          += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+        outside_cost += vect_get_stmt_cost (vector_stmt);
     }
 
   if (vect_print_dump_info (REPORT_COST))
@@ -638,22 +648,39 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type dt, slp_tree slp_node)
 {
   int group_size;
-  int inside_cost = 0, outside_cost = 0;
+  unsigned int inside_cost = 0, outside_cost = 0;
+  struct data_reference *first_dr;
+  gimple first_stmt;
 
   /* The SLP costs were already calculated during SLP tree build.  */
   if (PURE_SLP_STMT (stmt_info))
     return;
 
   if (dt == vect_constant_def || dt == vect_external_def)
-    outside_cost
-      = targetm.vectorize.builtin_vectorization_cost (scalar_to_vec);
+    outside_cost = vect_get_stmt_cost (scalar_to_vec);
 
   /* Strided access?  */
-  if (DR_GROUP_FIRST_DR (stmt_info) && !slp_node)
-    group_size = vect_cost_strided_group_size (stmt_info);
+  if (DR_GROUP_FIRST_DR (stmt_info))
+    {
+      if (slp_node)
+        {
+          first_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (slp_node), 0);
+          group_size = 1;
+        }
+      else
+        {
+          first_stmt = DR_GROUP_FIRST_DR (stmt_info);
+          group_size = vect_cost_strided_group_size (stmt_info);
+        }
+
+      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+    }
   /* Not a strided access.  */
   else
-    group_size = 1;
+    {
+      group_size = 1;
+      first_dr = STMT_VINFO_DATA_REF (stmt_info);
+    }
 
   /* Is this an access in a group of stores, which provide strided access?
      If so, add in the cost of the permutes.  */
@@ -661,7 +688,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
     {
       /* Uses a high and low interleave operation for each needed permute.  */
       inside_cost = ncopies * exact_log2(group_size) * group_size
-        * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+                    * vect_get_stmt_cost (vector_stmt);
 
       if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
@@ -670,8 +697,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
     }
 
   /* Costs of the stores.  */
-  inside_cost += ncopies
-    * targetm.vectorize.builtin_vectorization_cost (vector_store);
+  vect_get_store_cost (first_dr, ncopies, &inside_cost);
 
   if (vect_print_dump_info (REPORT_COST))
     fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
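The interleave term above charges exact_log2 (GROUP_SIZE) * GROUP_SIZE high/low permute operations per copy of a strided store. A standalone model of that formula, with illustrative unit costs and numbers:

    #include <stdio.h>

    static int
    strided_store_permute_cost (int ncopies, int group_size,
                                int vector_stmt_cost)
    {
      /* floor(log2), which equals exact_log2 for powers of two.  */
      int log2_size = 0;
      while ((1 << (log2_size + 1)) <= group_size)
        log2_size++;
      return ncopies * log2_size * group_size * vector_stmt_cost;
    }

    int
    main (void)
    {
      /* A group of 4 interleaved stores: 2 * 4 = 8 permutes per copy.  */
      printf ("%d\n", strided_store_permute_cost (1, 4, 1));
      return 0;
    }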
@@ -683,6 +709,49 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
 }
 
 
+/* Calculate cost of DR's memory access.  */
+void
+vect_get_store_cost (struct data_reference *dr, int ncopies,
+                     unsigned int *inside_cost)
+{
+  int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+
+  switch (alignment_support_scheme)
+    {
+    case dr_aligned:
+      {
+        *inside_cost += ncopies * vect_get_stmt_cost (vector_store);
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_store_cost: aligned.");
+
+        break;
+      }
+
+    case dr_unaligned_supported:
+      {
+        gimple stmt = DR_STMT (dr);
+        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+        /* Here, we assign an additional cost for the unaligned store.  */
+        *inside_cost += ncopies
+          * targetm.vectorize.builtin_vectorization_cost (unaligned_store,
+                                               vectype, DR_MISALIGNMENT (dr));
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_store_cost: unaligned supported by "
+                   "hardware.");
+
+        break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+
 /* Function vect_model_load_cost
 
    Models cost for loads.  In the case of strided accesses, the last access
@@ -695,10 +764,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
 
 {
   int group_size;
-  int alignment_support_cheme;
   gimple first_stmt;
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
-  int inside_cost = 0, outside_cost = 0;
+  unsigned int inside_cost = 0, outside_cost = 0;
 
   /* The SLP costs were already calculated during SLP tree build.  */
   if (PURE_SLP_STMT (stmt_info))
@@ -718,29 +786,47 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
       first_dr = dr;
     }
 
-  alignment_support_cheme = vect_supportable_dr_alignment (first_dr);
-
   /* Is this an access in a group of loads providing strided access?
      If so, add in the cost of the permutes.  */
   if (group_size > 1)
     {
       /* Uses an even and odd extract operations for each needed permute.  */
       inside_cost = ncopies * exact_log2(group_size) * group_size
-        * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+                    * vect_get_stmt_cost (vector_stmt);
 
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                  group_size);
-
     }
 
   /* The loads themselves.  */
-  switch (alignment_support_cheme)
+  vect_get_load_cost (first_dr, ncopies,
+         ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node),
+         &inside_cost, &outside_cost);
+
+  if (vect_print_dump_info (REPORT_COST))
+    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
+             "outside_cost = %d .", inside_cost, outside_cost);
+
+  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
+  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
+  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
+}
+
+
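vect_get_store_cost above dispatches purely on the alignment scheme: an aligned store pays the plain vector_store cost, while a hardware-supported unaligned store is charged through the new unaligned_store entry, which may now depend on the misalignment. A standalone model of that dispatch with invented unit costs:

    #include <stdio.h>

    enum alignment_scheme { aligned_dr, unaligned_supported_dr };

    static void
    model_store_cost (enum alignment_scheme scheme, int ncopies, int misalign,
                      unsigned int *inside_cost)
    {
      switch (scheme)
        {
        case aligned_dr:
          *inside_cost += ncopies * 1;                  /* vector_store */
          break;
        case unaligned_supported_dr:
          *inside_cost += ncopies * (misalign ? 4 : 2); /* unaligned_store */
          break;
        }
    }

    int
    main (void)
    {
      unsigned int cost = 0;
      model_store_cost (unaligned_supported_dr, 2, 3, &cost);
      printf ("%u\n", cost);                            /* 2 * 4 = 8 */
      return 0;
    }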
+/* Calculate cost of DR's memory access.  */
+void
+vect_get_load_cost (struct data_reference *dr, int ncopies,
+                    bool add_realign_cost, unsigned int *inside_cost,
+                    unsigned int *outside_cost)
+{
+  int alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
+
+  switch (alignment_support_scheme)
     {
     case dr_aligned:
       {
-        inside_cost += ncopies
-          * targetm.vectorize.builtin_vectorization_cost (vector_load);
+        *inside_cost += ncopies * vect_get_stmt_cost (vector_load);
 
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_load_cost: aligned.");
@@ -749,10 +835,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
       }
     case dr_unaligned_supported:
       {
-        /* Here, we assign an additional cost for the unaligned load.  */
-        inside_cost += ncopies
-          * targetm.vectorize.builtin_vectorization_cost (unaligned_load);
+        gimple stmt = DR_STMT (dr);
+        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
+        /* Here, we assign an additional cost for the unaligned load.  */
+        *inside_cost += ncopies
+          * targetm.vectorize.builtin_vectorization_cost (unaligned_load,
+                                               vectype, DR_MISALIGNMENT (dr));
         if (vect_print_dump_info (REPORT_COST))
           fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                    "hardware.");
@@ -761,16 +851,14 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
       }
     case dr_explicit_realign:
       {
-        inside_cost += ncopies * (2
-          * targetm.vectorize.builtin_vectorization_cost (vector_load)
-          + targetm.vectorize.builtin_vectorization_cost (vector_stmt));
+        *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
+                                   + vect_get_stmt_cost (vector_stmt));
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
            outside costs.  */
         if (targetm.vectorize.builtin_mask_for_load)
-          inside_cost
-            += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+          *inside_cost += vect_get_stmt_cost (vector_stmt);
 
         break;
       }
@@ -787,32 +875,21 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)
            access in the group.  Inside the loop, there is a load op
            and a realignment op.  */
 
-        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
+        if (add_realign_cost)
           {
-            outside_cost = 2
-              * targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+            *outside_cost = 2 * vect_get_stmt_cost (vector_stmt);
             if (targetm.vectorize.builtin_mask_for_load)
-              outside_cost
-                += targetm.vectorize.builtin_vectorization_cost (vector_stmt);
+              *outside_cost += vect_get_stmt_cost (vector_stmt);
           }
 
-        inside_cost += ncopies
-          * (targetm.vectorize.builtin_vectorization_cost (vector_load)
-             + targetm.vectorize.builtin_vectorization_cost (vector_stmt));
+        *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
+                                   + vect_get_stmt_cost (vector_stmt));
         break;
       }
 
     default:
       gcc_unreachable ();
     }
-
-  if (vect_print_dump_info (REPORT_COST))
-    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
-             "outside_cost = %d .", inside_cost, outside_cost);
-
-  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
-  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
-  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
 }
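In the dr_explicit_realign_optimized case above, the realignment token and mask are set up once before the loop, so that cost is charged only when ADD_REALIGN_COST says this access is the first of its group; every copy still pays one vector load plus one realignment statement inside the loop. A standalone model with invented unit costs:

    #include <stdio.h>
    #include <stdbool.h>

    static void
    model_realign_optimized_cost (int ncopies, bool add_realign_cost,
                                  bool has_mask_builtin,
                                  unsigned int *inside_cost,
                                  unsigned int *outside_cost)
    {
      if (add_realign_cost)
        {
          *outside_cost = 2;             /* two vector_stmt setup ops */
          if (has_mask_builtin)
            *outside_cost += 1;          /* plus the mask computation */
        }
      *inside_cost += ncopies * (1 + 1); /* vector_load + vector_stmt */
    }

    int
    main (void)
    {
      unsigned int inside = 0, outside = 0;
      model_realign_optimized_cost (4, true, true, &inside, &outside);
      printf ("inside = %u, outside = %u\n", inside, outside); /* 8, 3 */
      return 0;
    }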
@@ -3142,7 +3219,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       dr_chain = VEC_alloc (tree, heap, group_size);
       oprnds = VEC_alloc (tree, heap, group_size);
 
-  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+  alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false);
   gcc_assert (alignment_support_scheme);
 
   /* In case the vectorization factor (VF) is bigger than the number
@@ -3507,7 +3584,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
       group_size = vec_num = 1;
     }
 
-  alignment_support_scheme = vect_supportable_dr_alignment (first_dr);
+  alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false);
   gcc_assert (alignment_support_scheme);
 
   /* In case the vectorization factor (VF) is bigger than the number
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bf6769c69a7..ed8ff58312a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -170,6 +170,21 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
 #define SLP_TREE_OUTSIDE_OF_LOOP_COST(S)         (S)->cost.outside_of_loop
 #define SLP_TREE_INSIDE_OF_LOOP_COST(S)          (S)->cost.inside_of_loop
 
+
+typedef struct _vect_peel_info
+{
+  int npeel;
+  struct data_reference *dr;
+  unsigned int count;
+} *vect_peel_info;
+
+typedef struct _vect_peel_extended_info
+{
+  struct _vect_peel_info peel_info;
+  unsigned int inside_cost;
+  unsigned int outside_cost;
+} *vect_peel_extended_info;
+
 /*-----------------------------------------------------------------*/
 /* Info on vectorized loops.                                       */
 /*-----------------------------------------------------------------*/
@@ -245,6 +260,10 @@ typedef struct _loop_vec_info {
 
   /* Reduction cycles detected in the loop. Used in loop-aware SLP.  */
   VEC (gimple, heap) *reductions;
+
+  /* Hash table used to choose the best peeling option.  */
+  htab_t peeling_htab;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -270,6 +289,7 @@ typedef struct _loop_vec_info {
 #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
 #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
 #define LOOP_VINFO_REDUCTIONS(L)           (L)->reductions
+#define LOOP_VINFO_PEELING_HTAB(L)         (L)->peeling_htab
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
 VEC_length (gimple, (L)->may_misalign_stmts) > 0
@@ -543,6 +563,8 @@ typedef struct _stmt_vec_info {
 #define PURE_SLP_STMT(S)                  ((S)->slp_type == pure_slp)
 #define STMT_SLP_TYPE(S)                   (S)->slp_type
 
+#define VECT_MAX_COST 1000
+
 /* The maximum number of intermediate steps required in multi-step type
    conversion.  */
 #define MAX_INTERM_CVT_STEPS   3
@@ -743,11 +765,14 @@ extern void vect_remove_stores (gimple);
 extern bool vect_analyze_stmt (gimple, bool *, slp_tree);
 extern bool vectorizable_condition (gimple, gimple_stmt_iterator *, gimple *,
                                     tree, int);
+extern void vect_get_load_cost (struct data_reference *, int, bool,
+                                unsigned int *, unsigned int *);
+extern void vect_get_store_cost (struct data_reference *, int, unsigned int *);
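The _vect_peel_info entries just added let the alignment pass record, for each candidate peel amount NPEEL, how many data references that amount would align, while _vect_peel_extended_info carries the costs of the best candidates. A standalone model of the counting step; a flat array stands in for the libiberty hash table and every name here is invented:

    #include <stdio.h>

    struct peel_entry { int npeel; unsigned int count; };

    static struct peel_entry table[64];
    static int n_entries;

    static void
    peeling_insert (int npeel)
    {
      for (int i = 0; i < n_entries; i++)
        if (table[i].npeel == npeel)
          {
            table[i].count++;              /* one more dr aligned by NPEEL */
            return;
          }
      if (n_entries < 64)
        {
          table[n_entries].npeel = npeel;  /* new candidate peel amount */
          table[n_entries].count = 1;
          n_entries++;
        }
    }

    int
    main (void)
    {
      peeling_insert (2);
      peeling_insert (2);
      peeling_insert (5);
      for (int i = 0; i < n_entries; i++)
        printf ("npeel = %d aligns %u drs\n", table[i].npeel, table[i].count);
      return 0;
    }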
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
 extern enum dr_alignment_support vect_supportable_dr_alignment
-                                           (struct data_reference *);
+                                           (struct data_reference *, bool);
 extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
                                            HOST_WIDE_INT *);
 extern bool vect_analyze_data_ref_dependences (loop_vec_info, bb_vec_info,
@@ -795,7 +820,8 @@ extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
 extern int vect_estimate_min_profitable_iters (loop_vec_info);
 extern tree get_initial_def_for_reduction (gimple, tree, tree *);
 extern int vect_min_worthwhile_factor (enum tree_code);
-
+extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, int);
+extern int vect_get_single_scalar_iteraion_cost (loop_vec_info);
 
 /* In tree-vect-slp.c.  */
 extern void vect_free_slp_instance (slp_instance);
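Every caller updated by this patch passes false for the new vect_supportable_dr_alignment flag, which keeps the old behaviour. The sketch below models one plausible reading of the flag, inferred from this patch alone: when the flag is set, the full target check runs even for references already known to be aligned. The names and the decision logic are an assumption, not quoted GCC code.

    #include <stdio.h>
    #include <stdbool.h>

    enum dr_align { aligned_ok, unaligned_ok, unaligned_unsupported };

    static enum dr_align
    model_supportable_alignment (bool is_aligned, bool target_misalign_ok,
                                 bool check_aligned_accesses)
    {
      /* Old fast path: aligned references need no further checking
         unless the caller asks for it (assumed semantics).  */
      if (is_aligned && !check_aligned_accesses)
        return aligned_ok;
      return target_misalign_ok ? unaligned_ok : unaligned_unsupported;
    }

    int
    main (void)
    {
      printf ("%d\n", model_supportable_alignment (true, false, false)); /* 0 */
      printf ("%d\n", model_supportable_alignment (true, false, true));  /* 2 */
      return 0;
    }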