author     Sam Thursfield <sam.thursfield@codethink.co.uk>  2017-11-13 16:28:05 +0000
committer  Sam Thursfield <sam.thursfield@codethink.co.uk>  2017-11-13 16:29:09 +0000
commit     03ac50856c9fc8c96b7a17239ee40a10397750a7 (patch)
tree       a648c6d3428e4757e003f6ed1748adb9613065db /gcc/tree-vect-data-refs.c
parent     34efdaf078b01a7387007c4e6bde6db86384c4b7 (diff)
download   gcc-tarball-03ac50856c9fc8c96b7a17239ee40a10397750a7.tar.gz
gcc 7.2.0
This is imported manually due to a bug in the tarball import script.
See the baserock-dev mailing list archives (November 2017) for a
more detailed explanation of the issue.
Diffstat (limited to 'gcc/tree-vect-data-refs.c')
-rw-r--r--  gcc/tree-vect-data-refs.c  6182
1 file changed, 0 insertions, 6182 deletions
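The deleted file below implemented the vectorizer's data-reference analysis: dependence testing, runtime alias checks, and alignment/peeling decisions. Three small calculations recur throughout it: clamping the maximal vectorization factor to a known dependence distance, computing how many scalar iterations to peel so that a misaligned access becomes aligned, and masking the low bits of an address for the runtime alignment test. What follows is a minimal standalone C sketch of just those three calculations, taken from the formulas stated in the file's own comments; every name in the sketch is hypothetical, and it is not code from GCC.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* A dependence distance DIST between two accesses only constrains
   vectorization when 2 <= |DIST| < MAX_VF; in that case the maximal
   vectorization factor has to drop to |DIST|.  Distances >= MAX_VF
   never fall inside one vector iteration and can be ignored.  */
static int
clamp_max_vf (int max_vf, int dist)
{
  int d = abs (dist);
  return (d >= 2 && d < max_vf) ? d : max_vf;
}

/* Scalar iterations to peel so that an access misaligned by MIS
   elements becomes aligned, for vectors of NELEMENTS elements
   (a power of two).  NEGATIVE is nonzero for a backward-running
   access, where the sign of the adjustment flips.  */
static unsigned
peel_count (unsigned nelements, unsigned mis, int negative)
{
  return (negative ? mis - nelements : nelements - mis) & (nelements - 1);
}

/* Runtime alignment test: the low bits of an aligned address must be
   zero, so for a 16-byte vector the mask is 15 = 0xf.  */
static int
is_aligned (const void *addr, unsigned vector_size)
{
  return ((uintptr_t) addr & (uintptr_t) (vector_size - 1)) == 0;
}

int
main (void)
{
  /* A distance-3 dependence caps an initial max_vf of 8 at 3.  */
  printf ("max_vf = %d\n", clamp_max_vf (8, 3));

  /* An access misaligned by 3 of 4 elements needs (4 - 3) & 3 == 1
     peeled iteration.  */
  printf ("npeel = %u\n", peel_count (4, 3, 0));

  /* Whether a stack buffer happens to be 16-byte aligned.  */
  int buf[16];
  printf ("aligned = %d\n", is_aligned (buf, 16));
  return 0;
}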
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c deleted file mode 100644 index aa504b6a1c..0000000000 --- a/gcc/tree-vect-data-refs.c +++ /dev/null @@ -1,6182 +0,0 @@ -/* Data References Analysis and Manipulation Utilities for Vectorization. - Copyright (C) 2003-2017 Free Software Foundation, Inc. - Contributed by Dorit Naishlos <dorit@il.ibm.com> - and Ira Rosen <irar@il.ibm.com> - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 3, or (at your option) any later -version. - -GCC is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -<http://www.gnu.org/licenses/>. */ - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "target.h" -#include "rtl.h" -#include "tree.h" -#include "gimple.h" -#include "predict.h" -#include "memmodel.h" -#include "tm_p.h" -#include "ssa.h" -#include "optabs-tree.h" -#include "cgraph.h" -#include "dumpfile.h" -#include "alias.h" -#include "fold-const.h" -#include "stor-layout.h" -#include "tree-eh.h" -#include "gimplify.h" -#include "gimple-iterator.h" -#include "gimplify-me.h" -#include "tree-ssa-loop-ivopts.h" -#include "tree-ssa-loop-manip.h" -#include "tree-ssa-loop.h" -#include "cfgloop.h" -#include "tree-scalar-evolution.h" -#include "tree-vectorizer.h" -#include "expr.h" -#include "builtins.h" -#include "params.h" - -/* Return true if load- or store-lanes optab OPTAB is implemented for - COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ - -static bool -vect_lanes_optab_supported_p (const char *name, convert_optab optab, - tree vectype, unsigned HOST_WIDE_INT count) -{ - machine_mode mode, array_mode; - bool limit_p; - - mode = TYPE_MODE (vectype); - limit_p = !targetm.array_mode_supported_p (mode, count); - array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode), - MODE_INT, limit_p); - - if (array_mode == BLKmode) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n", - GET_MODE_NAME (mode), count); - return false; - } - - if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "cannot use %s<%s><%s>\n", name, - GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); - return false; - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), - GET_MODE_NAME (mode)); - - return true; -} - - -/* Return the smallest scalar part of STMT. - This is used to determine the vectype of the stmt. We generally set the - vectype according to the type of the result (lhs). For stmts whose - result-type is different than the type of the arguments (e.g., demotion, - promotion), vectype will be reset appropriately (later). Note that we have - to visit the smallest datatype in this function, because that determines the - VF. If the smallest datatype in the loop is present only as the rhs of a - promotion operation - we'd miss it. 
- Such a case, where a variable of this datatype does not appear in the lhs - anywhere in the loop, can only occur if it's an invariant: e.g.: - 'int_x = (int) short_inv', which we'd expect to have been optimized away by - invariant motion. However, we cannot rely on invariant motion to always - take invariants out of the loop, and so in the case of promotion we also - have to check the rhs. - LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding - types. */ - -tree -vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit, - HOST_WIDE_INT *rhs_size_unit) -{ - tree scalar_type = gimple_expr_type (stmt); - HOST_WIDE_INT lhs, rhs; - - lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); - - if (is_gimple_assign (stmt) - && (gimple_assign_cast_p (stmt) - || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR - || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR - || gimple_assign_rhs_code (stmt) == FLOAT_EXPR)) - { - tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); - - rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); - if (rhs < lhs) - scalar_type = rhs_type; - } - - *lhs_size_unit = lhs; - *rhs_size_unit = rhs; - return scalar_type; -} - - -/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be - tested at run-time. Return TRUE if DDR was successfully inserted. - Return false if versioning is not supported. */ - -static bool -vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) -{ - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - - if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0) - return false; - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "mark for run-time aliasing test between "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr))); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr))); - dump_printf (MSG_NOTE, "\n"); - } - - if (optimize_loop_nest_for_size_p (loop)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning not supported when optimizing" - " for size.\n"); - return false; - } - - /* FORNOW: We don't support versioning with outer-loop vectorization. */ - if (loop->inner) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning not yet supported for outer-loops.\n"); - return false; - } - - /* FORNOW: We don't support creating runtime alias tests for non-constant - step. */ - if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST - || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning not yet supported for non-constant " - "step\n"); - return false; - } - - LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); - return true; -} - - -/* Function vect_analyze_data_ref_dependence. - - Return TRUE if there (might) exist a dependence between a memory-reference - DRA and a memory-reference DRB. When versioning for alias may check a - dependence at run-time, return FALSE. Adjust *MAX_VF according to - the data dependence. 
*/ - -static bool -vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, - loop_vec_info loop_vinfo, int *max_vf) -{ - unsigned int i; - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - struct data_reference *dra = DDR_A (ddr); - struct data_reference *drb = DDR_B (ddr); - stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); - stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); - lambda_vector dist_v; - unsigned int loop_depth; - - /* In loop analysis all data references should be vectorizable. */ - if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) - || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) - gcc_unreachable (); - - /* Independent data accesses. */ - if (DDR_ARE_DEPENDENT (ddr) == chrec_known) - return false; - - if (dra == drb - || (DR_IS_READ (dra) && DR_IS_READ (drb))) - return false; - - /* We do not have to consider dependences between accesses that belong - to the same group. */ - if (GROUP_FIRST_ELEMENT (stmtinfo_a) - && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)) - return false; - - /* Even if we have an anti-dependence then, as the vectorized loop covers at - least two scalar iterations, there is always also a true dependence. - As the vectorizer does not re-order loads and stores we can ignore - the anti-dependence if TBAA can disambiguate both DRs similar to the - case with known negative distance anti-dependences (positive - distance anti-dependences would violate TBAA constraints). */ - if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) - || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) - && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), - get_alias_set (DR_REF (drb)))) - return false; - - /* Unknown data dependence. */ - if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) - { - /* If user asserted safelen consecutive iterations can be - executed concurrently, assume independence. */ - if (loop->safelen >= 2) - { - if (loop->safelen < *max_vf) - *max_vf = loop->safelen; - LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; - return false; - } - - if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) - || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning for alias not supported for: " - "can't determine dependence between "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (dra)); - dump_printf (MSG_MISSED_OPTIMIZATION, " and "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - return true; - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning for alias required: " - "can't determine dependence between "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (dra)); - dump_printf (MSG_MISSED_OPTIMIZATION, " and "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - - /* Add to list of ddrs that need to be tested at run-time. */ - return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); - } - - /* Known data dependence. */ - if (DDR_NUM_DIST_VECTS (ddr) == 0) - { - /* If user asserted safelen consecutive iterations can be - executed concurrently, assume independence. 
*/ - if (loop->safelen >= 2) - { - if (loop->safelen < *max_vf) - *max_vf = loop->safelen; - LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; - return false; - } - - if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) - || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning for alias not supported for: " - "bad dist vector for "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (dra)); - dump_printf (MSG_MISSED_OPTIMIZATION, " and "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - return true; - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "versioning for alias required: " - "bad dist vector for "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_MISSED_OPTIMIZATION, " and "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - /* Add to list of ddrs that need to be tested at run-time. */ - return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo); - } - - loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); - FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) - { - int dist = dist_v[loop_depth]; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "dependence distance = %d.\n", dist); - - if (dist == 0) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "dependence distance == 0 between "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - - /* When we perform grouped accesses and perform implicit CSE - by detecting equal accesses and doing disambiguation with - runtime alias tests like for - .. = a[i]; - .. = a[i+1]; - a[i] = ..; - a[i+1] = ..; - *p = ..; - .. = a[i]; - .. = a[i+1]; - where we will end up loading { a[i], a[i+1] } once, make - sure that inserting group loads before the first load and - stores after the last store will do the right thing. - Similar for groups like - a[i] = ...; - ... = a[i]; - a[i+1] = ...; - where loads from the group interleave with the store. */ - if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) - || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) - { - gimple *earlier_stmt; - earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb)); - if (DR_IS_WRITE - (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "READ_WRITE dependence in interleaving." - "\n"); - return true; - } - } - - continue; - } - - if (dist > 0 && DDR_REVERSED_P (ddr)) - { - /* If DDR_REVERSED_P the order of the data-refs in DDR was - reversed (to make distance vector positive), and the actual - distance is negative. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "dependence distance negative.\n"); - /* Record a negative dependence distance to later limit the - amount of stmt copying / unrolling we can perform. - Only need to handle read-after-write dependence. 
*/ - if (DR_IS_READ (drb) - && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 - || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) - STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; - continue; - } - - if (abs (dist) >= 2 - && abs (dist) < *max_vf) - { - /* The dependence distance requires reduction of the maximal - vectorization factor. */ - *max_vf = abs (dist); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "adjusting maximal vectorization factor to %i\n", - *max_vf); - } - - if (abs (dist) >= *max_vf) - { - /* Dependence distance does not create dependence, as far as - vectorization is concerned, in this case. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "dependence distance >= VF.\n"); - continue; - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized, possible dependence " - "between data-refs "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_NOTE, "\n"); - } - - return true; - } - - return false; -} - -/* Function vect_analyze_data_ref_dependences. - - Examine all the data references in the loop, and make sure there do not - exist any data dependences between them. Set *MAX_VF according to - the maximum vectorization factor the data dependences allow. */ - -bool -vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf) -{ - unsigned int i; - struct data_dependence_relation *ddr; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_analyze_data_ref_dependences ===\n"); - - LOOP_VINFO_DDRS (loop_vinfo) - .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () - * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); - LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; - /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */ - if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), - &LOOP_VINFO_DDRS (loop_vinfo), - LOOP_VINFO_LOOP_NEST (loop_vinfo), true)) - return false; - - /* For epilogues we either have no aliases or alias versioning - was applied to original loop. Therefore we may just get max_vf - using VF of original loop. */ - if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) - *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo); - else - FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) - if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf)) - return false; - - return true; -} - - -/* Function vect_slp_analyze_data_ref_dependence. - - Return TRUE if there (might) exist a dependence between a memory-reference - DRA and a memory-reference DRB. When versioning for alias may check a - dependence at run-time, return FALSE. Adjust *MAX_VF according to - the data dependence. */ - -static bool -vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr) -{ - struct data_reference *dra = DDR_A (ddr); - struct data_reference *drb = DDR_B (ddr); - - /* We need to check dependences of statements marked as unvectorizable - as well, they still can prohibit vectorization. */ - - /* Independent data accesses. */ - if (DDR_ARE_DEPENDENT (ddr) == chrec_known) - return false; - - if (dra == drb) - return false; - - /* Read-read is OK. */ - if (DR_IS_READ (dra) && DR_IS_READ (drb)) - return false; - - /* If dra and drb are part of the same interleaving chain consider - them independent. 
*/ - if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra))) - && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra))) - == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb))))) - return false; - - /* Unknown data dependence. */ - if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't determine dependence between "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_MISSED_OPTIMIZATION, " and "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - } - else if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "determined dependence between "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_NOTE, "\n"); - } - - return true; -} - - -/* Analyze dependences involved in the transform of SLP NODE. STORES - contain the vector of scalar stores of this instance if we are - disambiguating the loads. */ - -static bool -vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node, - vec<gimple *> stores, gimple *last_store) -{ - /* This walks over all stmts involved in the SLP load/store done - in NODE verifying we can sink them up to the last stmt in the - group. */ - gimple *last_access = vect_find_last_scalar_stmt_in_slp (node); - for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) - { - gimple *access = SLP_TREE_SCALAR_STMTS (node)[k]; - if (access == last_access) - continue; - data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access)); - for (gimple_stmt_iterator gsi = gsi_for_stmt (access); - gsi_stmt (gsi) != last_access; gsi_next (&gsi)) - { - gimple *stmt = gsi_stmt (gsi); - if (! gimple_vuse (stmt) - || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt))) - continue; - - /* If we couldn't record a (single) data reference for this - stmt we have to give up. */ - /* ??? Here and below if dependence analysis fails we can resort - to the alias oracle which can handle more kinds of stmts. */ - data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); - if (!dr_b) - return false; - - bool dependent = false; - /* If we run into a store of this same instance (we've just - marked those) then delay dependence checking until we run - into the last store because this is where it will have - been sunk to (and we verify if we can do that as well). */ - if (gimple_visited_p (stmt)) - { - if (stmt != last_store) - continue; - unsigned i; - gimple *store; - FOR_EACH_VEC_ELT (stores, i, store) - { - data_reference *store_dr - = STMT_VINFO_DATA_REF (vinfo_for_stmt (store)); - ddr_p ddr = initialize_data_dependence_relation - (dr_a, store_dr, vNULL); - dependent = vect_slp_analyze_data_ref_dependence (ddr); - free_dependence_relation (ddr); - if (dependent) - break; - } - } - else - { - ddr_p ddr = initialize_data_dependence_relation (dr_a, - dr_b, vNULL); - dependent = vect_slp_analyze_data_ref_dependence (ddr); - free_dependence_relation (ddr); - } - if (dependent) - return false; - } - } - return true; -} - - -/* Function vect_analyze_data_ref_dependences. - - Examine all the data references in the basic-block, and make sure there - do not exist any data dependences between them. Set *MAX_VF according to - the maximum vectorization factor the data dependences allow. 
*/ - -bool -vect_slp_analyze_instance_dependence (slp_instance instance) -{ - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_slp_analyze_instance_dependence ===\n"); - - /* The stores of this instance are at the root of the SLP tree. */ - slp_tree store = SLP_INSTANCE_TREE (instance); - if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0]))) - store = NULL; - - /* Verify we can sink stores to the vectorized stmt insert location. */ - gimple *last_store = NULL; - if (store) - { - if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL)) - return false; - - /* Mark stores in this instance and remember the last one. */ - last_store = vect_find_last_scalar_stmt_in_slp (store); - for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) - gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true); - } - - bool res = true; - - /* Verify we can sink loads to the vectorized stmt insert location, - special-casing stores of this instance. */ - slp_tree load; - unsigned int i; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load) - if (! vect_slp_analyze_node_dependences (instance, load, - store - ? SLP_TREE_SCALAR_STMTS (store) - : vNULL, last_store)) - { - res = false; - break; - } - - /* Unset the visited flag. */ - if (store) - for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) - gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false); - - return res; -} - -/* Function vect_compute_data_ref_alignment - - Compute the misalignment of the data reference DR. - - Output: - 1. If during the misalignment computation it is found that the data reference - cannot be vectorized then false is returned. - 2. DR_MISALIGNMENT (DR) is defined. - - FOR NOW: No analysis is actually performed. Misalignment is calculated - only for trivial cases. TODO. */ - -bool -vect_compute_data_ref_alignment (struct data_reference *dr) -{ - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct loop *loop = NULL; - tree ref = DR_REF (dr); - tree vectype; - tree base, base_addr; - tree misalign = NULL_TREE; - tree aligned_to; - tree step; - unsigned HOST_WIDE_INT alignment; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vect_compute_data_ref_alignment:\n"); - - if (loop_vinfo) - loop = LOOP_VINFO_LOOP (loop_vinfo); - - /* Initialize misalignment to unknown. */ - SET_DR_MISALIGNMENT (dr, -1); - - if (tree_fits_shwi_p (DR_STEP (dr))) - misalign = DR_INIT (dr); - aligned_to = DR_ALIGNED_TO (dr); - base_addr = DR_BASE_ADDRESS (dr); - vectype = STMT_VINFO_VECTYPE (stmt_info); - - /* In case the dataref is in an inner-loop of the loop that is being - vectorized (LOOP), we use the base and misalignment information - relative to the outer-loop (LOOP). This is ok only if the misalignment - stays the same throughout the execution of the inner-loop, which is why - we have to check that the stride of the dataref in the inner-loop evenly - divides by the vector size. 
*/ - if (loop && nested_in_vect_loop_p (loop, stmt)) - { - tree step = DR_STEP (dr); - - if (tree_fits_shwi_p (step) - && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "inner step divides the vector-size.\n"); - misalign = STMT_VINFO_DR_INIT (stmt_info); - aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info); - base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info); - } - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "inner step doesn't divide the vector-size.\n"); - misalign = NULL_TREE; - } - } - - /* Similarly we can only use base and misalignment information relative to - an innermost loop if the misalignment stays the same throughout the - execution of the loop. As above, this is the case if the stride of - the dataref evenly divides by the vector size. */ - else - { - tree step = DR_STEP (dr); - unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1; - - if (tree_fits_shwi_p (step) - && ((tree_to_shwi (step) * vf) - % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "step doesn't divide the vector-size.\n"); - misalign = NULL_TREE; - } - } - - /* To look at alignment of the base we have to preserve an inner MEM_REF - as that carries alignment information of the actual access. */ - base = ref; - while (handled_component_p (base)) - base = TREE_OPERAND (base, 0); - unsigned int base_alignment = 0; - unsigned HOST_WIDE_INT base_bitpos; - get_object_alignment_1 (base, &base_alignment, &base_bitpos); - /* As data-ref analysis strips the MEM_REF down to its base operand - to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to - adjust things to make base_alignment valid as the alignment of - DR_BASE_ADDRESS. */ - if (TREE_CODE (base) == MEM_REF) - { - /* Note all this only works if DR_BASE_ADDRESS is the same as - MEM_REF operand zero, otherwise DR/SCEV analysis might have factored - in other offsets. We need to rework DR to compute the alingment - of DR_BASE_ADDRESS as long as all information is still available. */ - if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0)) - { - base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT; - base_bitpos &= (base_alignment - 1); - } - else - base_bitpos = BITS_PER_UNIT; - } - if (base_bitpos != 0) - base_alignment = base_bitpos & -base_bitpos; - /* Also look at the alignment of the base address DR analysis - computed. 
*/ - unsigned int base_addr_alignment = get_pointer_alignment (base_addr); - if (base_addr_alignment > base_alignment) - base_alignment = base_addr_alignment; - - if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype))) - DR_VECT_AUX (dr)->base_element_aligned = true; - - alignment = TYPE_ALIGN_UNIT (vectype); - - if ((compare_tree_int (aligned_to, alignment) < 0) - || !misalign) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Unknown alignment for access: "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - return true; - } - - if (base_alignment < TYPE_ALIGN (vectype)) - { - base = base_addr; - if (TREE_CODE (base) == ADDR_EXPR) - base = TREE_OPERAND (base, 0); - if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "can't force alignment of ref: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); - dump_printf (MSG_NOTE, "\n"); - } - return true; - } - - if (DECL_USER_ALIGN (base)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "not forcing alignment of user-aligned " - "variable: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, base); - dump_printf (MSG_NOTE, "\n"); - } - return true; - } - - /* Force the alignment of the decl. - NOTE: This is the only change to the code we make during - the analysis phase, before deciding to vectorize the loop. */ - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "force alignment of "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, ref); - dump_printf (MSG_NOTE, "\n"); - } - - DR_VECT_AUX (dr)->base_decl = base; - DR_VECT_AUX (dr)->base_misaligned = true; - DR_VECT_AUX (dr)->base_element_aligned = true; - } - - if (loop && nested_in_vect_loop_p (loop, stmt)) - step = STMT_VINFO_DR_STEP (stmt_info); - else - step = DR_STEP (dr); - /* If this is a backward running DR then first access in the larger - vectype actually is N-1 elements before the address in the DR. - Adjust misalign accordingly. */ - if (tree_int_cst_sgn (step) < 0) - { - tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); - /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type, - otherwise we wouldn't be here. */ - offset = fold_build2 (MULT_EXPR, ssizetype, offset, step); - /* PLUS because STEP was negative. */ - misalign = size_binop (PLUS_EXPR, misalign, offset); - } - - SET_DR_MISALIGNMENT (dr, - wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ()); - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr)); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - - return true; -} - - -/* Function vect_update_misalignment_for_peel - - DR - the data reference whose misalignment is to be adjusted. - DR_PEEL - the data reference whose misalignment is being made - zero in the vector loop by the peel. - NPEEL - the number of iterations in the peel loop if the misalignment - of DR_PEEL is known at compile time. 
*/ - -static void -vect_update_misalignment_for_peel (struct data_reference *dr, - struct data_reference *dr_peel, int npeel) -{ - unsigned int i; - vec<dr_p> same_align_drs; - struct data_reference *current_dr; - int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); - int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel)))); - stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); - stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel)); - - /* For interleaved data accesses the step in the loop must be multiplied by - the size of the interleaving group. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info))); - if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info)) - dr_peel_size *= GROUP_SIZE (peel_stmt_info); - - /* It can be assumed that the data refs with the same alignment as dr_peel - are aligned in the vector loop. */ - same_align_drs - = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel))); - FOR_EACH_VEC_ELT (same_align_drs, i, current_dr) - { - if (current_dr != dr) - continue; - gcc_assert (DR_MISALIGNMENT (dr) / dr_size == - DR_MISALIGNMENT (dr_peel) / dr_peel_size); - SET_DR_MISALIGNMENT (dr, 0); - return; - } - - if (known_alignment_for_access_p (dr) - && known_alignment_for_access_p (dr_peel)) - { - bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0; - int misal = DR_MISALIGNMENT (dr); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - misal += negative ? -npeel * dr_size : npeel * dr_size; - misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1; - SET_DR_MISALIGNMENT (dr, misal); - return; - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n"); - SET_DR_MISALIGNMENT (dr, -1); -} - - -/* Function verify_data_ref_alignment - - Return TRUE if DR can be handled with respect to alignment. */ - -static bool -verify_data_ref_alignment (data_reference_p dr) -{ - enum dr_alignment_support supportable_dr_alignment - = vect_supportable_dr_alignment (dr, false); - if (!supportable_dr_alignment) - { - if (dump_enabled_p ()) - { - if (DR_IS_READ (dr)) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: unsupported unaligned load."); - else - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: unsupported unaligned " - "store."); - - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - DR_REF (dr)); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - return false; - } - - if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Vectorizing an unaligned access.\n"); - - return true; -} - -/* Function vect_verify_datarefs_alignment - - Return TRUE if all data references in the loop can be - handled with respect to alignment. */ - -bool -vect_verify_datarefs_alignment (loop_vec_info vinfo) -{ - vec<data_reference_p> datarefs = vinfo->datarefs; - struct data_reference *dr; - unsigned int i; - - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - - if (!STMT_VINFO_RELEVANT_P (stmt_info)) - continue; - - /* For interleaving, only the alignment of the first access matters. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info) - && GROUP_FIRST_ELEMENT (stmt_info) != stmt) - continue; - - /* Strided accesses perform only component accesses, alignment is - irrelevant for them. 
*/ - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - if (! verify_data_ref_alignment (dr)) - return false; - } - - return true; -} - -/* Given an memory reference EXP return whether its alignment is less - than its size. */ - -static bool -not_size_aligned (tree exp) -{ - if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) - return true; - - return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) - > get_object_alignment (exp)); -} - -/* Function vector_alignment_reachable_p - - Return true if vector alignment for DR is reachable by peeling - a few loop iterations. Return false otherwise. */ - -static bool -vector_alignment_reachable_p (struct data_reference *dr) -{ - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - { - /* For interleaved access we peel only if number of iterations in - the prolog loop ({VF - misalignment}), is a multiple of the - number of the interleaved accesses. */ - int elem_size, mis_in_elements; - int nelements = TYPE_VECTOR_SUBPARTS (vectype); - - /* FORNOW: handle only known alignment. */ - if (!known_alignment_for_access_p (dr)) - return false; - - elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements; - mis_in_elements = DR_MISALIGNMENT (dr) / elem_size; - - if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info)) - return false; - } - - /* If misalignment is known at the compile time then allow peeling - only if natural alignment is reachable through peeling. */ - if (known_alignment_for_access_p (dr) && !aligned_access_p (dr)) - { - HOST_WIDE_INT elmsize = - int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize); - dump_printf (MSG_NOTE, - ". misalignment = %d.\n", DR_MISALIGNMENT (dr)); - } - if (DR_MISALIGNMENT (dr) % elmsize) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "data size does not divide the misalignment.\n"); - return false; - } - } - - if (!known_alignment_for_access_p (dr)) - { - tree type = TREE_TYPE (DR_REF (dr)); - bool is_packed = not_size_aligned (DR_REF (dr)); - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Unknown misalignment, %snaturally aligned\n", - is_packed ? "not " : ""); - return targetm.vectorize.vector_alignment_reachable (type, is_packed); - } - - return true; -} - - -/* Calculate the cost of the memory access represented by DR. 
*/ - -static void -vect_get_data_access_cost (struct data_reference *dr, - unsigned int *inside_cost, - unsigned int *outside_cost, - stmt_vector_for_cost *body_cost_vec) -{ - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - int ncopies = vf / nunits; - - if (DR_IS_READ (dr)) - vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost, - NULL, body_cost_vec, false); - else - vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "vect_get_data_access_cost: inside_cost = %d, " - "outside_cost = %d.\n", *inside_cost, *outside_cost); -} - - -typedef struct _vect_peel_info -{ - struct data_reference *dr; - int npeel; - unsigned int count; -} *vect_peel_info; - -typedef struct _vect_peel_extended_info -{ - struct _vect_peel_info peel_info; - unsigned int inside_cost; - unsigned int outside_cost; - stmt_vector_for_cost body_cost_vec; -} *vect_peel_extended_info; - - -/* Peeling hashtable helpers. */ - -struct peel_info_hasher : free_ptr_hash <_vect_peel_info> -{ - static inline hashval_t hash (const _vect_peel_info *); - static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); -}; - -inline hashval_t -peel_info_hasher::hash (const _vect_peel_info *peel_info) -{ - return (hashval_t) peel_info->npeel; -} - -inline bool -peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) -{ - return (a->npeel == b->npeel); -} - - -/* Insert DR into peeling hash table with NPEEL as key. */ - -static void -vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, - loop_vec_info loop_vinfo, struct data_reference *dr, - int npeel) -{ - struct _vect_peel_info elem, *slot; - _vect_peel_info **new_slot; - bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); - - elem.npeel = npeel; - slot = peeling_htab->find (&elem); - if (slot) - slot->count++; - else - { - slot = XNEW (struct _vect_peel_info); - slot->npeel = npeel; - slot->dr = dr; - slot->count = 1; - new_slot = peeling_htab->find_slot (slot, INSERT); - *new_slot = slot; - } - - if (!supportable_dr_alignment - && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) - slot->count += VECT_MAX_COST; -} - - -/* Traverse peeling hash table to find peeling option that aligns maximum - number of data accesses. */ - -int -vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, - _vect_peel_extended_info *max) -{ - vect_peel_info elem = *slot; - - if (elem->count > max->peel_info.count - || (elem->count == max->peel_info.count - && max->peel_info.npeel > elem->npeel)) - { - max->peel_info.npeel = elem->npeel; - max->peel_info.count = elem->count; - max->peel_info.dr = elem->dr; - } - - return 1; -} - - -/* Traverse peeling hash table and calculate cost for each peeling option. - Find the one with the lowest cost. 
*/ - -int -vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, - _vect_peel_extended_info *min) -{ - vect_peel_info elem = *slot; - int save_misalignment, dummy; - unsigned int inside_cost = 0, outside_cost = 0, i; - gimple *stmt = DR_STMT (elem->dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); - struct data_reference *dr; - stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec; - - prologue_cost_vec.create (2); - body_cost_vec.create (2); - epilogue_cost_vec.create (2); - - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - stmt = DR_STMT (dr); - stmt_info = vinfo_for_stmt (stmt); - /* For interleaving, only the alignment of the first access - matters. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info) - && GROUP_FIRST_ELEMENT (stmt_info) != stmt) - continue; - - /* Strided accesses perform only component accesses, alignment is - irrelevant for them. */ - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - save_misalignment = DR_MISALIGNMENT (dr); - vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel); - vect_get_data_access_cost (dr, &inside_cost, &outside_cost, - &body_cost_vec); - SET_DR_MISALIGNMENT (dr, save_misalignment); - } - - outside_cost += vect_get_known_peeling_cost - (loop_vinfo, elem->npeel, &dummy, - &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), - &prologue_cost_vec, &epilogue_cost_vec); - - /* Prologue and epilogue costs are added to the target model later. - These costs depend only on the scalar iteration cost, the - number of peeling iterations finally chosen, and the number of - misaligned statements. So discard the information found here. */ - prologue_cost_vec.release (); - epilogue_cost_vec.release (); - - if (inside_cost < min->inside_cost - || (inside_cost == min->inside_cost && outside_cost < min->outside_cost)) - { - min->inside_cost = inside_cost; - min->outside_cost = outside_cost; - min->body_cost_vec.release (); - min->body_cost_vec = body_cost_vec; - min->peel_info.dr = elem->dr; - min->peel_info.npeel = elem->npeel; - } - else - body_cost_vec.release (); - - return 1; -} - - -/* Choose best peeling option by traversing peeling hash table and either - choosing an option with the lowest cost (if cost model is enabled) or the - option that aligns as many accesses as possible. */ - -static struct data_reference * -vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, - loop_vec_info loop_vinfo, - unsigned int *npeel, - stmt_vector_for_cost *body_cost_vec) -{ - struct _vect_peel_extended_info res; - - res.peel_info.dr = NULL; - res.body_cost_vec = stmt_vector_for_cost (); - - if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) - { - res.inside_cost = INT_MAX; - res.outside_cost = INT_MAX; - peeling_htab->traverse <_vect_peel_extended_info *, - vect_peeling_hash_get_lowest_cost> (&res); - } - else - { - res.peel_info.count = 0; - peeling_htab->traverse <_vect_peel_extended_info *, - vect_peeling_hash_get_most_frequent> (&res); - } - - *npeel = res.peel_info.npeel; - *body_cost_vec = res.body_cost_vec; - return res.peel_info.dr; -} - - -/* Function vect_enhance_data_refs_alignment - - This pass will use loop versioning and loop peeling in order to enhance - the alignment of data references in the loop. - - FOR NOW: we assume that whatever versioning/peeling takes place, only the - original loop is to be vectorized. 
Any other loops that are created by - the transformations performed in this pass - are not supposed to be - vectorized. This restriction will be relaxed. - - This pass will require a cost model to guide it whether to apply peeling - or versioning or a combination of the two. For example, the scheme that - intel uses when given a loop with several memory accesses, is as follows: - choose one memory access ('p') which alignment you want to force by doing - peeling. Then, either (1) generate a loop in which 'p' is aligned and all - other accesses are not necessarily aligned, or (2) use loop versioning to - generate one loop in which all accesses are aligned, and another loop in - which only 'p' is necessarily aligned. - - ("Automatic Intra-Register Vectorization for the Intel Architecture", - Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International - Journal of Parallel Programming, Vol. 30, No. 2, April 2002.) - - Devising a cost model is the most critical aspect of this work. It will - guide us on which access to peel for, whether to use loop versioning, how - many versions to create, etc. The cost model will probably consist of - generic considerations as well as target specific considerations (on - powerpc for example, misaligned stores are more painful than misaligned - loads). - - Here are the general steps involved in alignment enhancements: - - -- original loop, before alignment analysis: - for (i=0; i<N; i++){ - x = q[i]; # DR_MISALIGNMENT(q) = unknown - p[i] = y; # DR_MISALIGNMENT(p) = unknown - } - - -- After vect_compute_data_refs_alignment: - for (i=0; i<N; i++){ - x = q[i]; # DR_MISALIGNMENT(q) = 3 - p[i] = y; # DR_MISALIGNMENT(p) = unknown - } - - -- Possibility 1: we do loop versioning: - if (p is aligned) { - for (i=0; i<N; i++){ # loop 1A - x = q[i]; # DR_MISALIGNMENT(q) = 3 - p[i] = y; # DR_MISALIGNMENT(p) = 0 - } - } - else { - for (i=0; i<N; i++){ # loop 1B - x = q[i]; # DR_MISALIGNMENT(q) = 3 - p[i] = y; # DR_MISALIGNMENT(p) = unaligned - } - } - - -- Possibility 2: we do loop peeling: - for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). - x = q[i]; - p[i] = y; - } - for (i = 3; i < N; i++){ # loop 2A - x = q[i]; # DR_MISALIGNMENT(q) = 0 - p[i] = y; # DR_MISALIGNMENT(p) = unknown - } - - -- Possibility 3: combination of loop peeling and versioning: - for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized). - x = q[i]; - p[i] = y; - } - if (p is aligned) { - for (i = 3; i<N; i++){ # loop 3A - x = q[i]; # DR_MISALIGNMENT(q) = 0 - p[i] = y; # DR_MISALIGNMENT(p) = 0 - } - } - else { - for (i = 3; i<N; i++){ # loop 3B - x = q[i]; # DR_MISALIGNMENT(q) = 0 - p[i] = y; # DR_MISALIGNMENT(p) = unaligned - } - } - - These loops are later passed to loop_transform to be vectorized. The - vectorizer will use the alignment information to guide the transformation - (whether to generate regular loads/stores, or with special handling for - misalignment). 
*/ - -bool -vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) -{ - vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - enum dr_alignment_support supportable_dr_alignment; - struct data_reference *dr0 = NULL, *first_store = NULL; - struct data_reference *dr; - unsigned int i, j; - bool do_peeling = false; - bool do_versioning = false; - bool stat; - gimple *stmt; - stmt_vec_info stmt_info; - unsigned int npeel = 0; - bool all_misalignments_unknown = true; - unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - unsigned possible_npeel_number = 1; - tree vectype; - unsigned int nelements, mis, same_align_drs_max = 0; - stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost (); - hash_table<peel_info_hasher> peeling_htab (1); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_enhance_data_refs_alignment ===\n"); - - /* Reset data so we can safely be called multiple times. */ - LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0; - - /* While cost model enhancements are expected in the future, the high level - view of the code at this time is as follows: - - A) If there is a misaligned access then see if peeling to align - this access can make all data references satisfy - vect_supportable_dr_alignment. If so, update data structures - as needed and return true. - - B) If peeling wasn't possible and there is a data reference with an - unknown misalignment that does not satisfy vect_supportable_dr_alignment - then see if loop versioning checks can be used to make all data - references satisfy vect_supportable_dr_alignment. If so, update - data structures as needed and return true. - - C) If neither peeling nor versioning were successful then return false if - any data reference does not satisfy vect_supportable_dr_alignment. - - D) Return true (all data references satisfy vect_supportable_dr_alignment). - - Note, Possibility 3 above (which is peeling and versioning together) is not - being done at this time. */ - - /* (1) Peeling to force alignment. */ - - /* (1.1) Decide whether to perform peeling, and how many iterations to peel: - Considerations: - + How many accesses will become aligned due to the peeling - - How many accesses will become unaligned due to the peeling, - and the cost of misaligned accesses. - - The cost of peeling (the extra runtime checks, the increase - in code size). */ - - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - stmt = DR_STMT (dr); - stmt_info = vinfo_for_stmt (stmt); - - if (!STMT_VINFO_RELEVANT_P (stmt_info)) - continue; - - /* For interleaving, only the alignment of the first access - matters. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info) - && GROUP_FIRST_ELEMENT (stmt_info) != stmt) - continue; - - /* For invariant accesses there is nothing to enhance. */ - if (integer_zerop (DR_STEP (dr))) - continue; - - /* Strided accesses perform only component accesses, alignment is - irrelevant for them. */ - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - supportable_dr_alignment = vect_supportable_dr_alignment (dr, true); - do_peeling = vector_alignment_reachable_p (dr); - if (do_peeling) - { - if (known_alignment_for_access_p (dr)) - { - unsigned int npeel_tmp; - bool negative = tree_int_cst_compare (DR_STEP (dr), - size_zero_node) < 0; - - /* Save info about DR in the hash table. 
*/ - vectype = STMT_VINFO_VECTYPE (stmt_info); - nelements = TYPE_VECTOR_SUBPARTS (vectype); - mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE ( - TREE_TYPE (DR_REF (dr)))); - npeel_tmp = (negative - ? (mis - nelements) : (nelements - mis)) - & (nelements - 1); - - /* For multiple types, it is possible that the bigger type access - will have more than one peeling option. E.g., a loop with two - types: one of size (vector size / 4), and the other one of - size (vector size / 8). Vectorization factor will 8. If both - access are misaligned by 3, the first one needs one scalar - iteration to be aligned, and the second one needs 5. But the - first one will be aligned also by peeling 5 scalar - iterations, and in that case both accesses will be aligned. - Hence, except for the immediate peeling amount, we also want - to try to add full vector size, while we don't exceed - vectorization factor. - We do this automatically for cost model, since we calculate cost - for every peeling option. */ - if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) - { - if (STMT_SLP_TYPE (stmt_info)) - possible_npeel_number - = (vf * GROUP_SIZE (stmt_info)) / nelements; - else - possible_npeel_number = vf / nelements; - } - - /* Handle the aligned case. We may decide to align some other - access, making DR unaligned. */ - if (DR_MISALIGNMENT (dr) == 0) - { - npeel_tmp = 0; - if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) - possible_npeel_number++; - } - - for (j = 0; j < possible_npeel_number; j++) - { - vect_peeling_hash_insert (&peeling_htab, loop_vinfo, - dr, npeel_tmp); - npeel_tmp += nelements; - } - - all_misalignments_unknown = false; - /* Data-ref that was chosen for the case that all the - misalignments are unknown is not relevant anymore, since we - have a data-ref with known alignment. */ - dr0 = NULL; - } - else - { - /* If we don't know any misalignment values, we prefer - peeling for data-ref that has the maximum number of data-refs - with the same alignment, unless the target prefers to align - stores over load. */ - if (all_misalignments_unknown) - { - unsigned same_align_drs - = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); - if (!dr0 - || same_align_drs_max < same_align_drs) - { - same_align_drs_max = same_align_drs; - dr0 = dr; - } - /* For data-refs with the same number of related - accesses prefer the one where the misalign - computation will be invariant in the outermost loop. */ - else if (same_align_drs_max == same_align_drs) - { - struct loop *ivloop0, *ivloop; - ivloop0 = outermost_invariant_loop_for_expr - (loop, DR_BASE_ADDRESS (dr0)); - ivloop = outermost_invariant_loop_for_expr - (loop, DR_BASE_ADDRESS (dr)); - if ((ivloop && !ivloop0) - || (ivloop && ivloop0 - && flow_loop_nested_p (ivloop, ivloop0))) - dr0 = dr; - } - - if (!first_store && DR_IS_WRITE (dr)) - first_store = dr; - } - - /* If there are both known and unknown misaligned accesses in the - loop, we choose peeling amount according to the known - accesses. */ - if (!supportable_dr_alignment) - { - dr0 = dr; - if (!first_store && DR_IS_WRITE (dr)) - first_store = dr; - } - } - } - else - { - if (!aligned_access_p (dr)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "vector alignment may not be reachable\n"); - break; - } - } - } - - /* Check if we can possibly peel the loop. 
*/ - if (!vect_can_advance_ivs_p (loop_vinfo) - || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) - || loop->inner) - do_peeling = false; - - if (do_peeling - && all_misalignments_unknown - && vect_supportable_dr_alignment (dr0, false)) - { - /* Check if the target requires to prefer stores over loads, i.e., if - misaligned stores are more expensive than misaligned loads (taking - drs with same alignment into account). */ - if (first_store && DR_IS_READ (dr0)) - { - unsigned int load_inside_cost = 0, load_outside_cost = 0; - unsigned int store_inside_cost = 0, store_outside_cost = 0; - unsigned int load_inside_penalty = 0, load_outside_penalty = 0; - unsigned int store_inside_penalty = 0, store_outside_penalty = 0; - stmt_vector_for_cost dummy; - dummy.create (2); - - vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost, - &dummy); - vect_get_data_access_cost (first_store, &store_inside_cost, - &store_outside_cost, &dummy); - - dummy.release (); - - /* Calculate the penalty for leaving FIRST_STORE unaligned (by - aligning the load DR0). */ - load_inside_penalty = store_inside_cost; - load_outside_penalty = store_outside_cost; - for (i = 0; - STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( - DR_STMT (first_store))).iterate (i, &dr); - i++) - if (DR_IS_READ (dr)) - { - load_inside_penalty += load_inside_cost; - load_outside_penalty += load_outside_cost; - } - else - { - load_inside_penalty += store_inside_cost; - load_outside_penalty += store_outside_cost; - } - - /* Calculate the penalty for leaving DR0 unaligned (by - aligning the FIRST_STORE). */ - store_inside_penalty = load_inside_cost; - store_outside_penalty = load_outside_cost; - for (i = 0; - STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt ( - DR_STMT (dr0))).iterate (i, &dr); - i++) - if (DR_IS_READ (dr)) - { - store_inside_penalty += load_inside_cost; - store_outside_penalty += load_outside_cost; - } - else - { - store_inside_penalty += store_inside_cost; - store_outside_penalty += store_outside_cost; - } - - if (load_inside_penalty > store_inside_penalty - || (load_inside_penalty == store_inside_penalty - && load_outside_penalty > store_outside_penalty)) - dr0 = first_store; - } - - /* In case there are only loads with different unknown misalignments, use - peeling only if it may help to align other accesses in the loop or - if it may help improving load bandwith when we'd end up using - unaligned loads. */ - tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0))); - if (!first_store - && !STMT_VINFO_SAME_ALIGN_REFS ( - vinfo_for_stmt (DR_STMT (dr0))).length () - && (vect_supportable_dr_alignment (dr0, false) - != dr_unaligned_supported - || (builtin_vectorization_cost (vector_load, dr0_vt, 0) - == builtin_vectorization_cost (unaligned_load, dr0_vt, -1)))) - do_peeling = false; - } - - if (do_peeling && !dr0) - { - /* Peeling is possible, but there is no data access that is not supported - unless aligned. So we try to choose the best possible peeling. */ - - /* We should get here only if there are drs with known misalignment. */ - gcc_assert (!all_misalignments_unknown); - - /* Choose the best peeling from the hash table. 
*/ - dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab, - loop_vinfo, &npeel, - &body_cost_vec); - if (!dr0 || !npeel) - do_peeling = false; - } - - if (do_peeling) - { - stmt = DR_STMT (dr0); - stmt_info = vinfo_for_stmt (stmt); - vectype = STMT_VINFO_VECTYPE (stmt_info); - nelements = TYPE_VECTOR_SUBPARTS (vectype); - - if (known_alignment_for_access_p (dr0)) - { - bool negative = tree_int_cst_compare (DR_STEP (dr0), - size_zero_node) < 0; - if (!npeel) - { - /* Since it's known at compile time, compute the number of - iterations in the peeled loop (the peeling factor) for use in - updating DR_MISALIGNMENT values. The peeling factor is the - vectorization factor minus the misalignment as an element - count. */ - mis = DR_MISALIGNMENT (dr0); - mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0)))); - npeel = ((negative ? mis - nelements : nelements - mis) - & (nelements - 1)); - } - - /* For interleaved data access every iteration accesses all the - members of the group, therefore we divide the number of iterations - by the group size. */ - stmt_info = vinfo_for_stmt (DR_STMT (dr0)); - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - npeel /= GROUP_SIZE (stmt_info); - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Try peeling by %d\n", npeel); - } - - /* Ensure that all data refs can be vectorized after the peel. */ - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - int save_misalignment; - - if (dr == dr0) - continue; - - stmt = DR_STMT (dr); - stmt_info = vinfo_for_stmt (stmt); - /* For interleaving, only the alignment of the first access - matters. */ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info) - && GROUP_FIRST_ELEMENT (stmt_info) != stmt) - continue; - - /* Strided accesses perform only component accesses, alignment is - irrelevant for them. */ - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - save_misalignment = DR_MISALIGNMENT (dr); - vect_update_misalignment_for_peel (dr, dr0, npeel); - supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); - SET_DR_MISALIGNMENT (dr, save_misalignment); - - if (!supportable_dr_alignment) - { - do_peeling = false; - break; - } - } - - if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0) - { - stat = vect_verify_datarefs_alignment (loop_vinfo); - if (!stat) - do_peeling = false; - else - { - body_cost_vec.release (); - return stat; - } - } - - /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */ - if (do_peeling) - { - unsigned max_allowed_peel - = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT); - if (max_allowed_peel != (unsigned)-1) - { - unsigned max_peel = npeel; - if (max_peel == 0) - { - gimple *dr_stmt = DR_STMT (dr0); - stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt); - tree vtype = STMT_VINFO_VECTYPE (vinfo); - max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1; - } - if (max_peel > max_allowed_peel) - { - do_peeling = false; - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Disable peeling, max peels reached: %d\n", max_peel); - } - } - } - - /* Cost model #2 - if peeling may result in a remaining loop not - iterating enough to be vectorized then do not peel. */ - if (do_peeling - && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) - { - unsigned max_peel - = npeel == 0 ? 
LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel; - if (LOOP_VINFO_INT_NITERS (loop_vinfo) - < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel) - do_peeling = false; - } - - if (do_peeling) - { - /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. - If the misalignment of DR_i is identical to that of dr0 then set - DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and - dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) - by the peeling factor times the element size of DR_i (MOD the - vectorization factor times the size). Otherwise, the - misalignment of DR_i must be set to unknown. */ - FOR_EACH_VEC_ELT (datarefs, i, dr) - if (dr != dr0) - { - /* Strided accesses perform only component accesses, alignment - is irrelevant for them. */ - stmt_info = vinfo_for_stmt (DR_STMT (dr)); - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - vect_update_misalignment_for_peel (dr, dr0, npeel); - } - - LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0; - if (npeel) - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; - else - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) - = DR_MISALIGNMENT (dr0); - SET_DR_MISALIGNMENT (dr0, 0); - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Alignment of access forced using peeling.\n"); - dump_printf_loc (MSG_NOTE, vect_location, - "Peeling for alignment will be applied.\n"); - } - /* The inside-loop cost will be accounted for in vectorizable_load - and vectorizable_store correctly with adjusted alignments. - Drop the body_cst_vec on the floor here. */ - body_cost_vec.release (); - - stat = vect_verify_datarefs_alignment (loop_vinfo); - gcc_assert (stat); - return stat; - } - } - - body_cost_vec.release (); - - /* (2) Versioning to force alignment. */ - - /* Try versioning if: - 1) optimize loop for speed - 2) there is at least one unsupported misaligned data ref with an unknown - misalignment, and - 3) all misaligned data refs with a known misalignment are supported, and - 4) the number of runtime alignment checks is within reason. */ - - do_versioning = - optimize_loop_nest_for_speed_p (loop) - && (!loop->inner); /* FORNOW */ - - if (do_versioning) - { - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - stmt = DR_STMT (dr); - stmt_info = vinfo_for_stmt (stmt); - - /* For interleaving, only the alignment of the first access - matters. */ - if (aligned_access_p (dr) - || (STMT_VINFO_GROUPED_ACCESS (stmt_info) - && GROUP_FIRST_ELEMENT (stmt_info) != stmt)) - continue; - - if (STMT_VINFO_STRIDED_P (stmt_info)) - { - /* Strided loads perform only component accesses, alignment is - irrelevant for them. */ - if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - do_versioning = false; - break; - } - - supportable_dr_alignment = vect_supportable_dr_alignment (dr, false); - - if (!supportable_dr_alignment) - { - gimple *stmt; - int mask; - tree vectype; - - if (known_alignment_for_access_p (dr) - || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () - >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)) - { - do_versioning = false; - break; - } - - stmt = DR_STMT (dr); - vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); - gcc_assert (vectype); - - /* The rightmost bits of an aligned address must be zeros. - Construct the mask needed for this test. For example, - GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the - mask must be 15 = 0xf. 
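*/

/* A self-contained sketch of the run-time guard this versioning boils
   down to (names are illustrative, not GCC internals): the candidate
   addresses are OR-ed together so a single AND against the mask tests
   them all at once.  */
#include <stdint.h>

static int
all_aligned_p (uintptr_t mask, const void *a, const void *b)
{
  return (((uintptr_t) a | (uintptr_t) b) & mask) == 0;
}

/* With V4SI vectors, all_aligned_p (0xf, p, q) holds only when both
   pointers are 16-byte aligned; the unvectorized version of the loop
   runs otherwise.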
*/ - mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1; - - /* FORNOW: use the same mask to test all potentially unaligned - references in the loop. The vectorizer currently supports - a single vector size, see the reference to - GET_MODE_NUNITS (TYPE_MODE (vectype)) where the - vectorization factor is computed. */ - gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo) - || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask); - LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; - LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push ( - DR_STMT (dr)); - } - } - - /* Versioning requires at least one misaligned data reference. */ - if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) - do_versioning = false; - else if (!do_versioning) - LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); - } - - if (do_versioning) - { - vec<gimple *> may_misalign_stmts - = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); - gimple *stmt; - - /* It can now be assumed that the data references in the statements - in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version - of the loop being vectorized. */ - FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt) - { - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - dr = STMT_VINFO_DATA_REF (stmt_info); - SET_DR_MISALIGNMENT (dr, 0); - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Alignment of access forced using versioning.\n"); - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Versioning for alignment will be applied.\n"); - - /* Peeling and versioning can't be done together at this time. */ - gcc_assert (! (do_peeling && do_versioning)); - - stat = vect_verify_datarefs_alignment (loop_vinfo); - gcc_assert (stat); - return stat; - } - - /* This point is reached if neither peeling nor versioning is being done. */ - gcc_assert (! (do_peeling || do_versioning)); - - stat = vect_verify_datarefs_alignment (loop_vinfo); - return stat; -} - - -/* Function vect_find_same_alignment_drs. - - Update group and alignment relations according to the chosen - vectorization factor. */ - -static void -vect_find_same_alignment_drs (struct data_dependence_relation *ddr, - loop_vec_info loop_vinfo) -{ - unsigned int i; - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - struct data_reference *dra = DDR_A (ddr); - struct data_reference *drb = DDR_B (ddr); - stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); - stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); - int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra)))); - int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb)))); - lambda_vector dist_v; - unsigned int loop_depth; - - if (DDR_ARE_DEPENDENT (ddr) == chrec_known) - return; - - if (dra == drb) - return; - - if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) - return; - - /* Loop-based vectorization and known data dependence. */ - if (DDR_NUM_DIST_VECTS (ddr) == 0) - return; - - /* Data-dependence analysis reports a distance vector of zero - for data-references that overlap only in the first iteration - but have different sign step (see PR45764). - So as a sanity check require equal DR_STEP. 
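*/

/* A compilable illustration of the rule tested below: two same-size
   references whose dependence distance is a multiple of the
   vectorization factor lie a whole number of vectors apart and hence
   share their misalignment.  VF 4 and the int array are example
   values only.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int a[32];
  unsigned vf = 4, dist = 8;   /* a[i] vs. a[i + 8], dist % vf == 0.  */
  uintptr_t mis1 = (uintptr_t) &a[0] & 15;
  uintptr_t mis2 = (uintptr_t) &a[dist] & 15;
  printf ("dist %% vf = %u, misalignments %ju and %ju\n",
          dist % vf, (uintmax_t) mis1, (uintmax_t) mis2);
  return 0;   /* mis1 == mis2: 8 ints are exactly two V4SI vectors.  */
}

/* The equal-DR_STEP requirement below guards against the PR45764
   situation of a zero distance reported for references stepping in
   opposite directions.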
*/ - if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) - return; - - loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); - FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) - { - int dist = dist_v[loop_depth]; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "dependence distance = %d.\n", dist); - - /* Same loop iteration. */ - if (dist == 0 - || (dist % vectorization_factor == 0 && dra_size == drb_size)) - { - /* Two references with distance zero have the same alignment. */ - STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); - STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "accesses have the same alignment.\n"); - dump_printf (MSG_NOTE, - "dependence distance modulo vf == 0 between "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_NOTE, "\n"); - } - } - } -} - - -/* Function vect_analyze_data_refs_alignment - - Analyze the alignment of the data-references in the loop. - Return FALSE if a data reference is found that cannot be vectorized. */ - -bool -vect_analyze_data_refs_alignment (loop_vec_info vinfo) -{ - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_analyze_data_refs_alignment ===\n"); - - /* Mark groups of data references with same alignment using - data dependence information. */ - vec<ddr_p> ddrs = vinfo->ddrs; - struct data_dependence_relation *ddr; - unsigned int i; - - FOR_EACH_VEC_ELT (ddrs, i, ddr) - vect_find_same_alignment_drs (ddr, vinfo); - - vec<data_reference_p> datarefs = vinfo->datarefs; - struct data_reference *dr; - - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr)); - if (STMT_VINFO_VECTORIZABLE (stmt_info) - && !vect_compute_data_ref_alignment (dr)) - { - /* Strided accesses perform only component accesses, misalignment - information is irrelevant for them. */ - if (STMT_VINFO_STRIDED_P (stmt_info) - && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) - continue; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: can't calculate alignment " - "for data ref.\n"); - - return false; - } - } - - return true; -} - - -/* Analyze alignment of DRs of stmts in NODE. */ - -static bool -vect_slp_analyze_and_verify_node_alignment (slp_tree node) -{ - /* We vectorize from the first scalar stmt in the node unless - the node is permuted in which case we start from the first - element in the group. */ - gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0]; - data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); - if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) - first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)); - - data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); - if (! vect_compute_data_ref_alignment (dr) - /* For creating the data-ref pointer we need alignment of the - first element anyway. */ - || (dr != first_dr - && ! vect_compute_data_ref_alignment (first_dr)) - || ! verify_data_ref_alignment (dr)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: bad data alignment in basic " - "block.\n"); - return false; - } - - return true; -} - -/* Function vect_slp_analyze_instance_alignment - - Analyze the alignment of the data-references in the SLP instance. 
- Return FALSE if a data reference is found that cannot be vectorized. */ - -bool -vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) -{ - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_slp_analyze_and_verify_instance_alignment ===\n"); - - slp_tree node; - unsigned i; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) - if (! vect_slp_analyze_and_verify_node_alignment (node)) - return false; - - node = SLP_INSTANCE_TREE (instance); - if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0])) - && ! vect_slp_analyze_and_verify_node_alignment - (SLP_INSTANCE_TREE (instance))) - return false; - - return true; -} - - -/* Analyze groups of accesses: check that DR belongs to a group of - accesses of legal size, step, etc. Detect gaps, single element - interleaving, and other special cases. Set grouped access info. - Collect groups of strided stores for further use in SLP analysis. - Worker for vect_analyze_group_access. */ - -static bool -vect_analyze_group_access_1 (struct data_reference *dr) -{ - tree step = DR_STEP (dr); - tree scalar_type = TREE_TYPE (DR_REF (dr)); - HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); - HOST_WIDE_INT dr_step = -1; - HOST_WIDE_INT groupsize, last_accessed_element = 1; - bool slp_impossible = false; - - /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the - size of the interleaving group (including gaps). */ - if (tree_fits_shwi_p (step)) - { - dr_step = tree_to_shwi (step); - /* Check that STEP is a multiple of type size. Otherwise there is - a non-element-sized gap at the end of the group which we - cannot represent in GROUP_GAP or GROUP_SIZE. - ??? As we can handle non-constant step fine here we should - simply remove uses of GROUP_GAP between the last and first - element and instead rely on DR_STEP. GROUP_SIZE then would - simply not include that gap. */ - if ((dr_step % type_size) != 0) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Step "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, step); - dump_printf (MSG_NOTE, - " is not a multiple of the element size for "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); - dump_printf (MSG_NOTE, "\n"); - } - return false; - } - groupsize = absu_hwi (dr_step) / type_size; - } - else - groupsize = 0; - - /* A non-consecutive access is possible only as part of interleaving. */ - if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) - { - /* Check if this DR is a part of interleaving, and is a single - element of the group that is accessed in the loop. */ - - /* Gaps are supported only for loads. STEP must be a multiple of the type - size. The size of the group must be a power of 2. 
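*/

/* Worked example for the case handled below, under the same
   assumptions: a read such as a[2*i] of 4-byte ints has DR_STEP 8,
   so groupsize = 8 / 4 = 2 and the lone accessed element leaves a
   gap of groupsize - 1.  */

static unsigned
single_element_group_gap (long dr_step, long type_size)
{
  long abs_step = dr_step < 0 ? -dr_step : dr_step;
  unsigned groupsize = (unsigned) (abs_step / type_size);
  return groupsize - 1;   /* The value recorded in GROUP_GAP.  */
}

/* Hence single_element_group_gap (8, 4) == 1, and the power-of-two
   test accepts the a[2*i] group.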
*/ - if (DR_IS_READ (dr) - && (dr_step % type_size) == 0 - && groupsize > 0 - && pow2p_hwi (groupsize)) - { - GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; - GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; - GROUP_GAP (stmt_info) = groupsize - 1; - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Detected single element interleaving "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr)); - dump_printf (MSG_NOTE, " step "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, step); - dump_printf (MSG_NOTE, "\n"); - } - - return true; - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not consecutive access "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (bb_vinfo) - { - /* Mark the statement as unvectorizable. */ - STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; - return true; - } - - dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); - STMT_VINFO_STRIDED_P (stmt_info) = true; - return true; - } - - if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt) - { - /* First stmt in the interleaving chain. Check the chain. */ - gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); - struct data_reference *data_ref = dr; - unsigned int count = 1; - tree prev_init = DR_INIT (data_ref); - gimple *prev = stmt; - HOST_WIDE_INT diff, gaps = 0; - - while (next) - { - /* Skip same data-refs. In case that two or more stmts share - data-ref (supported only for loads), we vectorize only the first - stmt, and the rest get their vectorized loads from the first - one. */ - if (!tree_int_cst_compare (DR_INIT (data_ref), - DR_INIT (STMT_VINFO_DATA_REF ( - vinfo_for_stmt (next))))) - { - if (DR_IS_WRITE (data_ref)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Two store stmts share the same dr.\n"); - return false; - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Two or more load stmts share the same dr.\n"); - - /* For load use the same data-ref load. */ - GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev; - - prev = next; - next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); - continue; - } - - prev = next; - data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next)); - - /* All group members have the same STEP by construction. */ - gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); - - /* Check that the distance between two accesses is equal to the type - size. Otherwise, we have gaps. */ - diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) - - TREE_INT_CST_LOW (prev_init)) / type_size; - if (diff != 1) - { - /* FORNOW: SLP of accesses with gaps is not supported. */ - slp_impossible = true; - if (DR_IS_WRITE (data_ref)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "interleaved store with gaps\n"); - return false; - } - - gaps += diff - 1; - } - - last_accessed_element += diff; - - /* Store the gap from the previous member of the group. If there is no - gap in the access, GROUP_GAP is always 1. */ - GROUP_GAP (vinfo_for_stmt (next)) = diff; - - prev_init = DR_INIT (data_ref); - next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next)); - /* Count the number of data-refs in the chain. */ - count++; - } - - if (groupsize == 0) - groupsize = count + gaps; - - /* This could be UINT_MAX but as we are generating code in a very - inefficient way we have to cap earlier. See PR78699 for example. 
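*/

/* A standalone trace of the chain walk above for element inits
   {0, 4, 12} and 4-byte elements: the element diffs are {1, 2}, i.e.
   one missing element, so groupsize = count + gaps = 3 + 1 = 4 and
   there is no gap after the group.  */
#include <stdio.h>

int
main (void)
{
  long init[] = { 0, 4, 12 };
  long type_size = 4;
  unsigned count = 1, gaps = 0, last = 1;
  for (int i = 1; i < 3; i++)
    {
      long diff = (init[i] - init[i - 1]) / type_size;
      if (diff != 1)
        gaps += diff - 1;   /* Any gap also rules out SLP.  */
      last += diff;
      count++;
    }
  printf ("groupsize %u, gap after group %u\n", count + gaps,
          count + gaps - last);
  return 0;
}

/* The cap below then only rejects absurdly large groups (PR78699).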
*/ - if (groupsize > 4096) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "group is too large\n"); - return false; - } - - /* Check that the size of the interleaving is equal to count for stores, - i.e., that there are no gaps. */ - if (groupsize != count - && !DR_IS_READ (dr)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "interleaved store with gaps\n"); - return false; - } - - /* If there is a gap after the last load in the group it is the - difference between the groupsize and the last accessed - element. - When there is no gap, this difference should be 0. */ - GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element; - - GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Detected interleaving "); - if (DR_IS_READ (dr)) - dump_printf (MSG_NOTE, "load "); - else - dump_printf (MSG_NOTE, "store "); - dump_printf (MSG_NOTE, "of size %u starting with ", - (unsigned)groupsize); - dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); - if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0) - dump_printf_loc (MSG_NOTE, vect_location, - "There is a gap of %u elements after the group\n", - GROUP_GAP (vinfo_for_stmt (stmt))); - } - - /* SLP: create an SLP data structure for every interleaving group of - stores for further analysis in vect_analyse_slp. */ - if (DR_IS_WRITE (dr) && !slp_impossible) - { - if (loop_vinfo) - LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt); - if (bb_vinfo) - BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); - } - } - - return true; -} - -/* Analyze groups of accesses: check that DR belongs to a group of - accesses of legal size, step, etc. Detect gaps, single element - interleaving, and other special cases. Set grouped access info. - Collect groups of strided stores for further use in SLP analysis. */ - -static bool -vect_analyze_group_access (struct data_reference *dr) -{ - if (!vect_analyze_group_access_1 (dr)) - { - /* Dissolve the group if present. */ - gimple *next; - gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr))); - while (stmt) - { - stmt_vec_info vinfo = vinfo_for_stmt (stmt); - next = GROUP_NEXT_ELEMENT (vinfo); - GROUP_FIRST_ELEMENT (vinfo) = NULL; - GROUP_NEXT_ELEMENT (vinfo) = NULL; - stmt = next; - } - return false; - } - return true; -} - -/* Analyze the access pattern of the data-reference DR. - In case of non-consecutive accesses call vect_analyze_group_access() to - analyze groups of accesses. */ - -static bool -vect_analyze_data_ref_access (struct data_reference *dr) -{ - tree step = DR_STEP (dr); - tree scalar_type = TREE_TYPE (DR_REF (dr)); - gimple *stmt = DR_STMT (dr); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct loop *loop = NULL; - - if (loop_vinfo) - loop = LOOP_VINFO_LOOP (loop_vinfo); - - if (loop_vinfo && !step) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "bad data-ref access in loop\n"); - return false; - } - - /* Allow loads with zero step in inner-loop vectorization. */ - if (loop_vinfo && integer_zerop (step)) - { - GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; - if (!nested_in_vect_loop_p (loop, stmt)) - return DR_IS_READ (dr); - /* Allow references with zero step for outer loops marked - with pragma omp simd only - it guarantees absence of - loop-carried dependencies between inner loop iterations. 
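*/

/* What a zero-step load looks like at the source level (illustrative
   function, not vectorizer code): the b[0] read has DR_STEP 0, so a
   single splat of its value can serve every vector iteration.  */

void
add_scalar_to_all (int n, float *restrict a, const float *restrict b)
{
  for (int i = 0; i < n; i++)
    a[i] += b[0];   /* Invariant, read-only: step 0.  */
}

/* Inside a loop nest the same pattern is only allowed when
   force_vectorize (pragma omp simd) rules out inner-loop dependences,
   which is what the check below enforces.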
*/ - if (!loop->force_vectorize) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "zero step in inner loop of nest\n"); - return false; - } - } - - if (loop && nested_in_vect_loop_p (loop, stmt)) - { - /* Interleaved accesses are not yet supported within outer-loop - vectorization for references in the inner-loop. */ - GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; - - /* For the rest of the analysis we use the outer-loop step. */ - step = STMT_VINFO_DR_STEP (stmt_info); - if (integer_zerop (step)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "zero step in outer loop.\n"); - return DR_IS_READ (dr); - } - } - - /* Consecutive? */ - if (TREE_CODE (step) == INTEGER_CST) - { - HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); - if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) - || (dr_step < 0 - && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) - { - /* Mark that it is not interleaving. */ - GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL; - return true; - } - } - - if (loop && nested_in_vect_loop_p (loop, stmt)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "grouped access in outer loop.\n"); - return false; - } - - - /* Assume this is a DR handled by non-constant strided load case. */ - if (TREE_CODE (step) != INTEGER_CST) - return (STMT_VINFO_STRIDED_P (stmt_info) - && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) - || vect_analyze_group_access (dr))); - - /* Not consecutive access - check if it's a part of interleaving group. */ - return vect_analyze_group_access (dr); -} - - - -/* A helper function used in the comparator function to sort data - references. T1 and T2 are two data references to be compared. - The function returns -1, 0, or 1. */ - -static int -compare_tree (tree t1, tree t2) -{ - int i, cmp; - enum tree_code code; - char tclass; - - if (t1 == t2) - return 0; - if (t1 == NULL) - return -1; - if (t2 == NULL) - return 1; - - STRIP_NOPS (t1); - STRIP_NOPS (t2); - - if (TREE_CODE (t1) != TREE_CODE (t2)) - return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1; - - code = TREE_CODE (t1); - switch (code) - { - /* For const values, we can just use hash values for comparisons. */ - case INTEGER_CST: - case REAL_CST: - case FIXED_CST: - case STRING_CST: - case COMPLEX_CST: - case VECTOR_CST: - { - hashval_t h1 = iterative_hash_expr (t1, 0); - hashval_t h2 = iterative_hash_expr (t2, 0); - if (h1 != h2) - return h1 < h2 ? -1 : 1; - break; - } - - case SSA_NAME: - cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2)); - if (cmp != 0) - return cmp; - - if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2)) - return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1; - break; - - default: - tclass = TREE_CODE_CLASS (code); - - /* For var-decl, we could compare their UIDs. */ - if (tclass == tcc_declaration) - { - if (DECL_UID (t1) != DECL_UID (t2)) - return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1; - break; - } - - /* For expressions with operands, compare their operands recursively. */ - for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i) - { - cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i)); - if (cmp != 0) - return cmp; - } - } - - return 0; -} - - -/* Compare two data-references DRA and DRB to group them into chunks - suitable for grouping. 
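*/

/* The same lexicographic-comparator pattern on a toy struct (all
   names hypothetical): compare key by key and fall back to a unique
   id last, so the order is total and the sort is effectively
   stable.  */
#include <stdlib.h>

struct toy_dr { long base, offset, init; unsigned uid; };

static int
toy_dr_cmp (const void *pa, const void *pb)
{
  const struct toy_dr *a = (const struct toy_dr *) pa;
  const struct toy_dr *b = (const struct toy_dr *) pb;
  if (a->base != b->base)
    return a->base < b->base ? -1 : 1;
  if (a->offset != b->offset)
    return a->offset < b->offset ? -1 : 1;
  if (a->init != b->init)
    return a->init < b->init ? -1 : 1;
  return a->uid < b->uid ? -1 : a->uid > b->uid ? 1 : 0;
}

/* qsort (drs, n, sizeof *drs, toy_dr_cmp) then places would-be group
   members next to each other, which is what the real comparator below
   achieves for data references.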
*/ - -static int -dr_group_sort_cmp (const void *dra_, const void *drb_) -{ - data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); - data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); - int cmp; - - /* Stabilize sort. */ - if (dra == drb) - return 0; - - /* DRs in different loops never belong to the same group. */ - loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; - loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; - if (loopa != loopb) - return loopa->num < loopb->num ? -1 : 1; - - /* Ordering of DRs according to base. */ - if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)) - { - cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)); - if (cmp != 0) - return cmp; - } - - /* And according to DR_OFFSET. */ - if (!dr_equal_offsets_p (dra, drb)) - { - cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); - if (cmp != 0) - return cmp; - } - - /* Put reads before writes. */ - if (DR_IS_READ (dra) != DR_IS_READ (drb)) - return DR_IS_READ (dra) ? -1 : 1; - - /* Then sort by access size. */ - if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), - TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0)) - { - cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), - TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); - if (cmp != 0) - return cmp; - } - - /* And by step. */ - if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) - { - cmp = compare_tree (DR_STEP (dra), DR_STEP (drb)); - if (cmp != 0) - return cmp; - } - - /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */ - cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)); - if (cmp == 0) - return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; - return cmp; -} - -/* Function vect_analyze_data_ref_accesses. - - Analyze the access pattern of all the data references in the loop. - - FORNOW: the only access pattern that is considered vectorizable is a - simple step 1 (consecutive) access. - - FORNOW: handle only arrays and pointer accesses. */ - -bool -vect_analyze_data_ref_accesses (vec_info *vinfo) -{ - unsigned int i; - vec<data_reference_p> datarefs = vinfo->datarefs; - struct data_reference *dr; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_analyze_data_ref_accesses ===\n"); - - if (datarefs.is_empty ()) - return true; - - /* Sort the array of datarefs to make building the interleaving chains - linear. Don't modify the original vector's order, it is needed for - determining what dependencies are reversed. */ - vec<data_reference_p> datarefs_copy = datarefs.copy (); - datarefs_copy.qsort (dr_group_sort_cmp); - - /* Build the interleaving chains. */ - for (i = 0; i < datarefs_copy.length () - 1;) - { - data_reference_p dra = datarefs_copy[i]; - stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra)); - stmt_vec_info lastinfo = NULL; - if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a)) - { - ++i; - continue; - } - for (i = i + 1; i < datarefs_copy.length (); ++i) - { - data_reference_p drb = datarefs_copy[i]; - stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb)); - if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b)) - break; - - /* ??? Imperfect sorting (non-compatible types, non-modulo - accesses, same accesses) can lead to a group being artificially - split here as we don't just skip over those. If it really - matters we can push those to a worklist and re-iterate - over them. Then we can just skip ahead to the next DR here. 
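*/

/* A source-level picture of what the chain building pairs up
   (illustrative code): the two field reads below share base, offset
   and step 8, with inits 0 and 4, so the init-difference test
   (init_b - init_a) % type_size == 0 links them into one group.  */

struct pt { int x, y; };

long
sum_fields (int n, const struct pt *a)
{
  long s = 0;
  for (int i = 0; i < n; i++)
    s += a[i].x + a[i].y;
  return s;
}

/* The candidate checks that follow reject unsuitable partners one by
   one before any link is made.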
*/ - - /* DRs in a different loop should not be put into the same - interleaving group. */ - if (gimple_bb (DR_STMT (dra))->loop_father - != gimple_bb (DR_STMT (drb))->loop_father) - break; - - /* Check that the data-refs have same first location (except init) - and they are both either store or load (not load and store, - not masked loads or stores). */ - if (DR_IS_READ (dra) != DR_IS_READ (drb) - || !operand_equal_p (DR_BASE_ADDRESS (dra), - DR_BASE_ADDRESS (drb), 0) - || !dr_equal_offsets_p (dra, drb) - || !gimple_assign_single_p (DR_STMT (dra)) - || !gimple_assign_single_p (DR_STMT (drb))) - break; - - /* Check that the data-refs have the same constant size. */ - tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); - tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); - if (!tree_fits_uhwi_p (sza) - || !tree_fits_uhwi_p (szb) - || !tree_int_cst_equal (sza, szb)) - break; - - /* Check that the data-refs have the same step. */ - if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) - break; - - /* Do not place the same access in the interleaving chain twice. */ - if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0) - break; - - /* Check the types are compatible. - ??? We don't distinguish this during sorting. */ - if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), - TREE_TYPE (DR_REF (drb)))) - break; - - /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ - HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); - HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); - gcc_assert (init_a <= init_b); - - /* If init_b == init_a + the size of the type * k, we have an - interleaving, and DRA is accessed before DRB. */ - HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); - if (type_size_a == 0 - || (init_b - init_a) % type_size_a != 0) - break; - - /* If we have a store, the accesses are adjacent. This splits - groups into chunks we support (we don't support vectorization - of stores with gaps). */ - if (!DR_IS_READ (dra) - && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW - (DR_INIT (datarefs_copy[i-1])) - != type_size_a)) - break; - - /* If the step (if not zero or non-constant) is greater than the - difference between data-refs' inits this splits groups into - suitable sizes. */ - if (tree_fits_shwi_p (DR_STEP (dra))) - { - HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); - if (step != 0 && step <= (init_b - init_a)) - break; - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Detected interleaving "); - if (DR_IS_READ (dra)) - dump_printf (MSG_NOTE, "load "); - else - dump_printf (MSG_NOTE, "store "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb)); - dump_printf (MSG_NOTE, "\n"); - } - - /* Link the found element into the group list. */ - if (!GROUP_FIRST_ELEMENT (stmtinfo_a)) - { - GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra); - lastinfo = stmtinfo_a; - } - GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra); - GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb); - lastinfo = stmtinfo_b; - } - } - - FOR_EACH_VEC_ELT (datarefs_copy, i, dr) - if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) - && !vect_analyze_data_ref_access (dr)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: complicated access pattern.\n"); - - if (is_a <bb_vec_info> (vinfo)) - { - /* Mark the statement as not vectorizable. 
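*/

/* Illustration of the store-adjacency rule applied earlier in this
   loop: the two interleaved stores below have inits 0 and 4 with
   element size 4, so they stay in one group, whereas stores separated
   by a gap would have been split apart (store groups must be
   gap-free).  */

void
interleave_store (int n, int *a, const int *b)
{
  for (int i = 0; i < n; i++)
    {
      a[2 * i] = b[i];        /* init 0.  */
      a[2 * i + 1] = -b[i];   /* init 4: adjacent, same group.  */
    }
}

/* Such gap-free store groups are also what seeds the SLP analysis of
   grouped stores.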
*/ - STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; - continue; - } - else - { - datarefs_copy.release (); - return false; - } - } - - datarefs_copy.release (); - return true; -} - - -/* Operator == between two dr_with_seg_len objects. - - This equality operator is used to make sure two data refs - are the same, so that we will consider combining the - aliasing checks of those two pairs of data dependent data - refs. */ - -static bool -operator == (const dr_with_seg_len& d1, - const dr_with_seg_len& d2) -{ - return operand_equal_p (DR_BASE_ADDRESS (d1.dr), - DR_BASE_ADDRESS (d2.dr), 0) - && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0 - && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0 - && compare_tree (d1.seg_len, d2.seg_len) == 0; -} - -/* Function comp_dr_with_seg_len_pair. - - Comparison function for sorting objects of dr_with_seg_len_pair_t - so that we can combine aliasing checks in one scan. */ - -static int -comp_dr_with_seg_len_pair (const void *pa_, const void *pb_) -{ - const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_; - const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_; - const dr_with_seg_len &a1 = pa->first, &a2 = pa->second; - const dr_with_seg_len &b1 = pb->first, &b2 = pb->second; - - /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks - if a and c have the same base address and step, and b and d have the same - address and step. Therefore, if either a&c or b&d don't have the same address - and step, we don't care about the order of those two pairs after sorting. */ - int comp_res; - - if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr), - DR_BASE_ADDRESS (b1.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr), - DR_BASE_ADDRESS (b2.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0) - return comp_res; - if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0) - return comp_res; - - return 0; -} - -/* Function vect_vfa_segment_size. - - Create an expression that computes the size of the segment - that will be accessed for a data reference. The function takes into - account that realignment loads may access one more vector. - - Input: - DR: The data reference. - LENGTH_FACTOR: segment length to consider. - - Return an expression whose value is the size of the segment which will be - accessed by DR. */ - -static tree -vect_vfa_segment_size (struct data_reference *dr, tree length_factor) -{ - tree segment_length; - - if (integer_zerop (DR_STEP (dr))) - segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); - else - segment_length = size_binop (MULT_EXPR, - fold_convert (sizetype, DR_STEP (dr)), - fold_convert (sizetype, length_factor)); - - if (vect_supportable_dr_alignment (dr, false) - == dr_explicit_realign_optimized) - { - tree vector_size = TYPE_SIZE_UNIT - (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); - - segment_length = size_binop (PLUS_EXPR, segment_length, vector_size); - } - return segment_length; -} - -/* Function vect_no_alias_p. 
- - Given data references A and B with equal base and offset, the alias - relation can be decided at compilation time: return TRUE if they do - not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A - and SEGMENT_LENGTH_B are the memory lengths accessed by A and B - respectively. */ - -static bool -vect_no_alias_p (struct data_reference *a, struct data_reference *b, - tree segment_length_a, tree segment_length_b) -{ - gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST - && TREE_CODE (DR_INIT (b)) == INTEGER_CST); - if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b))) - return false; - - tree seg_a_min = DR_INIT (a); - tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min), - seg_a_min, segment_length_a); - /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT - bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of - [a, a+12) */ - if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0) - { - tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a))); - seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max), - seg_a_max, unit_size); - seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)), - DR_INIT (a), unit_size); - } - tree seg_b_min = DR_INIT (b); - tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min), - seg_b_min, segment_length_b); - if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0) - { - tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b))); - seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max), - seg_b_max, unit_size); - seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)), - DR_INIT (b), unit_size); - } - - if (tree_int_cst_le (seg_a_max, seg_b_min) - || tree_int_cst_le (seg_b_max, seg_a_min)) - return true; - - return false; -} - -/* Function vect_prune_runtime_alias_test_list. - - Prune a list of ddrs to be tested at run-time by versioning for alias. - Merge several alias checks into one if possible. - Return FALSE if resulting list of ddrs is longer than allowed by - PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ - -bool -vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) -{ - vec<ddr_p> may_alias_ddrs = - LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); - vec<dr_with_seg_len_pair_t>& comp_alias_ddrs = - LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); - int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); - tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); - - ddr_p ddr; - unsigned int i; - tree length_factor; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_prune_runtime_alias_test_list ===\n"); - - if (may_alias_ddrs.is_empty ()) - return true; - - /* Basically, for each pair of dependent data refs store_ptr_0 - and load_ptr_0, we create an expression: - - (((store_ptr_0 + store_segment_length_0) <= load_ptr_0) - || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)) - - for aliasing checks. However, in some cases we can decrease - the number of checks by combining two checks into one. For - example, suppose we have another pair of data refs store_ptr_0 - and load_ptr_1, and if the following condition is satisfied: - - load_ptr_0 < load_ptr_1 && - load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0 - - (this condition means, in each iteration of the vectorized loop, - the accessed memory of store_ptr_0 cannot be between the memory - of load_ptr_0 and load_ptr_1.) 
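*/

/* The compile-time disjointness test used by vect_no_alias_p above,
   reduced to plain integers: two right-open segments [min, max) are
   alias-free exactly when one ends before the other begins.  */

static int
segments_disjoint_p (long a_min, long a_max, long b_min, long b_max)
{
  return a_max <= b_min || b_max <= a_min;
}

/* E.g. segments_disjoint_p (0, 16, 16, 32) is 1: segments that merely
   touch never need a run-time check.  Returning to the merging
   example: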
- - we can then use only the following expression to finish the - aliasing checks between store_ptr_0 & load_ptr_0 and - store_ptr_0 & load_ptr_1: - - (((store_ptr_0 + store_segment_length_0) <= load_ptr_0) - || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)) - - Note that we only consider the case where load_ptr_0 and load_ptr_1 have the - same base address. */ - - comp_alias_ddrs.create (may_alias_ddrs.length ()); - - /* First, we collect all data ref pairs for aliasing checks. */ - FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) - { - int comp_res; - struct data_reference *dr_a, *dr_b; - gimple *dr_group_first_a, *dr_group_first_b; - tree segment_length_a, segment_length_b; - gimple *stmt_a, *stmt_b; - - dr_a = DDR_A (ddr); - stmt_a = DR_STMT (DDR_A (ddr)); - dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a)); - if (dr_group_first_a) - { - stmt_a = dr_group_first_a; - dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); - } - - dr_b = DDR_B (ddr); - stmt_b = DR_STMT (DDR_B (ddr)); - dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b)); - if (dr_group_first_b) - { - stmt_b = dr_group_first_b; - dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); - } - - if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)) - length_factor = scalar_loop_iters; - else - length_factor = size_int (vect_factor); - segment_length_a = vect_vfa_segment_size (dr_a, length_factor); - segment_length_b = vect_vfa_segment_size (dr_b, length_factor); - - comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)); - if (comp_res == 0) - comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b)); - - /* Alias is known at compilation time. */ - if (comp_res == 0 - && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST - && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST - && TREE_CODE (segment_length_a) == INTEGER_CST - && TREE_CODE (segment_length_b) == INTEGER_CST) - { - if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b)) - continue; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "not vectorized: compilation time alias.\n"); - - return false; - } - - dr_with_seg_len_pair_t dr_with_seg_len_pair - (dr_with_seg_len (dr_a, segment_length_a), - dr_with_seg_len (dr_b, segment_length_b)); - - /* Canonicalize pairs by sorting the two DR members. */ - if (comp_res > 0) - std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second); - - comp_alias_ddrs.safe_push (dr_with_seg_len_pair); - } - - /* Second, we sort the collected data ref pairs so that we can scan - them once to combine all possible aliasing checks. */ - comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair); - - /* Third, we scan the sorted dr pairs and check if we can combine - alias checks of two neighboring dr pairs. */ - for (size_t i = 1; i < comp_alias_ddrs.length (); ++i) - { - /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */ - dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first, - *dr_b1 = &comp_alias_ddrs[i-1].second, - *dr_a2 = &comp_alias_ddrs[i].first, - *dr_b2 = &comp_alias_ddrs[i].second; - - /* Remove duplicate data ref pairs. 
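*/

/* The merge rule applied a few lines further down, as plain
   arithmetic: when DR_A1 and DR_A2 start a known DIFF bytes apart
   against the same DR_B, one check with segment length
   max (len_a1, diff + len_a2) subsumes both pairs.  */

static unsigned long
merged_seg_len (unsigned long len_a1, unsigned long diff,
                unsigned long len_a2)
{
  unsigned long extended = diff + len_a2;
  return len_a1 > extended ? len_a1 : extended;
}

/* Identical pairs are simpler still and are dropped outright: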
*/ - if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "found equal ranges "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - DR_REF (dr_a1->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - DR_REF (dr_b1->dr)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - DR_REF (dr_a2->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - DR_REF (dr_b2->dr)); - dump_printf (MSG_NOTE, "\n"); - } - - comp_alias_ddrs.ordered_remove (i--); - continue; - } - - if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2) - { - /* We consider the case that DR_B1 and DR_B2 are same memrefs, - and DR_A1 and DR_A2 are two consecutive memrefs. */ - if (*dr_a1 == *dr_a2) - { - std::swap (dr_a1, dr_b1); - std::swap (dr_a2, dr_b2); - } - - if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr), - DR_BASE_ADDRESS (dr_a2->dr), 0) - || !operand_equal_p (DR_OFFSET (dr_a1->dr), - DR_OFFSET (dr_a2->dr), 0) - || !tree_fits_shwi_p (DR_INIT (dr_a1->dr)) - || !tree_fits_shwi_p (DR_INIT (dr_a2->dr))) - continue; - - /* Make sure dr_a1 starts left of dr_a2. */ - if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr))) - std::swap (*dr_a1, *dr_a2); - - bool do_remove = false; - unsigned HOST_WIDE_INT diff - = (tree_to_shwi (DR_INIT (dr_a2->dr)) - - tree_to_shwi (DR_INIT (dr_a1->dr))); - - /* If the left segment does not extend beyond the start of the - right segment the new segment length is that of the right - plus the segment distance. */ - if (tree_fits_uhwi_p (dr_a1->seg_len) - && compare_tree_int (dr_a1->seg_len, diff) <= 0) - { - dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len, - size_int (diff)); - do_remove = true; - } - /* Generally the new segment length is the maximum of the - left segment size and the right segment size plus the distance. - ??? We can also build tree MAX_EXPR here but it's not clear this - is profitable. */ - else if (tree_fits_uhwi_p (dr_a1->seg_len) - && tree_fits_uhwi_p (dr_a2->seg_len)) - { - unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len); - unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len); - dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2)); - do_remove = true; - } - /* Now we check if the following condition is satisfied: - - DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B - - where DIFF = DR_A2_INIT - DR_A1_INIT. However, - SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we - have to make a best estimation. We can get the minimum value - of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B, - then either of the following two conditions can guarantee the - one above: - - 1: DIFF <= MIN_SEG_LEN_B - 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */ - else - { - unsigned HOST_WIDE_INT min_seg_len_b - = (tree_fits_uhwi_p (dr_b1->seg_len) - ? 
tree_to_uhwi (dr_b1->seg_len) - : vect_factor); - - if (diff <= min_seg_len_b - || (tree_fits_uhwi_p (dr_a1->seg_len) - && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b)) - { - dr_a1->seg_len = size_binop (PLUS_EXPR, - dr_a2->seg_len, size_int (diff)); - do_remove = true; - } - } - - if (do_remove) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "merging ranges for "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr)); - dump_printf (MSG_NOTE, " and "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr)); - dump_printf (MSG_NOTE, ", "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr)); - dump_printf (MSG_NOTE, "\n"); - } - comp_alias_ddrs.ordered_remove (i--); - } - } - } - - dump_printf_loc (MSG_NOTE, vect_location, - "improved number of alias checks from %d to %d\n", - may_alias_ddrs.length (), comp_alias_ddrs.length ()); - if ((int) comp_alias_ddrs.length () > - PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "number of versioning for alias " - "run-time tests exceeds %d " - "(--param vect-max-version-for-alias-checks)\n", - PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); - return false; - } - - /* All alias checks have been resolved at compilation time. */ - if (!comp_alias_ddrs.length ()) - LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0); - - return true; -} - -/* Return true if a non-affine read or write in STMT is suitable for a - gather load or scatter store. Describe the operation in *INFO if so. */ - -bool -vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, - gather_scatter_info *info) -{ - HOST_WIDE_INT scale = 1, pbitpos, pbitsize; - struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree offtype = NULL_TREE; - tree decl, base, off; - machine_mode pmode; - int punsignedp, reversep, pvolatilep = 0; - - base = DR_REF (dr); - /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, - see if we can use the def stmt of the address. */ - if (is_gimple_call (stmt) - && gimple_call_internal_p (stmt) - && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD - || gimple_call_internal_fn (stmt) == IFN_MASK_STORE) - && TREE_CODE (base) == MEM_REF - && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME - && integer_zerop (TREE_OPERAND (base, 1)) - && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) - { - gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); - if (is_gimple_assign (def_stmt) - && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) - base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); - } - - /* The gather and scatter builtins need an address of the form - loop_invariant + vector * {1, 2, 4, 8} - or - loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. - Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture - of loop invariants/SSA_NAMEs defined in the loop, with casts, - multiplications and additions in it. To get a vector, we need - a single SSA_NAME that will be defined in the loop and will - contain everything that is not loop invariant and that can be - vectorized. The following code attempts to find such a preexisting - SSA_NAME OFF and put the loop invariants into a tree BASE - that can be gimplified before the loop. 
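*/

/* The source-level shape this decomposition is after (illustrative
   function): for the load below, BASE is p, OFF is the idx[i] value
   and SCALE is sizeof (float) == 4, matching
   loop_invariant + sign_extend (vector) * 4.  */

float
gather_sum (int n, const float *p, const int *idx)
{
  float s = 0;
  for (int i = 0; i < n; i++)
    s += p[idx[i]];   /* Address: p + (long) idx[i] * 4.  */
  return s;
}

/* The decomposition starts from the innermost reference: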
*/ - base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, - &punsignedp, &reversep, &pvolatilep); - gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep); - - if (TREE_CODE (base) == MEM_REF) - { - if (!integer_zerop (TREE_OPERAND (base, 1))) - { - if (off == NULL_TREE) - { - offset_int moff = mem_ref_offset (base); - off = wide_int_to_tree (sizetype, moff); - } - else - off = size_binop (PLUS_EXPR, off, - fold_convert (sizetype, TREE_OPERAND (base, 1))); - } - base = TREE_OPERAND (base, 0); - } - else - base = build_fold_addr_expr (base); - - if (off == NULL_TREE) - off = size_zero_node; - - /* If base is not loop invariant, either off is 0, then we start with just - the constant offset in the loop invariant BASE and continue with base - as OFF, otherwise give up. - We could handle that case by gimplifying the addition of base + off - into some SSA_NAME and use that as off, but for now punt. */ - if (!expr_invariant_in_loop_p (loop, base)) - { - if (!integer_zerop (off)) - return false; - off = base; - base = size_int (pbitpos / BITS_PER_UNIT); - } - /* Otherwise put base + constant offset into the loop invariant BASE - and continue with OFF. */ - else - { - base = fold_convert (sizetype, base); - base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT)); - } - - /* OFF at this point may be either a SSA_NAME or some tree expression - from get_inner_reference. Try to peel off loop invariants from it - into BASE as long as possible. */ - STRIP_NOPS (off); - while (offtype == NULL_TREE) - { - enum tree_code code; - tree op0, op1, add = NULL_TREE; - - if (TREE_CODE (off) == SSA_NAME) - { - gimple *def_stmt = SSA_NAME_DEF_STMT (off); - - if (expr_invariant_in_loop_p (loop, off)) - return false; - - if (gimple_code (def_stmt) != GIMPLE_ASSIGN) - break; - - op0 = gimple_assign_rhs1 (def_stmt); - code = gimple_assign_rhs_code (def_stmt); - op1 = gimple_assign_rhs2 (def_stmt); - } - else - { - if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) - return false; - code = TREE_CODE (off); - extract_ops_from_tree (off, &code, &op0, &op1); - } - switch (code) - { - case POINTER_PLUS_EXPR: - case PLUS_EXPR: - if (expr_invariant_in_loop_p (loop, op0)) - { - add = op0; - off = op1; - do_add: - add = fold_convert (sizetype, add); - if (scale != 1) - add = size_binop (MULT_EXPR, add, size_int (scale)); - base = size_binop (PLUS_EXPR, base, add); - continue; - } - if (expr_invariant_in_loop_p (loop, op1)) - { - add = op1; - off = op0; - goto do_add; - } - break; - case MINUS_EXPR: - if (expr_invariant_in_loop_p (loop, op1)) - { - add = fold_convert (sizetype, op1); - add = size_binop (MINUS_EXPR, size_zero_node, add); - off = op0; - goto do_add; - } - break; - case MULT_EXPR: - if (scale == 1 && tree_fits_shwi_p (op1)) - { - scale = tree_to_shwi (op1); - off = op0; - continue; - } - break; - case SSA_NAME: - off = op0; - continue; - CASE_CONVERT: - if (!POINTER_TYPE_P (TREE_TYPE (op0)) - && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) - break; - if (TYPE_PRECISION (TREE_TYPE (op0)) - == TYPE_PRECISION (TREE_TYPE (off))) - { - off = op0; - continue; - } - if (TYPE_PRECISION (TREE_TYPE (op0)) - < TYPE_PRECISION (TREE_TYPE (off))) - { - off = op0; - offtype = TREE_TYPE (off); - STRIP_NOPS (off); - continue; - } - break; - default: - break; - } - break; - } - - /* If at the end OFF still isn't a SSA_NAME or isn't - defined in the loop, punt. 
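*/

/* A worked example of the peeling loop above (illustrative): for an
   access p[k + idx[i]] with loop-invariant k, the PLUS_EXPR case
   folds k * SCALE into BASE, leaving OFF = idx[i] and SCALE = 4 as
   the gather builtin requires.  */

float
gather_sum_shifted (int n, const float *p, const int *idx, int k)
{
  float s = 0;
  for (int i = 0; i < n; i++)
    s += p[k + idx[i]];   /* BASE = p + (long) k * 4 after peeling.  */
  return s;
}

/* Whatever could not be peeled must itself be the vectorizable
   offset: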
*/ - if (TREE_CODE (off) != SSA_NAME - || expr_invariant_in_loop_p (loop, off)) - return false; - - if (offtype == NULL_TREE) - offtype = TREE_TYPE (off); - - if (DR_IS_READ (dr)) - decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info), - offtype, scale); - else - decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info), - offtype, scale); - - if (decl == NULL_TREE) - return false; - - info->decl = decl; - info->base = base; - info->offset = off; - info->offset_dt = vect_unknown_def_type; - info->offset_vectype = NULL_TREE; - info->scale = scale; - return true; -} - -/* Function vect_analyze_data_refs. - - Find all the data references in the loop or basic block. - - The general structure of the analysis of data refs in the vectorizer is as - follows: - 1- vect_analyze_data_refs(loop/bb): call - compute_data_dependences_for_loop/bb to find and analyze all data-refs - in the loop/bb and their dependences. - 2- vect_analyze_dependences(): apply dependence testing using ddrs. - 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. - 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. - -*/ - -bool -vect_analyze_data_refs (vec_info *vinfo, int *min_vf) -{ - struct loop *loop = NULL; - unsigned int i; - struct data_reference *dr; - tree scalar_type; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "=== vect_analyze_data_refs ===\n"); - - if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) - loop = LOOP_VINFO_LOOP (loop_vinfo); - - /* Go through the data-refs, check that the analysis succeeded. Update - pointer from stmt_vec_info struct to DR and vectype. */ - - vec<data_reference_p> datarefs = vinfo->datarefs; - FOR_EACH_VEC_ELT (datarefs, i, dr) - { - gimple *stmt; - stmt_vec_info stmt_info; - tree base, offset, init; - enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; - bool simd_lane_access = false; - int vf; - -again: - if (!dr || !DR_REF (dr)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: unhandled data-ref\n"); - return false; - } - - stmt = DR_STMT (dr); - stmt_info = vinfo_for_stmt (stmt); - - /* Discard clobbers from the dataref vector. We will remove - clobber stmts during vectorization. */ - if (gimple_clobber_p (stmt)) - { - free_data_ref (dr); - if (i == datarefs.length () - 1) - { - datarefs.pop (); - break; - } - datarefs.ordered_remove (i); - dr = datarefs[i]; - goto again; - } - - /* Check that analysis of the data-ref succeeded. */ - if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) - || !DR_STEP (dr)) - { - bool maybe_gather - = DR_IS_READ (dr) - && !TREE_THIS_VOLATILE (DR_REF (dr)) - && targetm.vectorize.builtin_gather != NULL; - bool maybe_scatter - = DR_IS_WRITE (dr) - && !TREE_THIS_VOLATILE (DR_REF (dr)) - && targetm.vectorize.builtin_scatter != NULL; - bool maybe_simd_lane_access - = is_a <loop_vec_info> (vinfo) && loop->simduid; - - /* If target supports vector gather loads or scatter stores, or if - this might be a SIMD lane access, see if they can't be used. */ - if (is_a <loop_vec_info> (vinfo) - && (maybe_gather || maybe_scatter || maybe_simd_lane_access) - && !nested_in_vect_loop_p (loop, stmt)) - { - struct data_reference *newdr - = create_data_ref (NULL, loop_containing_stmt (stmt), - DR_REF (dr), stmt, maybe_scatter ? 
false : true); - gcc_assert (newdr != NULL && DR_REF (newdr)); - if (DR_BASE_ADDRESS (newdr) - && DR_OFFSET (newdr) - && DR_INIT (newdr) - && DR_STEP (newdr) - && integer_zerop (DR_STEP (newdr))) - { - if (maybe_simd_lane_access) - { - tree off = DR_OFFSET (newdr); - STRIP_NOPS (off); - if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST - && TREE_CODE (off) == MULT_EXPR - && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) - { - tree step = TREE_OPERAND (off, 1); - off = TREE_OPERAND (off, 0); - STRIP_NOPS (off); - if (CONVERT_EXPR_P (off) - && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, - 0))) - < TYPE_PRECISION (TREE_TYPE (off))) - off = TREE_OPERAND (off, 0); - if (TREE_CODE (off) == SSA_NAME) - { - gimple *def = SSA_NAME_DEF_STMT (off); - tree reft = TREE_TYPE (DR_REF (newdr)); - if (is_gimple_call (def) - && gimple_call_internal_p (def) - && (gimple_call_internal_fn (def) - == IFN_GOMP_SIMD_LANE)) - { - tree arg = gimple_call_arg (def, 0); - gcc_assert (TREE_CODE (arg) == SSA_NAME); - arg = SSA_NAME_VAR (arg); - if (arg == loop->simduid - /* For now. */ - && tree_int_cst_equal - (TYPE_SIZE_UNIT (reft), - step)) - { - DR_OFFSET (newdr) = ssize_int (0); - DR_STEP (newdr) = step; - DR_ALIGNED_TO (newdr) - = size_int (BIGGEST_ALIGNMENT); - dr = newdr; - simd_lane_access = true; - } - } - } - } - } - if (!simd_lane_access && (maybe_gather || maybe_scatter)) - { - dr = newdr; - if (maybe_gather) - gatherscatter = GATHER; - else - gatherscatter = SCATTER; - } - } - if (gatherscatter == SG_NONE && !simd_lane_access) - free_data_ref (newdr); - } - - if (gatherscatter == SG_NONE && !simd_lane_access) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: data ref analysis " - "failed "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - return false; - } - } - - if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: base addr of dr is a " - "constant\n"); - - if (is_a <bb_vec_info> (vinfo)) - break; - - if (gatherscatter != SG_NONE || simd_lane_access) - free_data_ref (dr); - return false; - } - - if (TREE_THIS_VOLATILE (DR_REF (dr))) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: volatile type "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - return false; - } - - if (stmt_can_throw_internal (stmt)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: statement can throw an " - "exception "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - if (gatherscatter != SG_NONE || simd_lane_access) - free_data_ref (dr); - return false; - } - - if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF - && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: statement is bitfield " - "access "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - if (gatherscatter != SG_NONE || simd_lane_access) - free_data_ref (dr); - return false; - } - - base = unshare_expr (DR_BASE_ADDRESS (dr)); - offset = unshare_expr (DR_OFFSET (dr)); - init = unshare_expr (DR_INIT (dr)); - - if 
(is_gimple_call (stmt) - && (!gimple_call_internal_p (stmt) - || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD - && gimple_call_internal_fn (stmt) != IFN_MASK_STORE))) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: dr in a call "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - if (gatherscatter != SG_NONE || simd_lane_access) - free_data_ref (dr); - return false; - } - - /* Update DR field in stmt_vec_info struct. */ - - /* If the dataref is in an inner-loop of the loop that is considered - for vectorization, we also want to analyze the access relative to - the outer-loop (DR contains information only relative to the - inner-most enclosing loop). We do that by building a reference to the - first location accessed by the inner-loop, and analyzing it relative to - the outer-loop. */ - if (loop && nested_in_vect_loop_p (loop, stmt)) - { - tree outer_step, outer_base, outer_init; - HOST_WIDE_INT pbitsize, pbitpos; - tree poffset; - machine_mode pmode; - int punsignedp, preversep, pvolatilep; - affine_iv base_iv, offset_iv; - tree dinit; - - /* Build a reference to the first location accessed by the - inner-loop: *(BASE+INIT). (The first location is actually - BASE+INIT+OFFSET, but we add OFFSET separately later). */ - tree inner_base = build_fold_indirect_ref - (fold_build_pointer_plus (base, init)); - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "analyze in outer-loop: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base); - dump_printf (MSG_NOTE, "\n"); - } - - outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos, - &poffset, &pmode, &punsignedp, - &preversep, &pvolatilep); - gcc_assert (outer_base != NULL_TREE); - - if (pbitpos % BITS_PER_UNIT != 0) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "failed: bit offset alignment.\n"); - return false; - } - - if (preversep) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "failed: reverse storage order.\n"); - return false; - } - - outer_base = build_fold_addr_expr (outer_base); - if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base, - &base_iv, false)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "failed: evolution of base is not affine.\n"); - return false; - } - - if (offset) - { - if (poffset) - poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, - poffset); - else - poffset = offset; - } - - if (!poffset) - { - offset_iv.base = ssize_int (0); - offset_iv.step = ssize_int (0); - } - else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset, - &offset_iv, false)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "evolution of offset is not affine.\n"); - return false; - } - - outer_init = ssize_int (pbitpos / BITS_PER_UNIT); - split_constant_offset (base_iv.base, &base_iv.base, &dinit); - outer_init = size_binop (PLUS_EXPR, outer_init, dinit); - split_constant_offset (offset_iv.base, &offset_iv.base, &dinit); - outer_init = size_binop (PLUS_EXPR, outer_init, dinit); - - outer_step = size_binop (PLUS_EXPR, - fold_convert (ssizetype, base_iv.step), - fold_convert (ssizetype, offset_iv.step)); - - STMT_VINFO_DR_STEP (stmt_info) = outer_step; - /* FIXME: Use canonicalize_base_object_address (base_iv.base); */ - STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base; - 
STMT_VINFO_DR_INIT (stmt_info) = outer_init; - STMT_VINFO_DR_OFFSET (stmt_info) = - fold_convert (ssizetype, offset_iv.base); - STMT_VINFO_DR_ALIGNED_TO (stmt_info) = - size_int (highest_pow2_factor (offset_iv.base)); - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "\touter base_address: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); - dump_printf (MSG_NOTE, "\n\touter offset from base address: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_DR_OFFSET (stmt_info)); - dump_printf (MSG_NOTE, - "\n\touter constant offset from base address: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_DR_INIT (stmt_info)); - dump_printf (MSG_NOTE, "\n\touter step: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_DR_STEP (stmt_info)); - dump_printf (MSG_NOTE, "\n\touter aligned to: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_DR_ALIGNED_TO (stmt_info)); - dump_printf (MSG_NOTE, "\n"); - } - } - - if (STMT_VINFO_DATA_REF (stmt_info)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: more than one data ref " - "in stmt: "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - - if (is_a <bb_vec_info> (vinfo)) - break; - - if (gatherscatter != SG_NONE || simd_lane_access) - free_data_ref (dr); - return false; - } - - STMT_VINFO_DATA_REF (stmt_info) = dr; - if (simd_lane_access) - { - STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true; - free_data_ref (datarefs[i]); - datarefs[i] = dr; - } - - /* Set vectype for STMT. */ - scalar_type = TREE_TYPE (DR_REF (dr)); - STMT_VINFO_VECTYPE (stmt_info) - = get_vectype_for_scalar_type (scalar_type); - if (!STMT_VINFO_VECTYPE (stmt_info)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: no vectype for stmt: "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, - scalar_type); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - - if (is_a <bb_vec_info> (vinfo)) - { - /* No vector type is fine, the ref can still participate - in dependence analysis, we just can't vectorize it. */ - STMT_VINFO_VECTORIZABLE (stmt_info) = false; - continue; - } - - if (gatherscatter != SG_NONE || simd_lane_access) - { - STMT_VINFO_DATA_REF (stmt_info) = NULL; - if (gatherscatter != SG_NONE) - free_data_ref (dr); - } - return false; - } - else - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "got vectype for stmt: "); - dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0); - dump_generic_expr (MSG_NOTE, TDF_SLIM, - STMT_VINFO_VECTYPE (stmt_info)); - dump_printf (MSG_NOTE, "\n"); - } - } - - /* Adjust the minimal vectorization factor according to the - vector type. */ - vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); - if (vf > *min_vf) - *min_vf = vf; - - if (gatherscatter != SG_NONE) - { - gather_scatter_info gs_info; - if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo), - &gs_info) - || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset))) - { - STMT_VINFO_DATA_REF (stmt_info) = NULL; - free_data_ref (dr); - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - (gatherscatter == GATHER) ? 
- "not vectorized: not suitable for gather " - "load " : - "not vectorized: not suitable for scatter " - "store "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - return false; - } - - free_data_ref (datarefs[i]); - datarefs[i] = dr; - STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; - } - - else if (is_a <loop_vec_info> (vinfo) - && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) - { - if (nested_in_vect_loop_p (loop, stmt)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: not suitable for strided " - "load "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - } - return false; - } - STMT_VINFO_STRIDED_P (stmt_info) = true; - } - } - - /* If we stopped analysis at the first dataref we could not analyze - when trying to vectorize a basic-block mark the rest of the datarefs - as not vectorizable and truncate the vector of datarefs. That - avoids spending useless time in analyzing their dependence. */ - if (i != datarefs.length ()) - { - gcc_assert (is_a <bb_vec_info> (vinfo)); - for (unsigned j = i; j < datarefs.length (); ++j) - { - data_reference_p dr = datarefs[j]; - STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false; - free_data_ref (dr); - } - datarefs.truncate (i); - } - - return true; -} - - -/* Function vect_get_new_vect_var. - - Returns a name for a new variable. The current naming scheme appends the - prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to - the name of vectorizer generated variables, and appends that to NAME if - provided. */ - -tree -vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) -{ - const char *prefix; - tree new_vect_var; - - switch (var_kind) - { - case vect_simple_var: - prefix = "vect"; - break; - case vect_scalar_var: - prefix = "stmp"; - break; - case vect_mask_var: - prefix = "mask"; - break; - case vect_pointer_var: - prefix = "vectp"; - break; - default: - gcc_unreachable (); - } - - if (name) - { - char* tmp = concat (prefix, "_", name, NULL); - new_vect_var = create_tmp_reg (type, tmp); - free (tmp); - } - else - new_vect_var = create_tmp_reg (type, prefix); - - return new_vect_var; -} - -/* Like vect_get_new_vect_var but return an SSA name. */ - -tree -vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) -{ - const char *prefix; - tree new_vect_var; - - switch (var_kind) - { - case vect_simple_var: - prefix = "vect"; - break; - case vect_scalar_var: - prefix = "stmp"; - break; - case vect_pointer_var: - prefix = "vectp"; - break; - default: - gcc_unreachable (); - } - - if (name) - { - char* tmp = concat (prefix, "_", name, NULL); - new_vect_var = make_temp_ssa_name (type, NULL, tmp); - free (tmp); - } - else - new_vect_var = make_temp_ssa_name (type, NULL, prefix); - - return new_vect_var; -} - -/* Duplicate ptr info and set alignment/misaligment on NAME from DR. */ - -static void -vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr, - stmt_vec_info stmt_info) -{ - duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr)); - unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info)); - int misalign = DR_MISALIGNMENT (dr); - if (misalign == -1) - mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); - else - set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign); -} - -/* Function vect_create_addr_base_for_vector_ref. 
- - Create an expression that computes the address of the first memory location - that will be accessed for a data reference. - - Input: - STMT: The statement containing the data reference. - NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. - OFFSET: Optional. If supplied, it is added to the initial address. - LOOP: Specify the loop nest relative to which the address should be - computed. For example, when the dataref is in an inner-loop nested in an - outer-loop that is now being vectorized, LOOP can be either the - outer-loop, or the inner-loop. The first memory location accessed - by the following dataref ('in' points to short): - - for (i=0; i<N; i++) - for (j=0; j<M; j++) - s += in[i+j] - - is as follows: - if LOOP=i_loop: &in (relative to i_loop) - if LOOP=j_loop: &in+i*2B (relative to j_loop) - BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the - initial address. Unlike OFFSET, which is the number of elements to - be added, BYTE_OFFSET is measured in bytes. - - Output: - 1. Return an SSA_NAME whose value is the address of the memory location of - the first vector of the data reference. - 2. If new_stmt_list is not NULL_TREE after return then the caller must insert - these statement(s) which define the returned SSA_NAME. - - FORNOW: We are only handling array accesses with step 1. */ - -tree -vect_create_addr_base_for_vector_ref (gimple *stmt, - gimple_seq *new_stmt_list, - tree offset, - struct loop *loop, - tree byte_offset) -{ - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree data_ref_base; - const char *base_name; - tree addr_base; - tree dest; - gimple_seq seq = NULL; - tree base_offset; - tree init; - tree vect_ptr_type; - tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - - if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father) - { - struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo); - - gcc_assert (nested_in_vect_loop_p (outer_loop, stmt)); - - data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); - base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); - init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); - } - else - { - data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); - base_offset = unshare_expr (DR_OFFSET (dr)); - init = unshare_expr (DR_INIT (dr)); - } - - if (loop_vinfo) - base_name = get_name (data_ref_base); - else - { - base_offset = ssize_int (0); - init = ssize_int (0); - base_name = get_name (DR_REF (dr)); - } - - /* Create base_offset */ - base_offset = size_binop (PLUS_EXPR, - fold_convert (sizetype, base_offset), - fold_convert (sizetype, init)); - - if (offset) - { - offset = fold_build2 (MULT_EXPR, sizetype, - fold_convert (sizetype, offset), step); - base_offset = fold_build2 (PLUS_EXPR, sizetype, - base_offset, offset); - } - if (byte_offset) - { - byte_offset = fold_convert (sizetype, byte_offset); - base_offset = fold_build2 (PLUS_EXPR, sizetype, - base_offset, byte_offset); - } - - /* base + base_offset */ - if (loop_vinfo) - addr_base = fold_build_pointer_plus (data_ref_base, base_offset); - else - { - addr_base = build1 (ADDR_EXPR, - build_pointer_type (TREE_TYPE (DR_REF (dr))), - unshare_expr (DR_REF (dr))); - } - - vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); - dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); - addr_base = force_gimple_operand (addr_base, &seq, true, dest); - 
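-   /* The address computation was gimplified into SEQ and given the name
-      DEST; the statements are handed back to the caller through
-      NEW_STMT_LIST below.  */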
gimple_seq_add_seq (new_stmt_list, seq); - - if (DR_PTR_INFO (dr) - && TREE_CODE (addr_base) == SSA_NAME - && !SSA_NAME_PTR_INFO (addr_base)) - { - vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info); - if (offset || byte_offset) - mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); - } - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "created "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base); - dump_printf (MSG_NOTE, "\n"); - } - - return addr_base; -} - - -/* Function vect_create_data_ref_ptr. - - Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first - location accessed in the loop by STMT, along with the def-use update - chain to appropriately advance the pointer through the loop iterations. - Also set aliasing information for the pointer. This pointer is used by - the callers to this function to create a memory reference expression for - vector load/store access. - - Input: - 1. STMT: a stmt that references memory. Expected to be of the form - GIMPLE_ASSIGN <name, data-ref> or - GIMPLE_ASSIGN <data-ref, name>. - 2. AGGR_TYPE: the type of the reference, which should be either a vector - or an array. - 3. AT_LOOP: the loop where the vector memref is to be created. - 4. OFFSET (optional): an offset to be added to the initial address accessed - by the data-ref in STMT. - 5. BSI: location where the new stmts are to be placed if there is no loop. - 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain - pointing to the initial address. - 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added - to the initial address accessed by the data-ref in STMT. This is - similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET - is counted in bytes. - - Output: - 1. Declare a new ptr to vector_type, and have it point to the base of the - data reference (initial address accessed by the data reference). - For example, for vector of type V8HI, the following code is generated: - - v8hi *ap; - ap = (v8hi *)initial_address; - - if OFFSET is not supplied: - initial_address = &a[init]; - if OFFSET is supplied: - initial_address = &a[init + OFFSET]; - if BYTE_OFFSET is supplied: - initial_address = &a[init] + BYTE_OFFSET; - - Return the initial_address in INITIAL_ADDRESS. - - 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also - update the pointer in each iteration of the loop. - - Return the increment stmt that updates the pointer in PTR_INCR. - - 3. Set INV_P to true if the access pattern of the data reference in the - vectorized loop is invariant. Set it to false otherwise. - - 4. Return the pointer. 
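- 
-    To illustrate OFFSET vs. BYTE_OFFSET (a hypothetical example; the
-    types and values below are assumed, not taken from any caller): for
-    AGGR_TYPE V8HI over short a[] with init == 4, OFFSET == 2 gives
-    initial_address == &a[4 + 2], because OFFSET counts elements and is
-    scaled by the element size, whereas BYTE_OFFSET == 2 gives
-    initial_address == (char *) &a[4] + 2.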
*/ - -tree -vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop, - tree offset, tree *initial_address, - gimple_stmt_iterator *gsi, gimple **ptr_incr, - bool only_init, bool *inv_p, tree byte_offset) -{ - const char *base_name; - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct loop *loop = NULL; - bool nested_in_vect_loop = false; - struct loop *containing_loop = NULL; - tree aggr_ptr_type; - tree aggr_ptr; - tree new_temp; - gimple_seq new_stmt_list = NULL; - edge pe = NULL; - basic_block new_bb; - tree aggr_ptr_init; - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree aptr; - gimple_stmt_iterator incr_gsi; - bool insert_after; - tree indx_before_incr, indx_after_incr; - gimple *incr; - tree step; - bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); - - gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE - || TREE_CODE (aggr_type) == VECTOR_TYPE); - - if (loop_vinfo) - { - loop = LOOP_VINFO_LOOP (loop_vinfo); - nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); - containing_loop = (gimple_bb (stmt))->loop_father; - pe = loop_preheader_edge (loop); - } - else - { - gcc_assert (bb_vinfo); - only_init = true; - *ptr_incr = NULL; - } - - /* Check the step (evolution) of the load in LOOP, and record - whether it's invariant. */ - if (nested_in_vect_loop) - step = STMT_VINFO_DR_STEP (stmt_info); - else - step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); - - if (integer_zerop (step)) - *inv_p = true; - else - *inv_p = false; - - /* Create an expression for the first address accessed by this load - in LOOP. */ - base_name = get_name (DR_BASE_ADDRESS (dr)); - - if (dump_enabled_p ()) - { - tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); - dump_printf_loc (MSG_NOTE, vect_location, - "create %s-pointer variable to type: ", - get_tree_code_name (TREE_CODE (aggr_type))); - dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type); - if (TREE_CODE (dr_base_type) == ARRAY_TYPE) - dump_printf (MSG_NOTE, " vectorizing an array ref: "); - else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) - dump_printf (MSG_NOTE, " vectorizing a vector ref: "); - else if (TREE_CODE (dr_base_type) == RECORD_TYPE) - dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); - else - dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); - dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr)); - dump_printf (MSG_NOTE, "\n"); - } - - /* (1) Create the new aggregate-pointer variable. - Vector and array types inherit the alias set of their component - type by default so we need to use a ref-all pointer if the data - reference does not conflict with the created aggregated data - reference because it is not addressable. */ - bool need_ref_all = false; - if (!alias_sets_conflict_p (get_alias_set (aggr_type), - get_alias_set (DR_REF (dr)))) - need_ref_all = true; - /* Likewise for any of the data references in the stmt group. 
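-    (A ref-all pointer has alias set zero, so the aggregate accesses
-    created through it conflict with every other access and never claim
-    a more precise alias set than the scalar references they replace.)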
*/ - else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1) - { - gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info); - do - { - stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt); - struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); - if (!alias_sets_conflict_p (get_alias_set (aggr_type), - get_alias_set (DR_REF (sdr)))) - { - need_ref_all = true; - break; - } - orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo); - } - while (orig_stmt); - } - aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, - need_ref_all); - aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); - - - /* Note: If the dataref is in an inner-loop nested in LOOP, and we are - vectorizing LOOP (i.e., outer-loop vectorization), we need to create two - def-use update cycles for the pointer: one relative to the outer-loop - (LOOP), which is what steps (3) and (4) below do. The other is relative - to the inner-loop (which is the inner-most loop containing the dataref), - and this is done by step (5) below. - - When vectorizing inner-most loops, the vectorized loop (LOOP) is also the - inner-most loop, and so steps (3),(4) work the same, and step (5) is - redundant. Steps (3),(4) create the following: - - vp0 = &base_addr; - LOOP: vp1 = phi(vp0,vp2) - ... - ... - vp2 = vp1 + step - goto LOOP - - If there is an inner-loop nested in LOOP, then step (5) will also be - applied, and an additional update in the inner-loop will be created: - - vp0 = &base_addr; - LOOP: vp1 = phi(vp0,vp2) - ... - inner: vp3 = phi(vp1,vp4) - vp4 = vp3 + inner_step - if () goto inner - ... - vp2 = vp1 + step - if () goto LOOP */ - - /* (2) Calculate the initial address of the aggregate-pointer, and set - the aggregate-pointer to point to it before the loop. */ - - /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */ - - new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, - offset, loop, byte_offset); - if (new_stmt_list) - { - if (pe) - { - new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); - gcc_assert (!new_bb); - } - else - gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); - } - - *initial_address = new_temp; - aggr_ptr_init = new_temp; - - /* (3) Handle the updating of the aggregate-pointer inside the loop. - This is needed when ONLY_INIT is false, and also when AT_LOOP is the - inner-loop nested in LOOP (during outer-loop vectorization). */ - - /* No update in loop is required. */ - if (only_init && (!loop_vinfo || at_loop == loop)) - aptr = aggr_ptr_init; - else - { - /* The step of the aggregate pointer is the type size. */ - tree iv_step = TYPE_SIZE_UNIT (aggr_type); - /* One exception to the above is when the scalar step of the load in - LOOP is zero. In this case the step here is also zero. */ - if (*inv_p) - iv_step = size_zero_node; - else if (tree_int_cst_sgn (step) == -1) - iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); - - standard_iv_increment_position (loop, &incr_gsi, &insert_after); - - create_iv (aggr_ptr_init, - fold_convert (aggr_ptr_type, iv_step), - aggr_ptr, loop, &incr_gsi, insert_after, - &indx_before_incr, &indx_after_incr); - incr = gsi_stmt (incr_gsi); - set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); - - /* Copy the points-to information if it exists. 
*/ - if (DR_PTR_INFO (dr)) - { - vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); - vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); - } - if (ptr_incr) - *ptr_incr = incr; - - aptr = indx_before_incr; - } - - if (!nested_in_vect_loop || only_init) - return aptr; - - - /* (4) Handle the updating of the aggregate-pointer inside the inner-loop - nested in LOOP, if exists. */ - - gcc_assert (nested_in_vect_loop); - if (!only_init) - { - standard_iv_increment_position (containing_loop, &incr_gsi, - &insert_after); - create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, - containing_loop, &incr_gsi, insert_after, &indx_before_incr, - &indx_after_incr); - incr = gsi_stmt (incr_gsi); - set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo)); - - /* Copy the points-to information if it exists. */ - if (DR_PTR_INFO (dr)) - { - vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info); - vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info); - } - if (ptr_incr) - *ptr_incr = incr; - - return indx_before_incr; - } - else - gcc_unreachable (); -} - - -/* Function bump_vector_ptr - - Increment a pointer (to a vector type) by vector-size. If requested, - i.e. if PTR-INCR is given, then also connect the new increment stmt - to the existing def-use update-chain of the pointer, by modifying - the PTR_INCR as illustrated below: - - The pointer def-use update-chain before this function: - DATAREF_PTR = phi (p_0, p_2) - .... - PTR_INCR: p_2 = DATAREF_PTR + step - - The pointer def-use update-chain after this function: - DATAREF_PTR = phi (p_0, p_2) - .... - NEW_DATAREF_PTR = DATAREF_PTR + BUMP - .... - PTR_INCR: p_2 = NEW_DATAREF_PTR + step - - Input: - DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated - in the loop. - PTR_INCR - optional. The stmt that updates the pointer in each iteration of - the loop. The increment amount across iterations is expected - to be vector_size. - BSI - location where the new update stmt is to be placed. - STMT - the original scalar memory-access stmt that is being vectorized. - BUMP - optional. The offset by which to bump the pointer. If not given, - the offset is assumed to be vector_size. - - Output: Return NEW_DATAREF_PTR as illustrated above. - -*/ - -tree -bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, - gimple *stmt, tree bump) -{ - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - tree update = TYPE_SIZE_UNIT (vectype); - gassign *incr_stmt; - ssa_op_iter iter; - use_operand_p use_p; - tree new_dataref_ptr; - - if (bump) - update = bump; - - if (TREE_CODE (dataref_ptr) == SSA_NAME) - new_dataref_ptr = copy_ssa_name (dataref_ptr); - else - new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); - incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, - dataref_ptr, update); - vect_finish_stmt_generation (stmt, incr_stmt, gsi); - - /* Copy the points-to information if it exists. */ - if (DR_PTR_INFO (dr)) - { - duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); - mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); - } - - if (!ptr_incr) - return new_dataref_ptr; - - /* Update the vector-pointer's cross-iteration increment. 
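-    That is, rewrite the use of DATAREF_PTR in PTR_INCR to use
-    NEW_DATAREF_PTR instead, producing the chain shown in the comment
-    above; the only other operand PTR_INCR may use is the invariant
-    step UPDATE, which the assert below checks.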
*/ - FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) - { - tree use = USE_FROM_PTR (use_p); - - if (use == dataref_ptr) - SET_USE (use_p, new_dataref_ptr); - else - gcc_assert (tree_int_cst_compare (use, update) == 0); - } - - return new_dataref_ptr; -} - - -/* Function vect_create_destination_var. - - Create a new temporary of type VECTYPE. */ - -tree -vect_create_destination_var (tree scalar_dest, tree vectype) -{ - tree vec_dest; - const char *name; - char *new_name; - tree type; - enum vect_var_kind kind; - - kind = vectype - ? VECTOR_BOOLEAN_TYPE_P (vectype) - ? vect_mask_var - : vect_simple_var - : vect_scalar_var; - type = vectype ? vectype : TREE_TYPE (scalar_dest); - - gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); - - name = get_name (scalar_dest); - if (name) - new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); - else - new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); - vec_dest = vect_get_new_vect_var (type, kind, new_name); - free (new_name); - - return vec_dest; -} - -/* Function vect_grouped_store_supported. - - Returns TRUE if interleave high and interleave low permutations - are supported, and FALSE otherwise. */ - -bool -vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) -{ - machine_mode mode = TYPE_MODE (vectype); - - /* vect_permute_store_chain requires the group size to be equal to 3 or - be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2 or not equal to 3\n"); - return false; - } - - /* Check that the permutation is supported. */ - if (VECTOR_MODE_P (mode)) - { - unsigned int i, nelt = GET_MODE_NUNITS (mode); - unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - - if (count == 3) - { - unsigned int j0 = 0, j1 = 0, j2 = 0; - unsigned int i, j; - - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - if (!can_vec_perm_p (mode, false, sel)) - { - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; - } - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - if (!can_vec_perm_p (mode, false, sel)) - { - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; - } - } - return true; - } - else - { - /* If length is not equal to 3 then only a power of 2 is supported. */ - gcc_assert (pow2p_hwi (count)); - - for (i = 0; i < nelt / 2; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - if (can_vec_perm_p (mode, false, sel)) - { - for (i = 0; i < nelt; i++) - sel[i] += nelt / 2; - if (can_vec_perm_p (mode, false, sel)) - return true; - } - } - } - - if (dump_enabled_p ()) - dump_printf (MSG_MISSED_OPTIMIZATION, - "permutation op not supported by target.\n"); - return false; -} - - -/* Return TRUE if vec_store_lanes is available for COUNT vectors of - type VECTYPE. 
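-    (Targets expose interleaved stores that complete in a single
-    instruction through this optab, e.g. ST2/ST3/ST4 on AArch64; when it
-    is available, the explicit permutations of vect_permute_store_chain
-    below are not needed.)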
*/ - -bool -vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) -{ - return vect_lanes_optab_supported_p ("vec_store_lanes", - vec_store_lanes_optab, - vectype, count); -} - - -/* Function vect_permute_store_chain. - - Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate interleave_high/low stmts to reorder - the data correctly for the stores. Return the final references for stores - in RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to - each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 8 16 24 1 9 17 25 - 2nd vec: 2 10 18 26 3 11 19 27 - 3rd vec: 4 12 20 28 5 13 21 29 - 4th vec: 6 14 22 30 7 15 23 31 - - i.e., we interleave the contents of the four vectors in their order. - - We use interleave_high/low instructions to create such output. The input of - each interleave_high/low operation is two vectors: - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - the even elements of the result vector are obtained left-to-right from the - high/low elements of the first vector. The odd elements of the result are - obtained left-to-right from the high/low elements of the second vector. - The output of interleave_high will be: 0 4 1 5 - and of interleave_low: 2 6 3 7 - - - The permutation is done in log LENGTH stages. In each stage interleave_high - and interleave_low stmts are created for each pair of vectors in DR_CHAIN, - where the first argument is taken from the first half of DR_CHAIN and the - second argument from its second half. - In our example, - - I1: interleave_high (1st vec, 3rd vec) - I2: interleave_low (1st vec, 3rd vec) - I3: interleave_high (2nd vec, 4th vec) - I4: interleave_low (2nd vec, 4th vec) - - The output for the first stage is: - - I1: 0 16 1 17 2 18 3 19 - I2: 4 20 5 21 6 22 7 23 - I3: 8 24 9 25 10 26 11 27 - I4: 12 28 13 29 14 30 15 31 - - The output of the second stage, i.e. the final result is: - - I1: 0 8 16 24 1 9 17 25 - I2: 2 10 18 26 3 11 19 27 - I3: 4 12 20 28 5 13 21 29 - I4: 6 14 22 30 7 15 23 31. 
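- 
-    As a reference model, one interleave stage can be written in scalar C
-    (an illustration only, with plain int elements assumed; this helper is
-    not part of the vectorizer).  HIGH takes the low halves of A and B,
-    LOW their high halves, matching the perm_mask_high and perm_mask_low
-    selectors built below:
- 
-      static void
-      interleave_step (const int *a, const int *b, unsigned int nelt,
-                       int *high, int *low)
-      {
-        for (unsigned int i = 0; i < nelt / 2; i++)
-          {
-            high[2 * i] = a[i];
-            high[2 * i + 1] = b[i];
-            low[2 * i] = a[nelt / 2 + i];
-            low[2 * i + 1] = b[nelt / 2 + i];
-          }
-      }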
*/ - -void -vect_permute_store_chain (vec<tree> dr_chain, - unsigned int length, - gimple *stmt, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree vect1, vect2, high, low; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); - tree perm_mask_low, perm_mask_high; - tree data_ref; - tree perm3_mask_low, perm3_mask_high; - unsigned int i, n, log_length = exact_log2 (length); - unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); - unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - unsigned int j0 = 0, j1 = 0, j2 = 0; - - for (j = 0; j < 3; j++) - { - int nelt0 = ((3 - j) * nelt) % 3; - int nelt1 = ((3 - j) * nelt + 1) % 3; - int nelt2 = ((3 - j) * nelt + 2) % 3; - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = j0++; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = nelt + j1++; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = 0; - } - perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0; i < nelt; i++) - { - if (3 * i + nelt0 < nelt) - sel[3 * i + nelt0] = 3 * i + nelt0; - if (3 * i + nelt1 < nelt) - sel[3 * i + nelt1] = 3 * i + nelt1; - if (3 * i + nelt2 < nelt) - sel[3 * i + nelt2] = nelt + j2++; - } - perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); - - vect1 = dr_chain[0]; - vect2 = dr_chain[1]; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {j, nelt, *, j + 1, nelt + j + 1, *, - j + 2, nelt + j + 2, *, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_low); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - - vect1 = data_ref; - vect2 = dr_chain[2]; - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {0, 1, nelt + j, 3, 4, nelt + j + 1, - 6, 7, nelt + j + 2, ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, - vect2, perm3_mask_high); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only power of 2 is supported. 
*/ - gcc_assert (pow2p_hwi (length)); - - for (i = 0, n = nelt / 2; i < n; i++) - { - sel[i * 2] = i; - sel[i * 2 + 1] = i + nelt; - } - perm_mask_high = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0; i < nelt; i++) - sel[i] += nelt / 2; - perm_mask_low = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0, n = log_length; i < n; i++) - { - for (j = 0; j < length/2; j++) - { - vect1 = dr_chain[j]; - vect2 = dr_chain[j+length/2]; - - /* Create interleaving stmt: - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, - ...}> */ - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); - perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, - vect2, perm_mask_high); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[2*j] = high; - - /* Create interleaving stmt: - low = VEC_PERM_EXPR <vect1, vect2, - {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, - ...}> */ - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); - perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, - vect2, perm_mask_low); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[2*j+1] = low; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - -/* Function vect_setup_realignment - - This function is called when vectorizing an unaligned load using - the dr_explicit_realign[_optimized] scheme. - This function generates the following code at the loop prolog: - - p = initial_addr; - x msq_init = *(floor(p)); # prolog load - realignment_token = call target_builtin; - loop: - x msq = phi (msq_init, ---) - - The stmts marked with x are generated only for the case of - dr_explicit_realign_optimized. - - The code above sets up a new (vector) pointer, pointing to the first - location accessed by STMT, and a "floor-aligned" load using that pointer. - It also generates code to compute the "realignment-token" (if the relevant - target hook was defined), and creates a phi-node at the loop-header bb - whose arguments are the result of the prolog-load (created by this - function) and the result of a load that takes place in the loop (to be - created by the caller to this function). - - For the case of dr_explicit_realign_optimized: - The caller to this function uses the phi-result (msq) to create the - realignment code inside the loop, and sets up the missing phi argument, - as follows: - loop: - msq = phi (msq_init, lsq) - lsq = *(floor(p')); # load in loop - result = realign_load (msq, lsq, realignment_token); - - For the case of dr_explicit_realign: - loop: - msq = *(floor(p)); # load in loop - p' = p + (VS-1); - lsq = *(floor(p')); # load in loop - result = realign_load (msq, lsq, realignment_token); - - Input: - STMT - (scalar) load stmt to be vectorized. This load accesses - a memory location that may be unaligned. - BSI - place where new code is to be inserted. - ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes - is used. - - Output: - REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load - target hook, if defined. - Return value - the result of the loop-header phi node. 
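- 
-    Conceptually (an illustration; the precise semantics are target
-    defined): realign_load (msq, lsq, realignment_token) acts as if msq
-    and lsq were concatenated and VS consecutive elements were extracted
-    starting at the misalignment encoded by the token, e.g. for V4SI
-    misaligned by one element the result is {msq[1], msq[2], msq[3],
-    lsq[0]}.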
*/ - -tree -vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi, - tree *realignment_token, - enum dr_alignment_support alignment_support_scheme, - tree init_addr, - struct loop **at_loop) -{ - stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); - struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); - struct loop *loop = NULL; - edge pe = NULL; - tree scalar_dest = gimple_assign_lhs (stmt); - tree vec_dest; - gimple *inc; - tree ptr; - tree data_ref; - basic_block new_bb; - tree msq_init = NULL_TREE; - tree new_temp; - gphi *phi_stmt; - tree msq = NULL_TREE; - gimple_seq stmts = NULL; - bool inv_p; - bool compute_in_loop = false; - bool nested_in_vect_loop = false; - struct loop *containing_loop = (gimple_bb (stmt))->loop_father; - struct loop *loop_for_initial_load = NULL; - - if (loop_vinfo) - { - loop = LOOP_VINFO_LOOP (loop_vinfo); - nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); - } - - gcc_assert (alignment_support_scheme == dr_explicit_realign - || alignment_support_scheme == dr_explicit_realign_optimized); - - /* We need to generate three things: - 1. the misalignment computation - 2. the extra vector load (for the optimized realignment scheme). - 3. the phi node for the two vectors from which the realignment is - done (for the optimized realignment scheme). */ - - /* 1. Determine where to generate the misalignment computation. - - If INIT_ADDR is NULL_TREE, this indicates that the misalignment - calculation will be generated by this function, outside the loop (in the - preheader). Otherwise, INIT_ADDR had already been computed for us by the - caller, inside the loop. - - Background: If the misalignment remains fixed throughout the iterations of - the loop, then both realignment schemes are applicable, and also the - misalignment computation can be done outside LOOP. This is because we are - vectorizing LOOP, and so the memory accesses in LOOP advance in steps that - are a multiple of VS (the Vector Size), and therefore the misalignment in - different vectorized LOOP iterations is always the same. - The problem arises only if the memory access is in an inner-loop nested - inside LOOP, which is now being vectorized using outer-loop vectorization. - This is the only case when the misalignment of the memory access may not - remain fixed throughout the iterations of the inner-loop (as explained in - detail in vect_supportable_dr_alignment). In this case, not only is the - optimized realignment scheme not applicable, but also the misalignment - computation (and generation of the realignment token that is passed to - REALIGN_LOAD) have to be done inside the loop. - - In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode - or not, which in turn determines if the misalignment is computed inside - the inner-loop, or outside LOOP. */ - - if (init_addr != NULL_TREE || !loop_vinfo) - { - compute_in_loop = true; - gcc_assert (alignment_support_scheme == dr_explicit_realign); - } - - - /* 2. Determine where to generate the extra vector load. - - For the optimized realignment scheme, instead of generating two vector - loads in each iteration, we generate a single extra vector load in the - preheader of the loop, and in each iteration reuse the result of the - vector load from the previous iteration. 
In case the memory access is in - an inner-loop nested inside LOOP, which is now being vectorized using - outer-loop vectorization, we need to determine whether this initial vector - load should be generated at the preheader of the inner-loop, or can be - generated at the preheader of LOOP. If the memory access has no evolution - in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has - to be generated inside LOOP (in the preheader of the inner-loop). */ - - if (nested_in_vect_loop) - { - tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); - bool invariant_in_outerloop = - (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); - loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); - } - else - loop_for_initial_load = loop; - if (at_loop) - *at_loop = loop_for_initial_load; - - if (loop_for_initial_load) - pe = loop_preheader_edge (loop_for_initial_load); - - /* 3. For the case of the optimized realignment, create the first vector - load at the loop preheader. */ - - if (alignment_support_scheme == dr_explicit_realign_optimized) - { - /* Create msq_init = *(floor(p1)) in the loop preheader */ - gassign *new_stmt; - - gcc_assert (!compute_in_loop); - vec_dest = vect_create_destination_var (scalar_dest, vectype); - ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load, - NULL_TREE, &init_addr, NULL, &inc, - true, &inv_p); - if (TREE_CODE (ptr) == SSA_NAME) - new_temp = copy_ssa_name (ptr); - else - new_temp = make_ssa_name (TREE_TYPE (ptr)); - new_stmt = gimple_build_assign - (new_temp, BIT_AND_EXPR, ptr, - build_int_cst (TREE_TYPE (ptr), - -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype))); - new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); - data_ref - = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, - build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); - new_stmt = gimple_build_assign (vec_dest, data_ref); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - if (pe) - { - new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); - } - else - gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); - - msq_init = gimple_assign_lhs (new_stmt); - } - - /* 4. Create realignment token using a target builtin, if available. - It is done either inside the containing loop, or before LOOP (as - determined above). */ - - if (targetm.vectorize.builtin_mask_for_load) - { - gcall *new_stmt; - tree builtin_decl; - - /* Compute INIT_ADDR - the initial address accessed by this memref. */ - if (!init_addr) - { - /* Generate the INIT_ADDR computation outside LOOP. */ - init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, - NULL_TREE, loop); - if (loop) - { - pe = loop_preheader_edge (loop); - new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); - gcc_assert (!new_bb); - } - else - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); - } - - builtin_decl = targetm.vectorize.builtin_mask_for_load (); - new_stmt = gimple_build_call (builtin_decl, 1, init_addr); - vec_dest = - vect_create_destination_var (scalar_dest, - gimple_call_return_type (new_stmt)); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_call_set_lhs (new_stmt, new_temp); - - if (compute_in_loop) - gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); - else - { - /* Generate the misalignment computation outside LOOP. 
*/ - pe = loop_preheader_edge (loop); - new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); - gcc_assert (!new_bb); - } - - *realignment_token = gimple_call_lhs (new_stmt); - - /* The result of the CALL_EXPR to this builtin is determined from - the value of the parameter and no global variables are touched - which makes the builtin a "const" function. Requiring the - builtin to have the "const" attribute makes it unnecessary - to call mark_call_clobbered. */ - gcc_assert (TREE_READONLY (builtin_decl)); - } - - if (alignment_support_scheme == dr_explicit_realign) - return msq; - - gcc_assert (!compute_in_loop); - gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); - - - /* 5. Create msq = phi <msq_init, lsq> in loop */ - - pe = loop_preheader_edge (containing_loop); - vec_dest = vect_create_destination_var (scalar_dest, vectype); - msq = make_ssa_name (vec_dest); - phi_stmt = create_phi_node (msq, containing_loop->header); - add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); - - return msq; -} - - -/* Function vect_grouped_load_supported. - - COUNT is the size of the load group (the number of statements plus the - number of gaps). SINGLE_ELEMENT_P is true if there is actually - only one statement, with a gap of COUNT - 1. - - Returns true if a suitable permute exists. */ - -bool -vect_grouped_load_supported (tree vectype, bool single_element_p, - unsigned HOST_WIDE_INT count) -{ - machine_mode mode = TYPE_MODE (vectype); - - /* If this is single-element interleaving with an element distance - that leaves unused vector loads around, punt: we at least create - very sub-optimal code in that case (and blow up memory, - see PR65518). */ - if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for non-adjacent vector loads\n"); - return false; - } - - /* vect_permute_load_chain requires the group size to be equal to 3 or - be a power of two. */ - if (count != 3 && exact_log2 (count) == -1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2 or not equal to 3\n"); - return false; - } - - /* Check that the permutation is supported. */ - if (VECTOR_MODE_P (mode)) - { - unsigned int i, j, nelt = GET_MODE_NUNITS (mode); - unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - - if (count == 3) - { - unsigned int k; - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - if (!can_vec_perm_p (mode, false, sel)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 loads is not supported by" - " target\n"); - return false; - } - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - if (!can_vec_perm_p (mode, false, sel)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "shuffle of 3 loads is not supported by" - " target\n"); - return false; - } - } - return true; - } - else - { - /* If length is not equal to 3 then only a power of 2 is supported. 
*/ - gcc_assert (pow2p_hwi (count)); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) - { - for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; - if (can_vec_perm_p (mode, false, sel)) - return true; - } - } - } - - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); - return false; -} - -/* Return TRUE if vec_load_lanes is available for COUNT vectors of - type VECTYPE. */ - -bool -vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) -{ - return vect_lanes_optab_supported_p ("vec_load_lanes", - vec_load_lanes_optab, - vectype, count); -} - -/* Function vect_permute_load_chain. - - Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2 or equal to 3, generate extract_even/odd stmts to reorder - the input data correctly. Return the final references for loads in - RESULT_CHAIN. - - E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. - The input is 4 vectors each containing 8 elements. We assign a number to each - element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - 4th vec: 24 25 26 27 28 29 30 31 - - The output sequence should be: - - 1st vec: 0 4 8 12 16 20 24 28 - 2nd vec: 1 5 9 13 17 21 25 29 - 3rd vec: 2 6 10 14 18 22 26 30 - 4th vec: 3 7 11 15 19 23 27 31 - - i.e., the first output vector should contain the first elements of each - interleaving group, etc. - - We use extract_even/odd instructions to create such output. The input of - each extract_even/odd operation is two vectors - 1st vec 2nd vec - 0 1 2 3 4 5 6 7 - - and the output is the vector of extracted even/odd elements. The output of - extract_even will be: 0 2 4 6 - and of extract_odd: 1 3 5 7 - - - The permutation is done in log LENGTH stages. In each stage extract_even - and extract_odd stmts are created for each pair of vectors in DR_CHAIN in - their order. In our example, - - E1: extract_even (1st vec, 2nd vec) - E2: extract_odd (1st vec, 2nd vec) - E3: extract_even (3rd vec, 4th vec) - E4: extract_odd (3rd vec, 4th vec) - - The output for the first stage will be: - - E1: 0 2 4 6 8 10 12 14 - E2: 1 3 5 7 9 11 13 15 - E3: 16 18 20 22 24 26 28 30 - E4: 17 19 21 23 25 27 29 31 - - In order to proceed and create the correct sequence for the next stage (or - for the correct output, if the second stage is the last one, as in our - example), we first put the output of extract_even operation and then the - output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). - The input for the second stage is: - - 1st vec (E1): 0 2 4 6 8 10 12 14 - 2nd vec (E3): 16 18 20 22 24 26 28 30 - 3rd vec (E2): 1 3 5 7 9 11 13 15 - 4th vec (E4): 17 19 21 23 25 27 29 31 - - The output of the second stage: - - E1: 0 4 8 12 16 20 24 28 - E2: 2 6 10 14 18 22 26 30 - E3: 1 5 9 13 17 21 25 29 - E4: 3 7 11 15 19 23 27 31 - - And RESULT_CHAIN after reordering: - - 1st vec (E1): 0 4 8 12 16 20 24 28 - 2nd vec (E3): 1 5 9 13 17 21 25 29 - 3rd vec (E2): 2 6 10 14 18 22 26 30 - 4th vec (E4): 3 7 11 15 19 23 27 31. 
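- 
-    As a reference model, one extract stage can be written in scalar C
-    (an illustration only, with plain int elements assumed; this helper is
-    not part of the vectorizer).  EVEN and ODD receive the even- and
-    odd-indexed elements of the concatenation of A and B, matching the
-    perm_mask_even and perm_mask_odd selectors built below:
- 
-      static void
-      extract_step (const int *a, const int *b, unsigned int nelt,
-                    int *even, int *odd)
-      {
-        for (unsigned int i = 0; i < nelt / 2; i++)
-          {
-            even[i] = a[2 * i];
-            even[nelt / 2 + i] = b[2 * i];
-            odd[i] = a[2 * i + 1];
-            odd[nelt / 2 + i] = b[2 * i + 1];
-          }
-      }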
*/ - -static void -vect_permute_load_chain (vec<tree> dr_chain, - unsigned int length, - gimple *stmt, - gimple_stmt_iterator *gsi, - vec<tree> *result_chain) -{ - tree data_ref, first_vect, second_vect; - tree perm_mask_even, perm_mask_odd; - tree perm3_mask_low, perm3_mask_high; - gimple *perm_stmt; - tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); - unsigned int i, j, log_length = exact_log2 (length); - unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype); - unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - - result_chain->quick_grow (length); - memcpy (result_chain->address (), dr_chain.address (), - length * sizeof (tree)); - - if (length == 3) - { - unsigned int k; - - for (k = 0; k < 3; k++) - { - for (i = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = 3 * i + k; - else - sel[i] = 0; - perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0, j = 0; i < nelt; i++) - if (3 * i + k < 2 * nelt) - sel[i] = i; - else - sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); - - perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel); - - first_vect = dr_chain[0]; - second_vect = dr_chain[1]; - - /* Create interleaving stmt (low part of): - low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k, - ...}> */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_low); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - - /* Create interleaving stmt (high part of): - high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k, - ...}> */ - first_vect = data_ref; - second_vect = dr_chain[2]; - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, - second_vect, perm3_mask_high); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[k] = data_ref; - } - } - else - { - /* If length is not equal to 3 then only a power of 2 is supported. */ - gcc_assert (pow2p_hwi (length)); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel); - - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; - - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_even); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); - perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, - first_vect, second_vect, - perm_mask_odd); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; - } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); - } - } -} - -/* Function vect_shift_permute_load_chain. - - Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a - sequence of stmts to reorder the input data accordingly. - Return the final references for loads in RESULT_CHAIN. - Return true if successful, false otherwise. - - E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. 
- The input is 3 vectors each containing 8 elements. We assign a - number to each element, the input sequence is: - - 1st vec: 0 1 2 3 4 5 6 7 - 2nd vec: 8 9 10 11 12 13 14 15 - 3rd vec: 16 17 18 19 20 21 22 23 - - The output sequence should be: - - 1st vec: 0 3 6 9 12 15 18 21 - 2nd vec: 1 4 7 10 13 16 19 22 - 3rd vec: 2 5 8 11 14 17 20 23 - - We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. - - First we shuffle all 3 vectors to get correct elements order: - - 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) - 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) - 3rd vec: (16 19 22) (17 20 23) (18 21) - - Next we unite and shift the vectors 3 times: - - 1st step: - shift right by 6 the concatenation of: - "1st vec" and "2nd vec" - ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) - "2nd vec" and "3rd vec" - ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) - "3rd vec" and "1st vec" - (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) - 2nd vec: (10 13) (16 19 22) (17 20 23) - 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) - - 2nd step: - shift right by 5 the concatenation of: - "1st vec" and "3rd vec" - ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) - "2nd vec" and "1st vec" - (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) - "3rd vec" and "2nd vec" - (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) - | New vectors | - - So that now new vectors are: - - 1st vec: ( 9 12 15) (18 21) ( 0 3 6) - 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY - - 3rd step: - shift right by 5 the concatenation of: - "1st vec" and "1st vec" - ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) - shift right by 3 the concatenation of: - "2nd vec" and "2nd vec" - (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) - | New vectors | - - So that now all vectors are READY: - 1st vec: ( 0 3 6) ( 9 12 15) (18 21) - 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) - 3rd vec: ( 1 4 7) (10 13) (16 19 22) - - This algorithm is faster than the one in vect_permute_load_chain if: - 1. "shift of a concatenation" is faster than general permutation. - This is usually so. - 2. The TARGET machine can't execute vector instructions in parallel. - This is because each step of the algorithm depends on the previous one. - The algorithm in vect_permute_load_chain is much more parallel. - - The algorithm is applicable only for LOAD CHAIN LENGTH less than VF. 
-*/
-
-static bool
-vect_shift_permute_load_chain (vec<tree> dr_chain,
-                               unsigned int length,
-                               gimple *stmt,
-                               gimple_stmt_iterator *gsi,
-                               vec<tree> *result_chain)
-{
-  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
-  tree perm2_mask1, perm2_mask2, perm3_mask;
-  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
-  gimple *perm_stmt;
-
-  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
-  unsigned int i;
-  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
-  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-
-  result_chain->quick_grow (length);
-  memcpy (result_chain->address (), dr_chain.address (),
-          length * sizeof (tree));
-
-  if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
-    {
-      unsigned int j, log_length = exact_log2 (length);
-
-      /* Generating permutation constant to shuffle a 2-field structure.
-         For vector length 8 it is {0 2 4 6 1 3 5 7}.  */
-      for (i = 0; i < nelt / 2; ++i)
-        sel[i] = i * 2;
-      for (i = 0; i < nelt / 2; ++i)
-        sel[nelt / 2 + i] = i * 2 + 1;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shuffle of 2 fields structure is not \
-supported by target\n");
-          return false;
-        }
-      perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* The same with the two halves swapped.
-         For vector length 8 it is {1 3 5 7 0 2 4 6}.  */
-      for (i = 0; i < nelt / 2; ++i)
-        sel[i] = i * 2 + 1;
-      for (i = 0; i < nelt / 2; ++i)
-        sel[nelt / 2 + i] = i * 2;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shuffle of 2 fields structure is not \
-supported by target\n");
-          return false;
-        }
-      perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to shift all elements.
-         For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
-      for (i = 0; i < nelt; i++)
-        sel[i] = nelt / 2 + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shift permutation is not supported by target\n");
-          return false;
-        }
-      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to select a vector from 2.
-         For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
-      for (i = 0; i < nelt / 2; i++)
-        sel[i] = i;
-      for (i = nelt / 2; i < nelt; i++)
-        sel[i] = nelt + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "select is not supported by target\n");
-          return false;
-        }
-      select_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      for (i = 0; i < log_length; i++)
-        {
-          for (j = 0; j < length; j += 2)
-            {
-              first_vect = dr_chain[j];
-              second_vect = dr_chain[j + 1];
-
-              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                               first_vect, first_vect,
-                                               perm2_mask1);
-              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-              vect[0] = data_ref;
-
-              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
-              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                               second_vect, second_vect,
-                                               perm2_mask2);
-              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-              vect[1] = data_ref;
-
-              data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
-              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                               vect[0], vect[1], shift1_mask);
-              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-              (*result_chain)[j / 2 + length / 2] = data_ref;
-
-              data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
-              perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                               vect[0], vect[1], select_mask);
-              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-              (*result_chain)[j / 2] = data_ref;
-            }
-          memcpy (dr_chain.address (), result_chain->address (),
-                  length * sizeof (tree));
-        }
-      return true;
-    }
-  if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
-    {
-      unsigned int k = 0, l = 0;
-
-      /* Generating permutation constant to get all elements in the right
-         order.  For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
-      for (i = 0; i < nelt; i++)
-        {
-          if (3 * k + (l % 3) >= nelt)
-            {
-              k = 0;
-              l += (3 - (nelt % 3));
-            }
-          sel[i] = 3 * k + (l % 3);
-          k++;
-        }
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shuffle of 3 fields structure is not \
-supported by target\n");
-          return false;
-        }
-      perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to shift all elements.
-         For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
-      for (i = 0; i < nelt; i++)
-        sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shift permutation is not supported by target\n");
-          return false;
-        }
-      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to shift all elements.
-         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
-      for (i = 0; i < nelt; i++)
-        sel[i] = 2 * (nelt / 3) + 1 + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shift permutation is not supported by target\n");
-          return false;
-        }
-      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to shift all elements.
-         For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
-      for (i = 0; i < nelt; i++)
-        sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shift permutation is not supported by target\n");
-          return false;
-        }
-      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      /* Generating permutation constant to shift all elements.
-         For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
-      for (i = 0; i < nelt; i++)
-        sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
-      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "shift permutation is not supported by target\n");
-          return false;
-        }
-      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
-
-      for (k = 0; k < 3; k++)
-        {
-          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
-          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                           dr_chain[k], dr_chain[k],
-                                           perm3_mask);
-          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-          vect[k] = data_ref;
-        }
-
-      for (k = 0; k < 3; k++)
-        {
-          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
-          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                           vect[k % 3], vect[(k + 1) % 3],
-                                           shift1_mask);
-          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-          vect_shift[k] = data_ref;
-        }
-
-      for (k = 0; k < 3; k++)
-        {
-          data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
-          perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
-                                           vect_shift[(4 - k) % 3],
-                                           vect_shift[(3 - k) % 3],
-                                           shift2_mask);
-          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-          vect[k] = data_ref;
-        }
-
-      (*result_chain)[3 - (nelt % 3)] = vect[2];
-
-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
-      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
-                                       vect[0], shift3_mask);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      (*result_chain)[nelt % 3] = data_ref;
-
-      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
-      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
-                                       vect[1], shift4_mask);
-      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-      (*result_chain)[0] = data_ref;
-      return true;
-    }
-  return false;
-}
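The mask arithmetic in the LENGTH == 3 branch is easy to check by hand. A standalone sketch (illustrative only) that recomputes perm3_mask and shift1_mask for nelt == 8 exactly as the loops above do, reproducing the values quoted in the comments:

#include <stdio.h>

int
main (void)
{
  unsigned char sel[8];
  unsigned int nelt = 8, i, k = 0, l = 0;

  /* perm3_mask: gather field 0, then field 1, then field 2 of the
     3-field record, exactly as in the loop above.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += (3 - (nelt % 3));
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  for (i = 0; i < nelt; i++)
    printf ("%d ", sel[i]);
  printf ("\n");                /* Prints: 0 3 6 1 4 7 2 5  */

  /* shift1_mask: shift the 2 * nelt concatenation right by
     2 * (nelt / 3) + (nelt % 3) == 6 elements.  */
  for (i = 0; i < nelt; i++)
    printf ("%d ", 2 * (nelt / 3) + (nelt % 3) + i);
  printf ("\n");                /* Prints: 6 7 8 9 10 11 12 13  */
  return 0;
}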
-
-/* Function vect_transform_grouped_load.
-
-   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
-   to perform their permutation and assign the resulting vectorized statements
-   to the scalar statements.  */
-
-void
-vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
-                             gimple_stmt_iterator *gsi)
-{
-  machine_mode mode;
-  vec<tree> result_chain = vNULL;
-
-  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
-     RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
-     permuted vectors, which are ready for vector computation.  */
-  result_chain.create (size);
-
-  /* If the reassociation width for the vector type is 2 or greater, the
-     target machine can execute 2 or more vector instructions in parallel.
-     Otherwise try to get the chain for the loads group using
-     vect_shift_permute_load_chain.  */
-  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
-  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
-      || pow2p_hwi (size)
-      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
-                                         gsi, &result_chain))
-    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
-  vect_record_grouped_load_vectors (stmt, result_chain);
-  result_chain.release ();
-}
-
-/* RESULT_CHAIN contains the output of a group of grouped loads that were
-   generated as part of the vectorization of STMT.  Assign the statement
-   for each vector to the associated scalar statement.  */
-
-void
-vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
-{
-  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
-  gimple *next_stmt, *new_stmt;
-  unsigned int i, gap_count;
-  tree tmp_data_ref;
-
-  /* Put a permuted data-ref in the VECTORIZED_STMT field.
-     Since we scan the chain starting from its first node, their order
-     corresponds to the order of data-refs in RESULT_CHAIN.  */
-  next_stmt = first_stmt;
-  gap_count = 1;
-  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
-    {
-      if (!next_stmt)
-        break;
-
-      /* Skip the gaps.  Loads created for the gaps will be removed by the
-         dead code elimination pass later.  No need to check for the first
-         stmt in the group, since it always exists.
-         GROUP_GAP is the number of steps in elements from the previous
-         access (if there is no gap GROUP_GAP is 1).  We skip loads that
-         correspond to the gaps.  */
-      if (next_stmt != first_stmt
-          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
-        {
-          gap_count++;
-          continue;
-        }
-
-      while (next_stmt)
-        {
-          new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
-          /* We assume that if VEC_STMT is not NULL, this is a case of multiple
-             copies, and we put the new vector statement in the first available
-             RELATED_STMT.  */
-          if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
-            STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
-          else
-            {
-              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
-                {
-                  gimple *prev_stmt
-                    = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
-                  gimple *rel_stmt
-                    = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
-                  while (rel_stmt)
-                    {
-                      prev_stmt = rel_stmt;
-                      rel_stmt
-                        = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
-                    }
-
-                  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt))
-                    = new_stmt;
-                }
-            }
-
-          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
-          gap_count = 1;
-          /* If NEXT_STMT accesses the same DR as the previous statement,
-             put the same TMP_DATA_REF as its vectorized statement; otherwise
-             get the next data-ref from RESULT_CHAIN.  */
-          if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
-            break;
-        }
-    }
-}
-
-/* Function vect_can_force_dr_alignment_p.
-
-   Returns whether the alignment of DECL can be forced so that it is
-   aligned on an ALIGNMENT-bit boundary.  */
-
-bool
-vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
-{
-  if (!VAR_P (decl))
-    return false;
-
-  if (decl_in_symtab_p (decl)
-      && !symtab_node::get (decl)->can_increase_alignment_p ())
-    return false;
-
-  if (TREE_STATIC (decl))
-    return (alignment <= MAX_OFILE_ALIGNMENT);
-  else
-    return (alignment <= MAX_STACK_ALIGNMENT);
-}
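A source-level analogue of what vect_can_force_dr_alignment_p permits: a definition the compiler owns may be over-aligned (within the MAX_OFILE_ALIGNMENT and MAX_STACK_ALIGNMENT limits checked above), which is how the vectorizer turns would-be unaligned data-refs into dr_aligned ones. A hypothetical, illustrative example, not taken from GCC itself:

#include <stdio.h>

/* TREE_STATIC case: the limit is MAX_OFILE_ALIGNMENT.  */
static int data[256] __attribute__ ((aligned (32)));

int
main (void)
{
  /* Automatic case: the limit is MAX_STACK_ALIGNMENT.  */
  int local[64] __attribute__ ((aligned (32)));

  local[0] = data[0];
  printf ("%p %p\n", (void *) data, (void *) local);
  return local[0];
}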
-
-/* Return whether the data reference DR is supported with respect to its
-   alignment.
-   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
-   if it is aligned, i.e., check if it is possible to vectorize it with a
-   different alignment.  */
-
-enum dr_alignment_support
-vect_supportable_dr_alignment (struct data_reference *dr,
-                               bool check_aligned_accesses)
-{
-  gimple *stmt = DR_STMT (dr);
-  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  machine_mode mode = TYPE_MODE (vectype);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *vect_loop = NULL;
-  bool nested_in_vect_loop = false;
-
-  if (aligned_access_p (dr) && !check_aligned_accesses)
-    return dr_aligned;
-
-  /* For now assume all conditional loads/stores support unaligned
-     access without any special code.  */
-  if (is_gimple_call (stmt)
-      && gimple_call_internal_p (stmt)
-      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
-          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
-    return dr_unaligned_supported;
-
-  if (loop_vinfo)
-    {
-      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
-      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
-    }
-
-  /* Possibly unaligned access.  */
-
-  /* We can choose between using the implicit realignment scheme (generating
-     a misaligned_move stmt) and the explicit realignment scheme (generating
-     aligned loads with a REALIGN_LOAD).  There are two variants to the
-     explicit realignment scheme: optimized, and unoptimized.
-     We can optimize the realignment only if the step between consecutive
-     vector loads is equal to the vector size.  Since the vector memory
-     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
-     is guaranteed that the misalignment amount remains the same throughout the
-     execution of the vectorized loop.  Therefore, we can create the
-     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
-     at the loop preheader.
-
-     However, in the case of outer-loop vectorization, when vectorizing a
-     memory access in the inner-loop nested within the LOOP that is now being
-     vectorized, while it is guaranteed that the misalignment of the
-     vectorized memory access will remain the same in different outer-loop
-     iterations, it is *not* guaranteed that it will remain the same throughout
-     the execution of the inner-loop.  This is because the inner-loop advances
-     with the original scalar step (and not in steps of VS).  If the inner-loop
-     step happens to be a multiple of VS, then the misalignment remains fixed
-     and we can use the optimized realignment scheme.  For example:
-
-      for (i = 0; i < N; i++)
-        for (j = 0; j < M; j++)
-          s += a[i + j];
-
-     When vectorizing the i-loop in the above example, the step between
-     consecutive vector loads is 1, and so the misalignment does not remain
-     fixed across the execution of the inner-loop, and the realignment cannot
-     be optimized (as illustrated in the following pseudo vectorized loop):
-
-      for (i = 0; i < N; i += 4)
-        for (j = 0; j < M; j++)
-          {
-            vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
-                           // when j is {0,1,2,3,4,5,6,7,...} respectively.
-                           // (assuming that we start from an aligned address).
-          }
-
-     We therefore have to use the unoptimized realignment scheme:
-
-      for (i = 0; i < N; i += 4)
-        for (j = k; j < M; j += 4)
-          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
-                         // that the misalignment of the initial address is
-                         // 0).
-
-     The loop can then be vectorized as follows:
-
-      for (k = 0; k < 4; k++)
-        {
-          rt = get_realignment_token (&vp[k]);
-          for (i = 0; i < N; i += 4)
-            {
-              v1 = vp[i+k];
-              for (j = k; j < M; j += 4)
-                {
-                  v2 = vp[i+j+VS-1];
-                  va = REALIGN_LOAD <v1, v2, rt>;
-                  vs += va;
-                  v1 = v2;
-                }
-            }
-        }  */
-
-  if (DR_IS_READ (dr))
-    {
-      bool is_packed = false;
-      tree type = TREE_TYPE (DR_REF (dr));
-
-      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
-          && (!targetm.vectorize.builtin_mask_for_load
-              || targetm.vectorize.builtin_mask_for_load ()))
-        {
-          tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-
-          /* If we are doing SLP then the accesses need not have the
-             same alignment; instead it depends on the SLP group size.  */
-          if (loop_vinfo
-              && STMT_SLP_TYPE (stmt_info)
-              && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-                  * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
-                  % TYPE_VECTOR_SUBPARTS (vectype) != 0))
-            ;
-          else if (!loop_vinfo
-                   || (nested_in_vect_loop
-                       && (TREE_INT_CST_LOW (DR_STEP (dr))
-                           != GET_MODE_SIZE (TYPE_MODE (vectype)))))
-            return dr_explicit_realign;
-          else
-            return dr_explicit_realign_optimized;
-        }
-      if (!known_alignment_for_access_p (dr))
-        is_packed = not_size_aligned (DR_REF (dr));
-
-      if (targetm.vectorize.support_vector_misalignment
-          (mode, type, DR_MISALIGNMENT (dr), is_packed))
-        /* Can't software pipeline the loads, but can at least do them.  */
-        return dr_unaligned_supported;
-    }
-  else
-    {
-      bool is_packed = false;
-      tree type = TREE_TYPE (DR_REF (dr));
-
-      if (!known_alignment_for_access_p (dr))
-        is_packed = not_size_aligned (DR_REF (dr));
-
-      if (targetm.vectorize.support_vector_misalignment
-          (mode, type, DR_MISALIGNMENT (dr), is_packed))
-        return dr_unaligned_supported;
-    }
-
-  /* Unsupported.  */
-  return dr_unaligned_unsupported;
-}
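The explicit realignment scheme described in the comment above can be pictured at the source level: an unaligned vector load at element misalignment K is composed from two aligned loads plus one permutation whose selector (the "realignment token") is {K, K+1, ..., K+VS-1}. A standalone sketch for K == 1 on a 4-element vector; illustrative only, not the code the vectorizer emits:

#include <stdio.h>
#include <string.h>

typedef int v4si __attribute__ ((vector_size (16)));

int
main (void)
{
  static int a[8] __attribute__ ((aligned (16))) =
    { 0, 1, 2, 3, 4, 5, 6, 7 };
  v4si v1, v2;
  memcpy (&v1, &a[0], sizeof v1);   /* aligned load at or below &a[1]  */
  memcpy (&v2, &a[4], sizeof v2);   /* next aligned load               */
  v4si rt = { 1, 2, 3, 4 };         /* realignment token for K == 1    */
  v4si va = __builtin_shuffle (v1, v2, rt);
  printf ("%d %d %d %d\n", va[0], va[1], va[2], va[3]);  /* 1 2 3 4  */
  return 0;
}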