author	bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>	2015-01-21 22:01:24 +0000
committer	bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4>	2015-01-21 22:01:24 +0000
commit	ffde65b31066f17eef243be882bb89a6e19370aa (patch)
tree	ea876d041c0a63eefccdac5416a8678e75da4cfc /gcc/tree-vect-data-refs.c
parent	a8c7acc4db08ce7c8ac3ddcb943f9219e2893792 (diff)
download	gcc-ffde65b31066f17eef243be882bb89a6e19370aa.tar.gz
[.]
2015-01-21  Basile Starynkevitch  <basile@starynkevitch.net>

	{{merged with trunk -i.e. GCC5.0 in stage4- using
	svn merge -r209216:219879 svn+ssh://bstarynk@gcc.gnu.org/svn/gcc/trunk
	but should probably have used
	svn merge -r209216:219879 ^/trunk
	we don't use svnmerge.py anymore since our svn is version 1.8.10}}
	VERY UNSTABLE

2015-01-20  Basile Starynkevitch  <basile@starynkevitch.net>

	Move previous topdir ChangeLog.MELT to ChangeLog.MELT.2008-2014

[contrib/]
2015-01-21  Basile Starynkevitch  <basile@starynkevitch.net>

	* MELT-Plugin-Makefile: Able to make upgrade-melt as a plugin.
	Works for GCC 5.0.  Remove GCC 4.7 old stuff.
	Move previous contrib/ChangeLog.MELT to ChangeLog.MELT.2008-2014

[gcc/]
2015-01-21  Basile Starynkevitch  <basile@starynkevitch.net>

	{{merged with trunk -i.e. GCC5.0 in stage4- using
	svn merge -r209216:219879 svn+ssh://bstarynk@gcc.gnu.org/svn/gcc/trunk
	but should probably have used
	svn merge -r209216:219879 ^/trunk
	**@@@ UNSTABLE since libmelt-ana-gimple.melt not compiling,
	but translator painfully bootstrapping!! @@@@}}

	* toplev.c: Merged manually by keeping MELT extra stuff.
	* toplev.h: Likewise.
	* gengtype.c: Add "melt-runtime.h" in list, but merged with trunk.
	* melt-runtime.h (MELT_VERSION_STRING): Bump to "1.2-pre-merged".
	(meltgc_walk_gimple_seq): Remove.
	(gt_ggc_mx_gimple_statement_d): Same for GCC 4.9 & 5.0.
	* melt-runtime.cc: Update copyright year.
	(ggc_alloc_cleared_melt_valuevector_st, melt_resize_scangcvect):
	Call ggc_internal_cleared_alloc.
	(melt_val2passflag): Skip TODO_verify_ssa, TODO_verify_flow,
	TODO_verify_stmts, TODO_verify_rtl_sharing for GCC 5.0.
	(meltgc_walkstmt_cb, meltgc_walktree_cb)
	(melt_tree_walk_frame_size, meltgc_walk_gimple_seq): Remove.
	(melt_gt_ggc_mx_gimple_seq_d): Call gt_ggc_mx_gimple_statement_base.
	* melt-build-script.tpl: Update copyright year.  Don't symlink
	meltrunsup.h anymore.
	* melt-build-script.sh: Regenerate.
	* melt/warmelt-base.melt: Update copyright year.
	(valdesc_object, valdesc_mapobjects, valdesc_mapstrings)
	(valdesc_multiple, valdesc_closure, valdesc_routine, valdesc_hook)
	(valdesc_bucketlongs, valdesc_jsonobject, valdesc_string)
	(valdesc_strbuf, valdesc_pair, valdesc_list, valdesc_int)
	(valdesc_double, valdesc_mixint, valdesc_mixloc)
	(valdesc_mixbigint, valdesc_real, valdesc_special_data): Use
	ggc_internal_alloc & ggc_internal_cleared_alloc for GCC 5.0.
	(json_canonical_name): Use ISUPPER, ISALPHA, TOUPPER instead of
	their standard <ctype.h> lowercase macros.
	* melt/warmelt-modes.melt: Update copyright year.
	(generate_runtypesupport_forwcopy_fun): Emit both GCC 4.9 & 5.0
	compatible code.
	* melt/libmelt-ana-base.melt: Update copyright year.
	* melt/libmelt-ana-gimple.melt: TO BE IMPROVED.
	* melt/generated/*: Painfully regenerated several times thru
	GCC 4.9 MELT plugin.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@219975 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/tree-vect-data-refs.c')
-rw-r--r--	gcc/tree-vect-data-refs.c	989
1 file changed, 820 insertions(+), 169 deletions(-)
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index fbc35a3fe3c..52d6a869c4e 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -1,5 +1,5 @@
/* Data References Analysis and Manipulation Utilities for Vectorization.
- Copyright (C) 2003-2014 Free Software Foundation, Inc.
+ Copyright (C) 2003-2015 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
and Ira Rosen <irar@il.ibm.com>
@@ -24,10 +24,25 @@ along with GCC; see the file COPYING3. If not see
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "vec.h"
+#include "double-int.h"
+#include "input.h"
+#include "alias.h"
+#include "symtab.h"
+#include "wide-int.h"
+#include "inchash.h"
#include "tree.h"
+#include "fold-const.h"
#include "stor-layout.h"
#include "tm_p.h"
#include "target.h"
+#include "predict.h"
+#include "hard-reg-set.h"
+#include "function.h"
+#include "dominance.h"
+#include "cfg.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
@@ -47,16 +62,34 @@ along with GCC; see the file COPYING3. If not see
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
-#include "dumpfile.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"
+#include "hash-map.h"
+#include "plugin-api.h"
+#include "ipa-ref.h"
#include "cgraph.h"
/* Need to include rtl.h, expr.h, etc. for optabs. */
+#include "hashtab.h"
+#include "rtl.h"
+#include "flags.h"
+#include "statistics.h"
+#include "real.h"
+#include "fixed-value.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "calls.h"
+#include "emit-rtl.h"
+#include "varasm.h"
+#include "stmt.h"
#include "expr.h"
+#include "insn-codes.h"
#include "optabs.h"
+#include "builtins.h"
/* Return true if load- or store-lanes optab OPTAB is implemented for
COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
@@ -65,7 +98,7 @@ static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
tree vectype, unsigned HOST_WIDE_INT count)
{
- enum machine_mode mode, array_mode;
+ machine_mode mode, array_mode;
bool limit_p;
mode = TYPE_MODE (vectype);
@@ -373,11 +406,14 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
.. = a[i+1];
where we will end up loading { a[i], a[i+1] } once, make
sure that inserting group loads before the first load and
- stores after the last store will do the right thing. */
- if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
- && GROUP_SAME_DR_STMT (stmtinfo_a))
- || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
- && GROUP_SAME_DR_STMT (stmtinfo_b)))
+ stores after the last store will do the right thing.
+ Similar for groups like
+ a[i] = ...;
+ ... = a[i];
+ a[i+1] = ...;
+ where loads from the group interleave with the store. */
+ if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
+ || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
{
gimple earlier_stmt;
earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
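
As a concrete (hypothetical, not from the testsuite) instance of the second pattern in the comment above, a loop like the following creates a store group whose load interleaves with the stores, so the group's statements cannot be reordered freely:

/* a[i] and a[i+1] form one interleaved store group; the read of
   a[i] sits between the two stores of the group.  */
void
foo (int *a, const int *b, int n)
{
  for (int i = 0; i + 1 < n; i += 2)
    {
      a[i] = b[i];              /* first store of the group */
      int t = a[i];             /* load from the group's location */
      a[i + 1] = b[i] + t;      /* last store of the group */
    }
}
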
@@ -1066,7 +1102,7 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
elem.npeel = npeel;
- slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
+ slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
if (slot)
slot->count++;
else
@@ -1075,7 +1111,8 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
slot->npeel = npeel;
slot->dr = dr;
slot->count = 1;
- new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
+ new_slot
+ = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
*new_slot = slot;
}
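
The two hunks above only change how LOOP_VINFO_PEELING_HTAB is held (a heap-allocated hash_table instead of a by-value one); the underlying logic of vect_peeling_hash_insert stays an insert-or-increment count of data refs per peeling amount. A minimal sketch of that pattern, using std::unordered_map rather than GCC's hash_table:

#include <unordered_map>

/* Count, for each candidate peeling amount NPEEL, how many data
   references would become aligned by it (sketch only; GCC's slot is
   a _vect_peel_info that also records the DR, as seen above).  */
static std::unordered_map<unsigned, unsigned> peel_count;

static void
record_peeling (unsigned npeel)
{
  ++peel_count[npeel];  /* create the slot at 0 and bump, or bump existing */
}
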
@@ -1195,15 +1232,15 @@ vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
res.inside_cost = INT_MAX;
res.outside_cost = INT_MAX;
LOOP_VINFO_PEELING_HTAB (loop_vinfo)
- .traverse <_vect_peel_extended_info *,
- vect_peeling_hash_get_lowest_cost> (&res);
+ ->traverse <_vect_peel_extended_info *,
+ vect_peeling_hash_get_lowest_cost> (&res);
}
else
{
res.peel_info.count = 0;
LOOP_VINFO_PEELING_HTAB (loop_vinfo)
- .traverse <_vect_peel_extended_info *,
- vect_peeling_hash_get_most_frequent> (&res);
+ ->traverse <_vect_peel_extended_info *,
+ vect_peeling_hash_get_most_frequent> (&res);
}
*npeel = res.peel_info.npeel;
@@ -1395,8 +1432,9 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
size_zero_node) < 0;
/* Save info about DR in the hash table. */
- if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
- LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
+ if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
+ LOOP_VINFO_PEELING_HTAB (loop_vinfo)
+ = new hash_table<peel_info_hasher> (1);
vectype = STMT_VINFO_VECTYPE (stmt_info);
nelements = TYPE_VECTOR_SUBPARTS (vectype);
@@ -1508,10 +1546,20 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
|| !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
do_peeling = false;
- if (do_peeling && all_misalignments_unknown
+ /* If we don't know how many times the peeling loop will run,
+ assume it will run VF-1 times and disable peeling if the remaining
+ iters are less than the vectorization factor. */
+ if (do_peeling
+ && all_misalignments_unknown
+ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
+ do_peeling = false;
+
+ if (do_peeling
+ && all_misalignments_unknown
&& vect_supportable_dr_alignment (dr0, false))
{
-
/* Check if the target requires to prefer stores over loads, i.e., if
misaligned stores are more expensive than misaligned loads (taking
drs with same alignment into account). */
@@ -1598,6 +1646,14 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
&body_cost_vec);
if (!dr0 || !npeel)
do_peeling = false;
+
+ /* If peeling by npeel will result in a remaining loop not iterating
+ enough to be vectorized then do not peel. */
+ if (do_peeling
+ && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+ < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
+ do_peeling = false;
}
if (do_peeling)
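
Both new bail-outs reduce to arithmetic on the known trip count: with an unknown peel amount the prologue is assumed to run VF-1 times, so at least 2*VF-1 iterations are needed, while with a known npeel at least VF + npeel iterations must remain. A standalone restatement of the two conditions (names are illustrative, not GCC's):

/* Sketch of the two guards added above.  */
static bool
peeling_profitable_p (bool niters_known, long niters, long vf,
                      bool npeel_known, long npeel)
{
  if (!niters_known)
    return true;                  /* nothing to check statically */
  if (!npeel_known)
    return niters >= 2 * vf - 1;  /* assume VF - 1 peeled iterations */
  return niters >= vf + npeel;    /* remaining loop must still reach VF */
}
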
@@ -2506,8 +2562,7 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
linear. Don't modify the original vector's order, it is needed for
determining what dependencies are reversed. */
vec<data_reference_p> datarefs_copy = datarefs.copy ();
- qsort (datarefs_copy.address (), datarefs_copy.length (),
- sizeof (data_reference_p), dr_group_sort_cmp);
+ datarefs_copy.qsort (dr_group_sort_cmp);
/* Build the interleaving chains. */
for (i = 0; i < datarefs_copy.length () - 1;)
@@ -2527,11 +2582,14 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
over them. Then we can just skip ahead to the next DR here. */
/* Check that the data-refs have same first location (except init)
- and they are both either store or load (not load and store). */
+ and they are both either store or load (not load and store,
+ not masked loads or stores). */
if (DR_IS_READ (dra) != DR_IS_READ (drb)
|| !operand_equal_p (DR_BASE_ADDRESS (dra),
DR_BASE_ADDRESS (drb), 0)
- || !dr_equal_offsets_p (dra, drb))
+ || !dr_equal_offsets_p (dra, drb)
+ || !gimple_assign_single_p (DR_STMT (dra))
+ || !gimple_assign_single_p (DR_STMT (drb)))
break;
/* Check that the data-refs have the same constant size and step. */
@@ -2677,14 +2735,6 @@ comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
return 0;
}
-template <class T> static void
-swap (T& a, T& b)
-{
- T c (a);
- a = b;
- b = c;
-}
-
/* Function vect_vfa_segment_size.
Create an expression that computes the size of segment
@@ -2817,7 +2867,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
dr_with_seg_len (dr_b, segment_length_b));
if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
- swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
+ std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
}
@@ -2867,8 +2917,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
and DR_A1 and DR_A2 are two consecutive memrefs. */
if (*dr_a1 == *dr_a2)
{
- swap (dr_a1, dr_b1);
- swap (dr_a2, dr_b2);
+ std::swap (dr_a1, dr_b1);
+ std::swap (dr_a2, dr_b2);
}
if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
@@ -2898,15 +2948,13 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
*/
- HOST_WIDE_INT
- min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
- TREE_INT_CST_LOW (dr_b1->seg_len) :
- vect_factor;
+ HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
+ ? tree_to_shwi (dr_b1->seg_len)
+ : vect_factor);
if (diff <= min_seg_len_b
- || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
- && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
- min_seg_len_b))
+ || (tree_fits_shwi_p (dr_a1->seg_len)
+ && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
{
if (dump_enabled_p ())
{
@@ -2956,7 +3004,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree offtype = NULL_TREE;
tree decl, base, off;
- enum machine_mode pmode;
+ machine_mode pmode;
int punsignedp, pvolatilep;
base = DR_REF (dr);
@@ -2999,8 +3047,8 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
{
if (off == NULL_TREE)
{
- double_int moff = mem_ref_offset (base);
- off = double_int_to_tree (sizetype, moff);
+ offset_int moff = mem_ref_offset (base);
+ off = wide_int_to_tree (sizetype, moff);
}
else
off = size_binop (PLUS_EXPR, off,
@@ -3172,7 +3220,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
bool
vect_analyze_data_refs (loop_vec_info loop_vinfo,
bb_vec_info bb_vinfo,
- int *min_vf)
+ int *min_vf, unsigned *n_stmts)
{
struct loop *loop = NULL;
basic_block bb = NULL;
@@ -3207,6 +3255,9 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple stmt = gsi_stmt (gsi);
+ if (is_gimple_debug (stmt))
+ continue;
+ ++*n_stmts;
if (!find_data_references_in_stmt (loop, stmt, &datarefs))
{
if (is_gimple_call (stmt) && loop->safelen)
@@ -3214,7 +3265,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
tree fndecl = gimple_call_fndecl (stmt), op;
if (fndecl != NULL_TREE)
{
- struct cgraph_node *node = cgraph_get_node (fndecl);
+ struct cgraph_node *node = cgraph_node::get (fndecl);
if (node != NULL && node->simd_clones != NULL)
{
unsigned int j, n = gimple_call_num_args (stmt);
@@ -3260,6 +3311,9 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple stmt = gsi_stmt (gsi);
+ if (is_gimple_debug (stmt))
+ continue;
+ ++*n_stmts;
if (!find_data_references_in_stmt (NULL, stmt,
&BB_VINFO_DATAREFS (bb_vinfo)))
{
@@ -3523,7 +3577,7 @@ again:
tree outer_step, outer_base, outer_init;
HOST_WIDE_INT pbitsize, pbitpos;
tree poffset;
- enum machine_mode pmode;
+ machine_mode pmode;
int punsignedp, pvolatilep;
affine_iv base_iv, offset_iv;
tree dinit;
@@ -3832,6 +3886,9 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
is as follows:
if LOOP=i_loop: &in (relative to i_loop)
if LOOP=j_loop: &in+i*2B (relative to j_loop)
+ BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
+ initial address. Unlike OFFSET, which is number of elements to
+ be added, BYTE_OFFSET is measured in bytes.
Output:
1. Return an SSA_NAME whose value is the address of the memory location of
@@ -3845,7 +3902,8 @@ tree
vect_create_addr_base_for_vector_ref (gimple stmt,
gimple_seq *new_stmt_list,
tree offset,
- struct loop *loop)
+ struct loop *loop,
+ tree byte_offset)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
@@ -3898,6 +3956,12 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
base_offset = fold_build2 (PLUS_EXPR, sizetype,
base_offset, offset);
}
+ if (byte_offset)
+ {
+ byte_offset = fold_convert (sizetype, byte_offset);
+ base_offset = fold_build2 (PLUS_EXPR, sizetype,
+ base_offset, byte_offset);
+ }
/* base + base_offset */
if (loop_vinfo)
@@ -3919,8 +3983,12 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
&& TREE_CODE (addr_base) == SSA_NAME)
{
duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
- if (offset)
+ unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
+ int misalign = DR_MISALIGNMENT (dr);
+ if (offset || byte_offset || (misalign == -1))
mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
+ else
+ set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
}
if (dump_enabled_p ())
@@ -3955,6 +4023,10 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
5. BSI: location where the new stmts are to be placed if there is no loop
6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
pointing to the initial address.
+ 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
+ to the initial address accessed by the data-ref in STMT. This is
+ similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
+ in bytes.
Output:
1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -3968,6 +4040,8 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
initial_address = &a[init];
if OFFSET is supplied:
initial_address = &a[init + OFFSET];
+ if BYTE_OFFSET is supplied:
+ initial_address = &a[init] + BYTE_OFFSET;
Return the initial_address in INITIAL_ADDRESS.
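
The OFFSET/BYTE_OFFSET distinction is purely a scaling by the element size. A minimal sketch of the resulting address arithmetic (plain pointers, not the sizetype tree folding GCC actually emits):

/* OFFSET counts elements, BYTE_OFFSET counts bytes, so the initial
   address is &base[init + offset] + byte_offset.  */
static char *
initial_address (char *base, long init, long offset,
                 long byte_offset, long elem_size)
{
  return base + (init + offset) * elem_size + byte_offset;
}
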
@@ -3985,7 +4059,7 @@ tree
vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
tree offset, tree *initial_address,
gimple_stmt_iterator *gsi, gimple *ptr_incr,
- bool only_init, bool *inv_p)
+ bool only_init, bool *inv_p, tree byte_offset)
{
const char *base_name;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4128,10 +4202,10 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
/* (2) Calculate the initial address of the aggregate-pointer, and set
the aggregate-pointer to point to it before the loop. */
- /* Create: (&(base[init_val+offset]) in the loop preheader. */
+ /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
- offset, loop);
+ offset, loop, byte_offset);
if (new_stmt_list)
{
if (pe)
@@ -4282,7 +4356,7 @@ bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
tree update = TYPE_SIZE_UNIT (vectype);
- gimple incr_stmt;
+ gassign *incr_stmt;
ssa_op_iter iter;
use_operand_p use_p;
tree new_dataref_ptr;
@@ -4290,9 +4364,9 @@ bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
if (bump)
update = bump;
- new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
- incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
- dataref_ptr, update);
+ new_dataref_ptr = copy_ssa_name (dataref_ptr);
+ incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
+ dataref_ptr, update);
vect_finish_stmt_generation (stmt, incr_stmt, gsi);
/* Copy the points-to information if it exists. */
@@ -4340,9 +4414,9 @@ vect_create_destination_var (tree scalar_dest, tree vectype)
name = get_name (scalar_dest);
if (name)
- asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
+ new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
else
- asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
+ new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
vec_dest = vect_get_new_vect_var (type, kind, new_name);
free (new_name);
@@ -4357,15 +4431,16 @@ vect_create_destination_var (tree scalar_dest, tree vectype)
bool
vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
- enum machine_mode mode = TYPE_MODE (vectype);
+ machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_store_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_store_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2\n");
+ "the size of the group of accesses"
+ " is not a power of 2 or not equal to 3\n");
return false;
}
@@ -4374,23 +4449,76 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
unsigned int i, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt / 2; i++)
+
+ if (count == 3)
{
- sel[i * 2] = i;
- sel[i * 2 + 1] = i + nelt;
+ unsigned int j0 = 0, j1 = 0, j2 = 0;
+ unsigned int i, j;
+
+ for (j = 0; j < 3; j++)
+ {
+ int nelt0 = ((3 - j) * nelt) % 3;
+ int nelt1 = ((3 - j) * nelt + 1) % 3;
+ int nelt2 = ((3 - j) * nelt + 2) % 3;
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = j0++;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = nelt + j1++;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = 0;
+ }
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf (MSG_MISSED_OPTIMIZATION,
+ "permutation op not supported by target.\n");
+ return false;
+ }
+
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = 3 * i + nelt0;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = 3 * i + nelt1;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = nelt + j2++;
+ }
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf (MSG_MISSED_OPTIMIZATION,
+ "permutation op not supported by target.\n");
+ return false;
+ }
+ }
+ return true;
}
- if (can_vec_perm_p (mode, false, sel))
+ else
{
- for (i = 0; i < nelt; i++)
- sel[i] += nelt / 2;
- if (can_vec_perm_p (mode, false, sel))
- return true;
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
+
+ for (i = 0; i < nelt / 2; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nelt;
+ }
+ if (can_vec_perm_p (mode, false, sel))
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] += nelt / 2;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
}
}
if (dump_enabled_p ())
dump_printf (MSG_MISSED_OPTIMIZATION,
- "interleave op not supported by target.\n");
+ "permutation op not supported by target.\n");
return false;
}
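
The selector construction for the count == 3 case is easiest to follow by printing a mask it produces. This self-contained sketch re-runs the j == 0 iteration of the first loop above for nelt = 8 (the inner loop is copied from the patch; the driver around it is illustrative):

#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;                   /* example element count */
  unsigned sel[8];
  unsigned j = 0, j0 = 0, j1 = 0;
  unsigned nelt0 = ((3 - j) * nelt) % 3;     /* 0 */
  unsigned nelt1 = ((3 - j) * nelt + 1) % 3; /* 1 */
  unsigned nelt2 = ((3 - j) * nelt + 2) % 3; /* 2 */

  /* Lanes destined for the third vector are left as don't-cares (0).  */
  for (unsigned i = 0; i < nelt; i++)
    {
      if (3 * i + nelt0 < nelt) sel[3 * i + nelt0] = j0++;
      if (3 * i + nelt1 < nelt) sel[3 * i + nelt1] = nelt + j1++;
      if (3 * i + nelt2 < nelt) sel[3 * i + nelt2] = 0;
    }
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);                  /* prints: 0 8 0 1 9 0 2 10 */
  printf ("\n");
  return 0;
}
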
@@ -4410,9 +4538,9 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_store_chain.
Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
- a power of 2, generate interleave_high/low stmts to reorder the data
- correctly for the stores. Return the final references for stores in
- RESULT_CHAIN.
+ a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
+ the data correctly for the stores. Return the final references for stores
+ in RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to
@@ -4479,7 +4607,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
tree perm_mask_low, perm_mask_high;
- unsigned int i, n;
+ tree data_ref;
+ tree perm3_mask_low, perm3_mask_high;
+ unsigned int i, n, log_length = exact_log2 (length);
unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
@@ -4487,47 +4617,108 @@ vect_permute_store_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0, n = nelt / 2; i < n; i++)
+ if (length == 3)
{
- sel[i * 2] = i;
- sel[i * 2 + 1] = i + nelt;
- }
- perm_mask_high = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_high != NULL);
+ unsigned int j0 = 0, j1 = 0, j2 = 0;
+
+ for (j = 0; j < 3; j++)
+ {
+ int nelt0 = ((3 - j) * nelt) % 3;
+ int nelt1 = ((3 - j) * nelt + 1) % 3;
+ int nelt2 = ((3 - j) * nelt + 2) % 3;
- for (i = 0; i < nelt; i++)
- sel[i] += nelt / 2;
- perm_mask_low = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_low != NULL);
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = j0++;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = nelt + j1++;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = 0;
+ }
+ perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
- for (i = 0, n = exact_log2 (length); i < n; i++)
- {
- for (j = 0; j < length/2; j++)
- {
- vect1 = dr_chain[j];
- vect2 = dr_chain[j+length/2];
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * i + nelt0 < nelt)
+ sel[3 * i + nelt0] = 3 * i + nelt0;
+ if (3 * i + nelt1 < nelt)
+ sel[3 * i + nelt1] = 3 * i + nelt1;
+ if (3 * i + nelt2 < nelt)
+ sel[3 * i + nelt2] = nelt + j2++;
+ }
+ perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+ vect1 = dr_chain[0];
+ vect2 = dr_chain[1];
/* Create interleaving stmt:
- high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
- high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
- perm_stmt
- = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
- vect1, vect2, perm_mask_high);
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {j, nelt, *, j + 1, nelt + j + 1, *,
+ j + 2, nelt + j + 2, *, ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
+ vect2, perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j] = high;
+ vect1 = data_ref;
+ vect2 = dr_chain[2];
/* Create interleaving stmt:
- low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
- nelt*3/2+1, ...}> */
- low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
- perm_stmt
- = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
- vect1, vect2, perm_mask_low);
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {0, 1, nelt + j, 3, 4, nelt + j + 1,
+ 6, 7, nelt + j + 2, ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
+ vect2, perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[2*j+1] = low;
+ (*result_chain)[j] = data_ref;
+ }
+ }
+ else
+ {
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0, n = nelt / 2; i < n; i++)
+ {
+ sel[i * 2] = i;
+ sel[i * 2 + 1] = i + nelt;
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
+ perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0; i < nelt; i++)
+ sel[i] += nelt / 2;
+ perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0, n = log_length; i < n; i++)
+ {
+ for (j = 0; j < length/2; j++)
+ {
+ vect1 = dr_chain[j];
+ vect2 = dr_chain[j+length/2];
+
+ /* Create interleaving stmt:
+ high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
+ ...}> */
+ high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
+ vect2, perm_mask_high);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2*j] = high;
+
+ /* Create interleaving stmt:
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
+ ...}> */
+ low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
+ vect2, perm_mask_low);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[2*j+1] = low;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
}
}
@@ -4600,11 +4791,10 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
gimple inc;
tree ptr;
tree data_ref;
- gimple new_stmt;
basic_block new_bb;
tree msq_init = NULL_TREE;
tree new_temp;
- gimple phi_stmt;
+ gphi *phi_stmt;
tree msq = NULL_TREE;
gimple_seq stmts = NULL;
bool inv_p;
@@ -4695,15 +4885,16 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
if (alignment_support_scheme == dr_explicit_realign_optimized)
{
/* Create msq_init = *(floor(p1)) in the loop preheader */
+ gassign *new_stmt;
gcc_assert (!compute_in_loop);
vec_dest = vect_create_destination_var (scalar_dest, vectype);
ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
NULL_TREE, &init_addr, NULL, &inc,
true, &inv_p);
- new_temp = copy_ssa_name (ptr, NULL);
- new_stmt = gimple_build_assign_with_ops
- (BIT_AND_EXPR, new_temp, ptr,
+ new_temp = copy_ssa_name (ptr);
+ new_stmt = gimple_build_assign
+ (new_temp, BIT_AND_EXPR, ptr,
build_int_cst (TREE_TYPE (ptr),
-(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
@@ -4731,6 +4922,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
if (targetm.vectorize.builtin_mask_for_load)
{
+ gcall *new_stmt;
tree builtin_decl;
/* Compute INIT_ADDR - the initial address accessed by this memref. */
@@ -4788,7 +4980,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
pe = loop_preheader_edge (containing_loop);
vec_dest = vect_create_destination_var (scalar_dest, vectype);
- msq = make_ssa_name (vec_dest, NULL);
+ msq = make_ssa_name (vec_dest);
phi_stmt = create_phi_node (msq, containing_loop->header);
add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
@@ -4804,38 +4996,78 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
bool
vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{
- enum machine_mode mode = TYPE_MODE (vectype);
+ machine_mode mode = TYPE_MODE (vectype);
- /* vect_permute_load_chain requires the group size to be a power of two. */
- if (exact_log2 (count) == -1)
+ /* vect_permute_load_chain requires the group size to be equal to 3 or
+ be a power of two. */
+ if (count != 3 && exact_log2 (count) == -1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "the size of the group of accesses"
- " is not a power of 2\n");
+ "the size of the group of accesses"
+ " is not a power of 2 or not equal to 3\n");
return false;
}
/* Check that the permutation is supported. */
if (VECTOR_MODE_P (mode))
{
- unsigned int i, nelt = GET_MODE_NUNITS (mode);
+ unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
- for (i = 0; i < nelt; i++)
- sel[i] = i * 2;
- if (can_vec_perm_p (mode, false, sel))
+ if (count == 3)
+ {
+ unsigned int k;
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+ if (!can_vec_perm_p (mode, false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 loads is not supported by"
+ " target\n");
+ return false;
+ }
+ }
+ return true;
+ }
+ else
{
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (count) != -1);
for (i = 0; i < nelt; i++)
- sel[i] = i * 2 + 1;
+ sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel))
- return true;
- }
+ {
+ for (i = 0; i < nelt; i++)
+ sel[i] = i * 2 + 1;
+ if (can_vec_perm_p (mode, false, sel))
+ return true;
+ }
+ }
}
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "extract even/odd not supported by target\n");
+ "extract even/odd not supported by target\n");
return false;
}
@@ -4853,8 +5085,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_load_chain.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
- a power of 2, generate extract_even/odd stmts to reorder the input data
- correctly. Return the final references for loads in RESULT_CHAIN.
+ a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+ the input data correctly. Return the final references for loads in
+ RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4935,6 +5168,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
{
tree data_ref, first_vect, second_vect;
tree perm_mask_even, perm_mask_odd;
+ tree perm3_mask_low, perm3_mask_high;
gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
unsigned int i, j, log_length = exact_log2 (length);
@@ -4945,45 +5179,426 @@ vect_permute_load_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2;
- perm_mask_even = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_even != NULL);
+ if (length == 3)
+ {
+ unsigned int k;
- for (i = 0; i < nelt; ++i)
- sel[i] = i * 2 + 1;
- perm_mask_odd = vect_gen_perm_mask (vectype, sel);
- gcc_assert (perm_mask_odd != NULL);
+ for (k = 0; k < 3; k++)
+ {
+ for (i = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = 3 * i + k;
+ else
+ sel[i] = 0;
+ perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0, j = 0; i < nelt; i++)
+ if (3 * i + k < 2 * nelt)
+ sel[i] = i;
+ else
+ sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+ perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+ first_vect = dr_chain[0];
+ second_vect = dr_chain[1];
+
+ /* Create interleaving stmt (low part of):
+ low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+ ...}> */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
+ second_vect, perm3_mask_low);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- for (i = 0; i < log_length; i++)
+ /* Create interleaving stmt (high part of):
+ high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+ ...}> */
+ first_vect = data_ref;
+ second_vect = dr_chain[2];
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
+ second_vect, perm3_mask_high);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[k] = data_ref;
+ }
+ }
+ else
{
- for (j = 0; j < length; j += 2)
+ /* If length is not equal to 3 then only power of 2 is supported. */
+ gcc_assert (exact_log2 (length) != -1);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2;
+ perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0; i < nelt; ++i)
+ sel[i] = i * 2 + 1;
+ perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0; i < log_length; i++)
{
- first_vect = dr_chain[j];
- second_vect = dr_chain[j+1];
-
- /* data_ref = permute_even (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
- perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
- first_vect, second_vect,
- perm_mask_even);
+ for (j = 0; j < length; j += 2)
+ {
+ first_vect = dr_chain[j];
+ second_vect = dr_chain[j+1];
+
+ /* data_ref = permute_even (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ first_vect, second_vect,
+ perm_mask_even);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2] = data_ref;
+
+ /* data_ref = permute_odd (first_data_ref, second_data_ref); */
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ first_vect, second_vect,
+ perm_mask_odd);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2+length/2] = data_ref;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
+ }
+}
+
+/* Function vect_shift_permute_load_chain.
+
+ Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
+ sequence of stmts to reorder the input data accordingly.
+ Return the final references for loads in RESULT_CHAIN.
+ Return true if successful, false otherwise.
+
+ E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
+ The input is 3 vectors each containing 8 elements. We assign a
+ number to each element, the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+
+ The output sequence should be:
+
+ 1st vec: 0 3 6 9 12 15 18 21
+ 2nd vec: 1 4 7 10 13 16 19 22
+ 3rd vec: 2 5 8 11 14 17 20 23
+
+ We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
+
+ First we shuffle all 3 vectors to get correct elements order:
+
+ 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
+ 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
+ 3rd vec: (16 19 22) (17 20 23) (18 21)
+
+ Next we unite and shift vector 3 times:
+
+ 1st step:
+ shift right by 6 the concatenation of:
+ "1st vec" and "2nd vec"
+ ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
+ "2nd vec" and "3rd vec"
+ ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
+ "3rd vec" and "1st vec"
+ (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
+ 2nd vec: (10 13) (16 19 22) (17 20 23)
+ 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
+
+ 2nd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "3rd vec"
+ ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
+ "2nd vec" and "1st vec"
+ (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
+ "3rd vec" and "2nd vec"
+ (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
+ | New vectors |
+
+ So that now new vectors are:
+
+ 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
+ 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
+
+ 3rd step:
+ shift right by 5 the concatenation of:
+ "1st vec" and "1st vec"
+ ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
+ shift right by 3 the concatenation of:
+ "2nd vec" and "2nd vec"
+ (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
+ | New vectors |
+
+ So that now all vectors are READY:
+ 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
+ 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
+ 3rd vec: ( 1 4 7) (10 13) (16 19 22)
+
+ This algorithm is faster than the one in vect_permute_load_chain if:
+ 1. "shift of a concatenation" is faster than general permutation.
+ This is usually so.
+ 2. The TARGET machine can't execute vector instructions in parallel.
+ This is because each step of the algorithm depends on the previous one.
+ The algorithm in vect_permute_load_chain is much more parallel.
+
+ The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
+*/
+
+static bool
+vect_shift_permute_load_chain (vec<tree> dr_chain,
+ unsigned int length,
+ gimple stmt,
+ gimple_stmt_iterator *gsi,
+ vec<tree> *result_chain)
+{
+ tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
+ tree perm2_mask1, perm2_mask2, perm3_mask;
+ tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
+ gimple perm_stmt;
+
+ tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+ unsigned int i;
+ unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+ result_chain->quick_grow (length);
+ memcpy (result_chain->address (), dr_chain.address (),
+ length * sizeof (tree));
+
+ if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+ {
+ unsigned int j, log_length = exact_log2 (length);
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2 + 1;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0; i < nelt / 2; ++i)
+ sel[i] = i * 2 + 1;
+ for (i = 0; i < nelt / 2; ++i)
+ sel[nelt / 2 + i] = i * 2;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 2 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {4 5 6 7 8 9 10 11}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = nelt / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to select vector from 2.
+ For vector length 8 it is {0 1 2 3 12 13 14 15}. */
+ for (i = 0; i < nelt / 2; i++)
+ sel[i] = i;
+ for (i = nelt / 2; i < nelt; i++)
+ sel[i] = nelt + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "select is not supported by target\n");
+ return false;
+ }
+ select_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (i = 0; i < log_length; i++)
+ {
+ for (j = 0; j < length; j += 2)
+ {
+ first_vect = dr_chain[j];
+ second_vect = dr_chain[j + 1];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ first_vect, first_vect,
+ perm2_mask1);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[0] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ second_vect, second_vect,
+ perm2_mask2);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect[1] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ vect[0], vect[1], shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2 + length/2] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ vect[0], vect[1], select_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[j/2] = data_ref;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
+ return true;
+ }
+ if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
+ {
+ unsigned int k = 0, l = 0;
+
+ /* Generating permutation constant to get all elements in right order.
+ For vector length 8 it is {0 3 6 1 4 7 2 5}. */
+ for (i = 0; i < nelt; i++)
+ {
+ if (3 * k + (l % 3) >= nelt)
+ {
+ k = 0;
+ l += (3 - (nelt % 3));
+ }
+ sel[i] = 3 * k + (l % 3);
+ k++;
+ }
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shuffle of 3 fields structure is not \
+ supported by target\n");
+ return false;
+ }
+ perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {6 7 8 9 10 11 12 13}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + 1 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {3 4 5 6 7 8 9 10}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ /* Generating permutation constant to shift all elements.
+ For vector length 8 it is {5 6 7 8 9 10 11 12}. */
+ for (i = 0; i < nelt; i++)
+ sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
+ if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "shift permutation is not supported by target\n");
+ return false;
+ }
+ shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ dr_chain[k], dr_chain[k],
+ perm3_mask);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2] = data_ref;
+ vect[k] = data_ref;
+ }
+
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ vect[k % 3], vect[(k + 1) % 3],
+ shift1_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ vect_shift[k] = data_ref;
+ }
- /* data_ref = permute_odd (first_data_ref, second_data_ref); */
- data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
- perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
- first_vect, second_vect,
- perm_mask_odd);
+ for (k = 0; k < 3; k++)
+ {
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+ vect_shift[(4 - k) % 3],
+ vect_shift[(3 - k) % 3],
+ shift2_mask);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
- (*result_chain)[j/2+length/2] = data_ref;
+ vect[k] = data_ref;
}
- memcpy (dr_chain.address (), result_chain->address (),
- length * sizeof (tree));
+
+ (*result_chain)[3 - (nelt % 3)] = vect[2];
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
+ vect[0], shift3_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[nelt % 3] = data_ref;
+
+ data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
+ perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
+ vect[1], shift4_mask);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ (*result_chain)[0] = data_ref;
+ return true;
}
+ return false;
}
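
The mask values quoted in the comments of the new function can be checked with a few lines of standalone code. This sketch reproduces the element-reordering selector and the first shift selector for nelt = 8 (loops copied from the patch; the driver is illustrative):

#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;  /* example vector length, as in the comments */
  unsigned sel[8];
  unsigned k = 0, l = 0;

  /* Element-reordering mask; expected {0 3 6 1 4 7 2 5}.  */
  for (unsigned i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);
  printf ("\n");

  /* First shift mask, 2 * (nelt / 3) + (nelt % 3) + i; expected {6 .. 13}.  */
  for (unsigned i = 0; i < nelt; i++)
    printf ("%u ", 2 * (nelt / 3) + (nelt % 3) + i);
  printf ("\n");
  return 0;
}
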
-
/* Function vect_transform_grouped_load.
Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
@@ -4995,13 +5610,23 @@ void
vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
gimple_stmt_iterator *gsi)
{
+ machine_mode mode;
vec<tree> result_chain = vNULL;
/* DR_CHAIN contains input data-refs that are a part of the interleaving.
RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
vectors, that are ready for vector computation. */
result_chain.create (size);
- vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
+
+ /* If the reassociation width for the vector type is 2 or greater, the
+ target machine can execute 2 or more vector instructions in parallel.
+ Otherwise try to get a chain for the load group using
+ vect_shift_permute_load_chain. */
+ mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
+ if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
+ || exact_log2 (size) != -1
+ || !vect_shift_permute_load_chain (dr_chain, size, stmt,
+ gsi, &result_chain))
+ vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
vect_record_grouped_load_vectors (stmt, result_chain);
result_chain.release ();
}
@@ -5091,20 +5716,33 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
if (TREE_CODE (decl) != VAR_DECL)
return false;
- /* We cannot change alignment of common or external symbols as another
- translation unit may contain a definition with lower alignment.
- The rules of common symbol linking mean that the definition
- will override the common symbol. The same is true for constant
- pool entries which may be shared and are not properly merged
- by LTO. */
- if (DECL_EXTERNAL (decl)
- || DECL_COMMON (decl)
- || DECL_IN_CONSTANT_POOL (decl))
+ /* With -fno-toplevel-reorder we may have already output the constant. */
+ if (TREE_ASM_WRITTEN (decl))
return false;
- if (TREE_ASM_WRITTEN (decl))
+ /* Constant pool entries may be shared and not properly merged by LTO. */
+ if (DECL_IN_CONSTANT_POOL (decl))
return false;
+ if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
+ {
+ symtab_node *snode;
+
+ /* We cannot change alignment of symbols that may bind to symbols
+ in another translation unit that may contain a definition with lower
+ alignment. */
+ if (!decl_binds_to_current_def_p (decl))
+ return false;
+
+ /* When compiling a partition, be sure the symbol is not output by another
+ partition. */
+ snode = symtab_node::get (decl);
+ if (flag_ltrans
+ && (snode->in_other_partition
+ || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
+ return false;
+ }
+
/* Do not override the alignment as specified by the ABI when the used
attribute is set. */
if (DECL_PRESERVE_P (decl))
@@ -5113,10 +5751,23 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
/* Do not override explicit alignment set by the user when an explicit
section name is also used. This is a common idiom used by many
software projects. */
- if (DECL_SECTION_NAME (decl) != NULL_TREE
- && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
+ if (TREE_STATIC (decl)
+ && DECL_SECTION_NAME (decl) != NULL
+ && !symtab_node::get (decl)->implicit_section)
return false;
+ /* If symbol is an alias, we need to check that target is OK. */
+ if (TREE_STATIC (decl))
+ {
+ tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
+ if (target != decl)
+ {
+ if (DECL_PRESERVE_P (target))
+ return false;
+ decl = target;
+ }
+ }
+
if (TREE_STATIC (decl))
return (alignment <= MAX_OFILE_ALIGNMENT);
else
@@ -5137,7 +5788,7 @@ vect_supportable_dr_alignment (struct data_reference *dr,
gimple stmt = DR_STMT (dr);
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- enum machine_mode mode = TYPE_MODE (vectype);
+ machine_mode mode = TYPE_MODE (vectype);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *vect_loop = NULL;
bool nested_in_vect_loop = false;