Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r-- | gcc/tree-vect-loop.c | 1100
1 files changed, 752 insertions, 348 deletions
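Editor's note: the reworked vect_is_simple_reduction in the diff below adds a walk over the SSA path from the loop PHI result to its latch argument, recording each use along the way and tracking sign flips ("neg") when a MINUS_EXPR occurs inside a PLUS_EXPR reduction. As a minimal, hypothetical illustration (not part of the patch), a subtraction-written accumulation is one shape this classification is meant to cover:

/* Illustrative only: a scalar reduction written with subtraction.
   The detection treats the MINUS_EXPR as a PLUS_EXPR and tracks whether
   the running value is negated along the use path; an even (here: zero)
   number of negations keeps the loop classifiable as a reduction.  */
int
sum_minus (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum -= a[i];
  return sum;
}

A path with an odd number of negations (for example sum = a[i] - sum) fails the final "! neg" test in the patch and the statement is reported as an unknown reduction pattern.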
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index a7c3d3d7e29..8740a7573ac 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -890,8 +890,10 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = vect_reduction_def; /* Store the reduction cycles for possible vectorization in - loop-aware SLP. */ - LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt); + loop-aware SLP if it was not detected as reduction + chain. */ + if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt))) + LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt); } } } @@ -1776,6 +1778,10 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def && ! PURE_SLP_STMT (stmt_info)) ok = vectorizable_induction (phi, NULL, NULL, NULL); + else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) + && ! PURE_SLP_STMT (stmt_info)) + ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL); } if (ok && STMT_VINFO_LIVE_P (stmt_info)) @@ -1799,7 +1805,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo) { gimple *stmt = gsi_stmt (si); if (!gimple_clobber_p (stmt) - && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL)) + && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL)) return false; } } /* bbs */ @@ -2129,21 +2135,17 @@ start_over: goto again; } - min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) - * vectorization_factor) - 1); + min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) + * vectorization_factor); /* Use the cost model only if it is more conservative than user specified threshold. */ - th = (unsigned) min_scalar_loop_bound; - if (min_profitable_iters - && (!min_scalar_loop_bound - || min_profitable_iters > min_scalar_loop_bound)) - th = (unsigned) min_profitable_iters; + th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters); LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th; if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th) + && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -2162,7 +2164,7 @@ start_over: estimated_niter = max_niter; if (estimated_niter != -1 && ((unsigned HOST_WIDE_INT) estimated_niter - <= MAX (th, (unsigned)min_profitable_estimate))) + < MAX (th, (unsigned) min_profitable_estimate))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -2179,9 +2181,9 @@ start_over: /* Decide whether we need to create an epilogue loop to handle remaining scalar iterations. */ - th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1) - / LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - * LOOP_VINFO_VECT_FACTOR (loop_vinfo); + th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + / LOOP_VINFO_VECT_FACTOR (loop_vinfo)) + * LOOP_VINFO_VECT_FACTOR (loop_vinfo)); if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) @@ -2244,7 +2246,7 @@ start_over: /* Niters for at least one iteration of vectorized loop. */ niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); /* One additional iteration because of peeling for gap. 
*/ - if (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) niters_th++; if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th) LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th; @@ -2811,27 +2813,29 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, return NULL; } + if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))) + return NULL; + nloop_uses = 0; auto_vec<gphi *, 3> lcphis; - if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))) - FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) - { - gimple *use_stmt = USE_STMT (use_p); - if (is_gimple_debug (use_stmt)) - continue; - if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) - nloop_uses++; - else - /* We can have more than one loop-closed PHI. */ - lcphis.safe_push (as_a <gphi *> (use_stmt)); - if (nloop_uses > 1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "reduction used in loop.\n"); - return NULL; - } - } + FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) + nloop_uses++; + else + /* We can have more than one loop-closed PHI. */ + lcphis.safe_push (as_a <gphi *> (use_stmt)); + if (nloop_uses > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction used in loop.\n"); + return NULL; + } + } /* If DEF_STMT is a phi node itself, we expect it to have a single argument defined in the inner loop. */ @@ -2896,9 +2900,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, gimple instruction for the first simple tests and only do this if we're allowed to change code at all. */ if (code == MINUS_EXPR - && (op1 = gimple_assign_rhs1 (def_stmt)) - && TREE_CODE (op1) == SSA_NAME - && SSA_NAME_DEF_STMT (op1) == phi) + && ! ((op1 = gimple_assign_rhs2 (def_stmt)) + && TREE_CODE (op1) == SSA_NAME + && SSA_NAME_DEF_STMT (op1) == phi)) code = PLUS_EXPR; if (code == COND_EXPR) @@ -3149,6 +3153,7 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, /* Try to find SLP reduction chain. */ if (! nested_in_vect_loop && code != COND_EXPR + && orig_code != MINUS_EXPR && vect_is_slp_reduction (loop_info, phi, def_stmt)) { if (dump_enabled_p ()) @@ -3158,10 +3163,125 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi, return def_stmt; } + /* Dissolve group eventually half-built by vect_is_slp_reduction. */ + gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt)); + while (first) + { + gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)); + GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL; + GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL; + first = next; + } + + /* Look for the expression computing loop_arg from loop PHI result. */ + auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; + auto_bitmap visited; + tree lookfor = PHI_RESULT (phi); + ssa_op_iter curri; + use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi), + SSA_OP_USE); + while (USE_FROM_PTR (curr) != loop_arg) + curr = op_iter_next_use (&curri); + curri.i = curri.numops; + do + { + path.safe_push (std::make_pair (curri, curr)); + tree use = USE_FROM_PTR (curr); + if (use == lookfor) + break; + gimple *def = SSA_NAME_DEF_STMT (use); + if (gimple_nop_p (def) + || ! 
flow_bb_inside_loop_p (loop, gimple_bb (def))) + { +pop: + do + { + std::pair<ssa_op_iter, use_operand_p> x = path.pop (); + curri = x.first; + curr = x.second; + do + curr = op_iter_next_use (&curri); + /* Skip already visited or non-SSA operands (from iterating + over PHI args). */ + while (curr != NULL_USE_OPERAND_P + && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME + || ! bitmap_set_bit (visited, + SSA_NAME_VERSION + (USE_FROM_PTR (curr))))); + } + while (curr == NULL_USE_OPERAND_P && ! path.is_empty ()); + if (curr == NULL_USE_OPERAND_P) + break; + } + else + { + if (gimple_code (def) == GIMPLE_PHI) + curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE); + else + curr = op_iter_init_use (&curri, def, SSA_OP_USE); + while (curr != NULL_USE_OPERAND_P + && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME + || ! bitmap_set_bit (visited, + SSA_NAME_VERSION + (USE_FROM_PTR (curr))))) + curr = op_iter_next_use (&curri); + if (curr == NULL_USE_OPERAND_P) + goto pop; + } + } + while (1); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + dump_printf_loc (MSG_NOTE, vect_location, + "reduction path: "); + unsigned i; + std::pair<ssa_op_iter, use_operand_p> *x; + FOR_EACH_VEC_ELT (path, i, x) + { + dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second)); + dump_printf (MSG_NOTE, " "); + } + dump_printf (MSG_NOTE, "\n"); + } + + /* Check whether the reduction path detected is valid. */ + bool fail = path.length () == 0; + bool neg = false; + for (unsigned i = 1; i < path.length (); ++i) + { + gimple *use_stmt = USE_STMT (path[i].second); + tree op = USE_FROM_PTR (path[i].second); + if (! has_single_use (op) + || ! is_gimple_assign (use_stmt)) + { + fail = true; + break; + } + if (gimple_assign_rhs_code (use_stmt) != code) + { + if (code == PLUS_EXPR + && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) + { + /* Track whether we negate the reduction value each iteration. */ + if (gimple_assign_rhs2 (use_stmt) == op) + neg = ! neg; + } + else + { + fail = true; + break; + } + } + } + if (! fail && ! neg) + return def_stmt; + if (dump_enabled_p ()) - report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, - "reduction: unknown pattern: "); - + { + report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt, + "reduction: unknown pattern: "); + } + return NULL; } @@ -3183,6 +3303,8 @@ vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi, stmt_vec_info reduc_def_info = vinfo_for_stmt (phi); STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type; STMT_VINFO_REDUC_DEF (reduc_def_info) = def; + reduc_def_info = vinfo_for_stmt (def); + STMT_VINFO_REDUC_DEF (reduc_def_info) = phi; } return def; } @@ -3542,7 +3664,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost) { if (vec_outside_cost <= 0) - min_profitable_iters = 1; + min_profitable_iters = 0; else { min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf @@ -3580,13 +3702,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, " Calculated minimum iters for profitability: %d\n", min_profitable_iters); - min_profitable_iters = - min_profitable_iters < vf ? vf : min_profitable_iters; - - /* Because the condition we create is: - if (niters <= min_profitable_iters) - then skip the vectorized loop. */ - min_profitable_iters--; + /* We want the vectorized loop to execute at least once. 
*/ + if (min_profitable_iters < (vf + peel_iters_prologue + peel_iters_epilogue)) + min_profitable_iters = vf + peel_iters_prologue + peel_iters_epilogue; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -3603,7 +3721,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */ if (vec_outside_cost <= 0) - min_profitable_estimate = 1; + min_profitable_estimate = 0; else { min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf @@ -3612,7 +3730,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, / ((scalar_single_iter_cost * vf) - vec_inside_cost); } - min_profitable_estimate --; min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -3625,7 +3742,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET vector elements (not bits) for a vector of mode MODE. */ static void -calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset, +calc_vec_perm_mask_for_shift (machine_mode mode, unsigned int offset, unsigned char *sel) { unsigned int i, nelt = GET_MODE_NUNITS (mode); @@ -3638,7 +3755,7 @@ calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset, MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_ it supports vec_perm_const with masks for all necessary shift amounts. */ static bool -have_whole_vector_shift (enum machine_mode mode) +have_whole_vector_shift (machine_mode mode) { if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing) return true; @@ -3658,29 +3775,6 @@ have_whole_vector_shift (enum machine_mode mode) return true; } -/* Return the reduction operand (with index REDUC_INDEX) of STMT. */ - -static tree -get_reduction_op (gimple *stmt, int reduc_index) -{ - switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) - { - case GIMPLE_SINGLE_RHS: - gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) - == ternary_op); - return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index); - case GIMPLE_UNARY_RHS: - return gimple_assign_rhs1 (stmt); - case GIMPLE_BINARY_RHS: - return (reduc_index - ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt)); - case GIMPLE_TERNARY_RHS: - return gimple_op (stmt, reduc_index + 1); - default: - gcc_unreachable (); - } -} - /* TODO: Close dependency between vect_model_*_cost and vectorizable_* functions. Design better to avoid maintenance issues. */ @@ -4045,6 +4139,192 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, return init_def; } +/* Get at the initial defs for the reduction PHIs in SLP_NODE. + NUMBER_OF_VECTORS is the number of vector defs to create. 
*/ + +static void +get_initial_defs_for_reduction (slp_tree slp_node, + vec<tree> *vec_oprnds, + unsigned int number_of_vectors, + enum tree_code code, bool reduc_chain) +{ + vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node); + gimple *stmt = stmts[0]; + stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); + unsigned nunits; + tree vec_cst; + tree *elts; + unsigned j, number_of_places_left_in_vector; + tree vector_type, scalar_type; + tree vop; + int group_size = stmts.length (); + unsigned int vec_num, i; + unsigned number_of_copies = 1; + vec<tree> voprnds; + voprnds.create (number_of_vectors); + bool constant_p; + tree neutral_op = NULL; + struct loop *loop; + gimple_seq ctor_seq = NULL; + + vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); + scalar_type = TREE_TYPE (vector_type); + nunits = TYPE_VECTOR_SUBPARTS (vector_type); + + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); + + loop = (gimple_bb (stmt))->loop_father; + gcc_assert (loop); + + /* op is the reduction operand of the first stmt already. */ + /* For additional copies (see the explanation of NUMBER_OF_COPIES below) + we need either neutral operands or the original operands. See + get_initial_def_for_reduction() for details. */ + switch (code) + { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: + case SAD_EXPR: + case PLUS_EXPR: + case MINUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + neutral_op = build_zero_cst (scalar_type); + break; + + case MULT_EXPR: + neutral_op = build_one_cst (scalar_type); + break; + + case BIT_AND_EXPR: + neutral_op = build_all_ones_cst (scalar_type); + break; + + /* For MIN/MAX we don't have an easy neutral operand but + the initial values can be used fine here. Only for + a reduction chain we have to force a neutral element. */ + case MAX_EXPR: + case MIN_EXPR: + if (! reduc_chain) + neutral_op = NULL; + else + neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, + loop_preheader_edge (loop)); + break; + + default: + gcc_assert (! reduc_chain); + neutral_op = NULL; + } + + /* NUMBER_OF_COPIES is the number of times we need to use the same values in + created vectors. It is greater than 1 if unrolling is performed. + + For example, we have two scalar operands, s1 and s2 (e.g., group of + strided accesses of size two), while NUNITS is four (i.e., four scalars + of this type can be packed in a vector). The output vector will contain + two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES + will be 2). + + If GROUP_SIZE > NUNITS, the scalars will be split into several vectors + containing the operands. + + For example, NUNITS is four as before, and the group size is 8 + (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and + {s5, s6, s7, s8}. */ + + number_of_copies = nunits * number_of_vectors / group_size; + + number_of_places_left_in_vector = nunits; + constant_p = true; + elts = XALLOCAVEC (tree, nunits); + for (j = 0; j < number_of_copies; j++) + { + for (i = group_size - 1; stmts.iterate (i, &stmt); i--) + { + tree op; + /* Get the def before the loop. In reduction chain we have only + one initial value. */ + if ((j != (number_of_copies - 1) + || (reduc_chain && i != 0)) + && neutral_op) + op = neutral_op; + else + op = PHI_ARG_DEF_FROM_EDGE (stmt, + loop_preheader_edge (loop)); + + /* Create 'vect_ = {op0,op1,...,opn}'. 
*/ + number_of_places_left_in_vector--; + elts[number_of_places_left_in_vector] = op; + if (!CONSTANT_CLASS_P (op)) + constant_p = false; + + if (number_of_places_left_in_vector == 0) + { + if (constant_p) + vec_cst = build_vector (vector_type, elts); + else + { + vec<constructor_elt, va_gc> *v; + unsigned k; + vec_alloc (v, nunits); + for (k = 0; k < nunits; ++k) + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]); + vec_cst = build_constructor (vector_type, v); + } + tree init; + gimple_stmt_iterator gsi; + init = vect_init_vector (stmt, vec_cst, vector_type, NULL); + if (ctor_seq != NULL) + { + gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init)); + gsi_insert_seq_before_without_update (&gsi, ctor_seq, + GSI_SAME_STMT); + ctor_seq = NULL; + } + voprnds.quick_push (init); + + number_of_places_left_in_vector = nunits; + constant_p = true; + } + } + } + + /* Since the vectors are created in the reverse order, we should invert + them. */ + vec_num = voprnds.length (); + for (j = vec_num; j != 0; j--) + { + vop = voprnds[j - 1]; + vec_oprnds->quick_push (vop); + } + + voprnds.release (); + + /* In case that VF is greater than the unrolling factor needed for the SLP + group of stmts, NUMBER_OF_VECTORS to be created is greater than + NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have + to replicate the vectors. */ + while (number_of_vectors > vec_oprnds->length ()) + { + tree neutral_vec = NULL; + + if (neutral_op) + { + if (!neutral_vec) + neutral_vec = build_vector_from_val (vector_type, neutral_op); + + vec_oprnds->quick_push (neutral_vec); + } + else + { + for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++) + vec_oprnds->quick_push (vop); + } + } +} + + /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -4066,8 +4346,6 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. SLP_NODE is an SLP node containing a group of reduction statements. The first one in this group is STMT. - INDUCTION_INDEX is the index of the loop for condition reductions. - Otherwise it is undefined. This function: 1. 
Creates the reduction def-use cycles: sets the arguments for @@ -4110,10 +4388,12 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val, static void vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, + gimple *reduc_def_stmt, int ncopies, enum tree_code reduc_code, vec<gimple *> reduction_phis, - int reduc_index, bool double_reduc, - slp_tree slp_node, tree induction_index) + bool double_reduc, + slp_tree slp_node, + slp_instance slp_node_instance) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); stmt_vec_info prev_phi_info; @@ -4134,7 +4414,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, tree bitsize; tree adjustment_def = NULL; tree vec_initial_def = NULL; - tree reduction_op, expr, def, initial_def = NULL; + tree expr, def, initial_def = NULL; tree orig_name, scalar_result; imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; @@ -4151,6 +4431,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, bool slp_reduc = false; tree new_phi_result; gimple *inner_phi = NULL; + tree induction_index = NULL_TREE; if (slp_node) group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); @@ -4163,9 +4444,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, gcc_assert (!slp_node); } - reduction_op = get_reduction_op (stmt, reduc_index); - - vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); + vectype = STMT_VINFO_VECTYPE (stmt_info); gcc_assert (vectype); mode = TYPE_MODE (vectype); @@ -4189,14 +4468,19 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, /* Get the loop-entry arguments. */ enum vect_def_type initial_def_dt = vect_unknown_def_type; if (slp_node) - vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs, - NULL, slp_node, reduc_index); + { + unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); + vec_initial_defs.reserve (vec_num); + get_initial_defs_for_reduction (slp_node_instance->reduc_phis, + &vec_initial_defs, vec_num, code, + GROUP_FIRST_ELEMENT (stmt_info)); + } else { /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ - gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op); - initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt, + gimple *def_stmt; + initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, loop_preheader_edge (loop)); vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt); vec_initial_def = get_initial_def_for_reduction (stmt, initial_def, @@ -4264,6 +4548,95 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, } } + /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) + which is updated with the current index of the loop for every match of + the original loop's cond_expr (VEC_STMT). This results in a vector + containing the last time the condition passed for that vector lane. + The first match will be a 1 to allow 0 to be used for non-matching + indexes. If there are no matches at all then the vector will be all + zeroes. 
*/ + if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) + { + tree indx_before_incr, indx_after_incr; + int nunits_out = TYPE_VECTOR_SUBPARTS (vectype); + int k; + + gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); + gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); + + int scalar_precision + = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype))); + tree cr_index_scalar_type = make_unsigned_type (scalar_precision); + tree cr_index_vector_type = build_vector_type + (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype)); + + /* First we create a simple vector induction variable which starts + with the values {1,2,3,...} (SERIES_VECT) and increments by the + vector size (STEP). */ + + /* Create a {1,2,3,...} vector. */ + tree *vtemp = XALLOCAVEC (tree, nunits_out); + for (k = 0; k < nunits_out; ++k) + vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1); + tree series_vect = build_vector (cr_index_vector_type, vtemp); + + /* Create a vector of the step value. */ + tree step = build_int_cst (cr_index_scalar_type, nunits_out); + tree vec_step = build_vector_from_val (cr_index_vector_type, step); + + /* Create an induction variable. */ + gimple_stmt_iterator incr_gsi; + bool insert_after; + standard_iv_increment_position (loop, &incr_gsi, &insert_after); + create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, + insert_after, &indx_before_incr, &indx_after_incr); + + /* Next create a new phi node vector (NEW_PHI_TREE) which starts + filled with zeros (VEC_ZERO). */ + + /* Create a vector of 0s. */ + tree zero = build_zero_cst (cr_index_scalar_type); + tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); + + /* Create a vector phi node. */ + tree new_phi_tree = make_ssa_name (cr_index_vector_type); + new_phi = create_phi_node (new_phi_tree, loop->header); + set_vinfo_for_stmt (new_phi, + new_stmt_vec_info (new_phi, loop_vinfo)); + add_phi_arg (as_a <gphi *> (new_phi), vec_zero, + loop_preheader_edge (loop), UNKNOWN_LOCATION); + + /* Now take the condition from the loops original cond_expr + (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for + every match uses values from the induction variable + (INDEX_BEFORE_INCR) otherwise uses values from the phi node + (NEW_PHI_TREE). + Finally, we update the phi (NEW_PHI_TREE) to take the value of + the new cond_expr (INDEX_COND_EXPR). */ + + /* Duplicate the condition from vec_stmt. */ + tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt)); + + /* Create a conditional, where the condition is taken from vec_stmt + (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and + else is the phi (NEW_PHI_TREE). */ + tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, + ccompare, indx_before_incr, + new_phi_tree); + induction_index = make_ssa_name (cr_index_vector_type); + gimple *index_condition = gimple_build_assign (induction_index, + index_cond_expr); + gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); + stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition, + loop_vinfo); + STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; + set_vinfo_for_stmt (index_condition, index_vec_info); + + /* Update the phi with the vec cond. */ + add_phi_arg (as_a <gphi *> (new_phi), induction_index, + loop_latch_edge (loop), UNKNOWN_LOCATION); + } + /* 2. Create epilog code. The reduction epilog code operates across the elements of the vector of partial results computed by the vectorized loop. 
@@ -4414,20 +4787,17 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) { tree first_vect = PHI_RESULT (new_phis[0]); - tree tmp; gassign *new_vec_stmt = NULL; - vec_dest = vect_create_destination_var (scalar_dest, vectype); for (k = 1; k < new_phis.length (); k++) { gimple *next_phi = new_phis[k]; tree second_vect = PHI_RESULT (next_phi); - - tmp = build2 (code, vectype, first_vect, second_vect); - new_vec_stmt = gimple_build_assign (vec_dest, tmp); - first_vect = make_ssa_name (vec_dest, new_vec_stmt); - gimple_assign_set_lhs (new_vec_stmt, first_vect); + tree tem = make_ssa_name (vec_dest, new_vec_stmt); + new_vec_stmt = gimple_build_assign (tem, code, + first_vect, second_vect); gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); + first_vect = tem; } new_phi_result = first_vect; @@ -4437,6 +4807,28 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, new_phis.safe_push (new_vec_stmt); } } + /* Likewise if we couldn't use a single defuse cycle. */ + else if (ncopies > 1) + { + gcc_assert (new_phis.length () == 1); + tree first_vect = PHI_RESULT (new_phis[0]); + gassign *new_vec_stmt = NULL; + vec_dest = vect_create_destination_var (scalar_dest, vectype); + gimple *next_phi = new_phis[0]; + for (int k = 1; k < ncopies; ++k) + { + next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi)); + tree second_vect = PHI_RESULT (next_phi); + tree tem = make_ssa_name (vec_dest, new_vec_stmt); + new_vec_stmt = gimple_build_assign (tem, code, + first_vect, second_vect); + gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT); + first_vect = tem; + } + new_phi_result = first_vect; + new_phis.truncate (0); + new_phis.safe_push (new_vec_stmt); + } else new_phi_result = PHI_RESULT (new_phis[0]); @@ -4545,12 +4937,10 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, /* Convert the reduced value back to the result type and set as the result. */ - tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type, - data_reduc); - epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - gimple_assign_set_lhs (epilog_stmt, new_temp); - gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + gimple_seq stmts = NULL; + new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, + data_reduc); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); scalar_results.safe_push (new_temp); } else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION @@ -4615,6 +5005,11 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt, val = new_val; } } + /* Convert the reduced value back to the result type and set as the + result. 
*/ + gimple_seq stmts = NULL; + val = gimple_convert (&stmts, scalar_type, val); + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); scalar_results.safe_push (val); } @@ -5262,11 +5657,11 @@ is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop) bool vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, - gimple **vec_stmt, slp_tree slp_node) + gimple **vec_stmt, slp_tree slp_node, + slp_instance slp_node_instance) { tree vec_dest; tree scalar_dest; - tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); tree vectype_in = NULL_TREE; @@ -5279,23 +5674,20 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, tree new_temp = NULL_TREE; gimple *def_stmt; enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type; - gphi *new_phi = NULL; tree scalar_type; bool is_simple_use; gimple *orig_stmt; - stmt_vec_info orig_stmt_info; - tree expr = NULL_TREE; + stmt_vec_info orig_stmt_info = NULL; int i; int ncopies; int epilog_copies; stmt_vec_info prev_stmt_info, prev_phi_info; bool single_defuse_cycle = false; - tree reduc_def = NULL_TREE; gimple *new_stmt = NULL; int j; tree ops[3]; + enum vect_def_type dts[3]; bool nested_cycle = false, found_nested_cycle_def = false; - gimple *reduc_def_stmt = NULL; bool double_reduc = false; basic_block def_bb; struct loop * def_stmt_loop, *outer_loop = NULL; @@ -5303,14 +5695,27 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, gimple *def_arg_stmt; auto_vec<tree> vec_oprnds0; auto_vec<tree> vec_oprnds1; + auto_vec<tree> vec_oprnds2; auto_vec<tree> vect_defs; auto_vec<gimple *> phis; int vec_num; - tree def0, def1, tem, op1 = NULL_TREE; + tree def0, tem; bool first_p = true; tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; tree cond_reduc_val = NULL_TREE; + /* Make sure it was already recognized as a reduction computation. */ + if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def + && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle) + return false; + + if (nested_in_vect_loop_p (loop, stmt)) + { + outer_loop = loop; + loop = loop->inner; + nested_cycle = true; + } + /* In case of reduction chain we switch to the first stmt in the chain, but we don't update STMT_INFO, since only the last stmt is marked as reduction and has reduction properties. */ @@ -5321,11 +5726,97 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, first_p = false; } - if (nested_in_vect_loop_p (loop, stmt)) + if (gimple_code (stmt) == GIMPLE_PHI) { - outer_loop = loop; - loop = loop->inner; - nested_cycle = true; + /* Analysis is fully done on the reduction stmt invocation. */ + if (! vec_stmt) + { + if (slp_node) + slp_node_instance->reduc_phis = slp_node; + + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; + } + + gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info); + if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt))) + reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt)); + + gcc_assert (is_gimple_assign (reduc_stmt)); + for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k) + { + tree op = gimple_op (reduc_stmt, k); + if (op == gimple_phi_result (stmt)) + continue; + if (k == 1 + && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR) + continue; + tem = get_vectype_for_scalar_type (TREE_TYPE (op)); + if (! 
vectype_in + || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in)) + vectype_in = tem; + break; + } + gcc_assert (vectype_in); + + if (slp_node) + ncopies = 1; + else + ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) + / TYPE_VECTOR_SUBPARTS (vectype_in)); + + use_operand_p use_p; + gimple *use_stmt; + if (ncopies > 1 + && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt)) + <= vect_used_only_live) + && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt) + && (use_stmt == reduc_stmt + || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) + == reduc_stmt))) + single_defuse_cycle = true; + + /* Create the destination vector */ + scalar_dest = gimple_assign_lhs (reduc_stmt); + vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + + if (slp_node) + /* The size vect_schedule_slp_instance computes is off for us. */ + vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo) + * SLP_TREE_SCALAR_STMTS (slp_node).length ()) + / TYPE_VECTOR_SUBPARTS (vectype_in)); + else + vec_num = 1; + + /* Generate the reduction PHIs upfront. */ + prev_phi_info = NULL; + for (j = 0; j < ncopies; j++) + { + if (j == 0 || !single_defuse_cycle) + { + for (i = 0; i < vec_num; i++) + { + /* Create the reduction-phi that defines the reduction + operand. */ + gimple *new_phi = create_phi_node (vec_dest, loop->header); + set_vinfo_for_stmt (new_phi, + new_stmt_vec_info (new_phi, loop_vinfo)); + + if (slp_node) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi); + else + { + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi; + else + STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; + prev_phi_info = vinfo_for_stmt (new_phi); + } + } + } + } + + return true; } /* 1. Is vectorizable reduction? */ @@ -5341,11 +5832,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, && !STMT_VINFO_LIVE_P (stmt_info)) return false; - /* Make sure it was already recognized as a reduction computation. */ - if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def - && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle) - return false; - /* 2. Has this been recognized as a reduction pattern? Check if STMT represents a pattern that has been recognized @@ -5393,10 +5879,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, default: gcc_unreachable (); } - /* The default is that the reduction variable is the last in statement. */ - int reduc_index = op_type - 1; - if (code == MINUS_EXPR) - reduc_index = 0; if (code == COND_EXPR && slp_node) return false; @@ -5416,20 +5898,29 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, The last use is the reduction variable. In case of nested cycle this assumption is not true: we use reduc_index to record the index of the reduction variable. */ + gimple *reduc_def_stmt = NULL; + int reduc_index = -1; for (i = 0; i < op_type; i++) { - if (i == reduc_index) - continue; - /* The condition of COND_EXPR is checked in vectorizable_condition(). 
*/ if (i == 0 && code == COND_EXPR) continue; is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, - &def_stmt, &dt, &tem); - if (!vectype_in) - vectype_in = tem; + &def_stmt, &dts[i], &tem); + dt = dts[i]; gcc_assert (is_simple_use); + if (dt == vect_reduction_def) + { + reduc_def_stmt = def_stmt; + reduc_index = i; + continue; + } + else + { + if (!vectype_in) + vectype_in = tem; + } if (dt != vect_internal_def && dt != vect_external_def @@ -5459,21 +5950,29 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, } } - is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo, - &def_stmt, &dt, &tem); if (!vectype_in) - vectype_in = tem; - gcc_assert (is_simple_use); - if (!found_nested_cycle_def) - reduc_def_stmt = def_stmt; + vectype_in = vectype_out; - if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI) + /* When vectorizing a reduction chain w/o SLP the reduction PHI is not + directy used in stmt. */ + if (reduc_index == -1) + { + if (orig_stmt) + reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info); + else + reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info); + } + + if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI) return false; - if (!(dt == vect_reduction_def - || dt == vect_nested_cycle - || ((dt == vect_internal_def || dt == vect_external_def - || dt == vect_constant_def || dt == vect_induction_def) + if (!(reduc_index == -1 + || dts[reduc_index] == vect_reduction_def + || dts[reduc_index] == vect_nested_cycle + || ((dts[reduc_index] == vect_internal_def + || dts[reduc_index] == vect_external_def + || dts[reduc_index] == vect_constant_def + || dts[reduc_index] == vect_induction_def) && nested_cycle && found_nested_cycle_def))) { /* For pattern recognized stmts, orig_stmt might be a reduction, @@ -5528,7 +6027,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, && types_compatible_p (TREE_TYPE (cond_initial_val), TREE_TYPE (cond_reduc_val))) { - tree e = fold_build2 (LE_EXPR, boolean_type_node, + tree e = fold_binary (LE_EXPR, boolean_type_node, cond_initial_val, cond_reduc_val); if (e && (integer_onep (e) || integer_zerop (e))) { @@ -5826,26 +6325,6 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, } } - if (!vec_stmt) /* transformation not required. */ - { - if (first_p) - vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies); - STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; - return true; - } - - /* Transform. */ - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); - - /* FORNOW: Multiple types are not supported for condition. */ - if (code == COND_EXPR) - gcc_assert (ncopies == 1); - - /* Create the destination vector */ - vec_dest = vect_create_destination_var (scalar_dest, vectype_out); - /* In case the vectorization factor (VF) is bigger than the number of elements that we can fit in a vectype (nunits), we have to generate more than one vector stmt - i.e - we need to "unroll" the @@ -5871,9 +6350,17 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, (i.e. we generate VF/2 results in a single register). In this case for each copy we get the vector def for the reduction variable from the vectorized reduction operation generated in the previous iteration. - */ - if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) + This only works when we see both the reduction PHI and its only consumer + in vectorizable_reduction and there are no intermediate stmts + participating. 
*/ + use_operand_p use_p; + gimple *use_stmt; + if (ncopies > 1 + && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) + && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt) + && (use_stmt == stmt + || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt)) { single_defuse_cycle = true; epilog_copies = 1; @@ -5881,6 +6368,41 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, else epilog_copies = ncopies; + /* If the reduction stmt is one of the patterns that have lane + reduction embedded we cannot handle the case of ! single_defuse_cycle. */ + if ((ncopies > 1 + && ! single_defuse_cycle) + && (code == DOT_PROD_EXPR + || code == WIDEN_SUM_EXPR + || code == SAD_EXPR)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "multi def-use cycle not possible for lane-reducing " + "reduction operation\n"); + return false; + } + + if (!vec_stmt) /* transformation not required. */ + { + if (first_p) + vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies); + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; + } + + /* Transform. */ + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n"); + + /* FORNOW: Multiple types are not supported for condition. */ + if (code == COND_EXPR) + gcc_assert (ncopies == 1); + + /* Create the destination vector */ + vec_dest = vect_create_destination_var (scalar_dest, vectype_out); + prev_stmt_info = NULL; prev_phi_info = NULL; if (slp_node) @@ -5889,8 +6411,9 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, { vec_num = 1; vec_oprnds0.create (1); + vec_oprnds1.create (1); if (op_type == ternary_op) - vec_oprnds1.create (1); + vec_oprnds2.create (1); } phis.create (vec_num); @@ -5898,22 +6421,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, if (!slp_node) vect_defs.quick_push (NULL_TREE); + if (slp_node) + phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis)); + else + phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt))); + for (j = 0; j < ncopies; j++) { - if (j == 0 || !single_defuse_cycle) - { - for (i = 0; i < vec_num; i++) - { - /* Create the reduction-phi that defines the reduction - operand. */ - new_phi = create_phi_node (vec_dest, loop->header); - set_vinfo_for_stmt (new_phi, - new_stmt_vec_info (new_phi, loop_vinfo)); - if (j == 0 || slp_node) - phis.quick_push (new_phi); - } - } - if (code == COND_EXPR) { gcc_assert (!slp_node); @@ -5934,96 +6448,70 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, auto_vec<tree, 3> slp_ops; auto_vec<vec<tree>, 3> vec_defs; - slp_ops.quick_push (reduc_index == 0 ? NULL : ops[0]); - slp_ops.quick_push (reduc_index == 1 ? NULL : ops[1]); + slp_ops.quick_push (ops[0]); + slp_ops.quick_push (ops[1]); if (op_type == ternary_op) - slp_ops.quick_push (reduc_index == 2 ? NULL : ops[2]); + slp_ops.quick_push (ops[2]); - vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1); + vect_get_slp_defs (slp_ops, slp_node, &vec_defs); - vec_oprnds0.safe_splice (vec_defs[reduc_index == 0 ? 1 : 0]); - vec_defs[reduc_index == 0 ? 1 : 0].release (); + vec_oprnds0.safe_splice (vec_defs[0]); + vec_defs[0].release (); + vec_oprnds1.safe_splice (vec_defs[1]); + vec_defs[1].release (); if (op_type == ternary_op) { - vec_oprnds1.safe_splice (vec_defs[reduc_index == 2 ? 1 : 2]); - vec_defs[reduc_index == 2 ? 
1 : 2].release (); + vec_oprnds2.safe_splice (vec_defs[2]); + vec_defs[2].release (); } } else { - loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], - stmt); - vec_oprnds0.quick_push (loop_vec_def0); + vec_oprnds0.quick_push + (vect_get_vec_def_for_operand (ops[0], stmt)); + vec_oprnds1.quick_push + (vect_get_vec_def_for_operand (ops[1], stmt)); if (op_type == ternary_op) - { - op1 = reduc_index == 0 ? ops[2] : ops[1]; - loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt); - vec_oprnds1.quick_push (loop_vec_def1); - } + vec_oprnds2.quick_push + (vect_get_vec_def_for_operand (ops[2], stmt)); } } else { if (!slp_node) { - enum vect_def_type dt; - gimple *dummy_stmt; - - vect_is_simple_use (ops[!reduc_index], loop_vinfo, - &dummy_stmt, &dt); - loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, - loop_vec_def0); - vec_oprnds0[0] = loop_vec_def0; - if (op_type == ternary_op) - { - vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt); - loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, - loop_vec_def1); - vec_oprnds1[0] = loop_vec_def1; - } - } - - if (single_defuse_cycle) - reduc_def = gimple_assign_lhs (new_stmt); + gcc_assert (reduc_index != -1 || ! single_defuse_cycle); - STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; + if (single_defuse_cycle && reduc_index == 0) + vec_oprnds0[0] = gimple_assign_lhs (new_stmt); + else + vec_oprnds0[0] + = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]); + if (single_defuse_cycle && reduc_index == 1) + vec_oprnds1[0] = gimple_assign_lhs (new_stmt); + else + vec_oprnds1[0] + = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]); + if (op_type == ternary_op) + { + if (single_defuse_cycle && reduc_index == 2) + vec_oprnds2[0] = gimple_assign_lhs (new_stmt); + else + vec_oprnds2[0] + = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]); + } + } } FOR_EACH_VEC_ELT (vec_oprnds0, i, def0) { - if (slp_node) - reduc_def = PHI_RESULT (phis[i]); - else - { - if (!single_defuse_cycle || j == 0) - reduc_def = PHI_RESULT (new_phi); - } - - def1 = ((op_type == ternary_op) - ? vec_oprnds1[i] : NULL); - if (op_type == binary_op) - { - if (reduc_index == 0) - expr = build2 (code, vectype_out, reduc_def, def0); - else - expr = build2 (code, vectype_out, def0, reduc_def); - } - else - { - if (reduc_index == 0) - expr = build3 (code, vectype_out, reduc_def, def0, def1); - else - { - if (reduc_index == 1) - expr = build3 (code, vectype_out, def0, reduc_def, def1); - else - expr = build3 (code, vectype_out, def0, def1, reduc_def); - } - } + tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE }; + if (op_type == ternary_op) + vop[2] = vec_oprnds2[i]; - new_stmt = gimple_build_assign (vec_dest, expr); new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); + new_stmt = gimple_build_assign (new_temp, code, + vop[0], vop[1], vop[2]); vect_finish_stmt_generation (stmt, new_stmt, gsi); if (slp_node) @@ -6044,103 +6532,17 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi, STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; prev_stmt_info = vinfo_for_stmt (new_stmt); - prev_phi_info = vinfo_for_stmt (new_phi); } - tree indx_before_incr, indx_after_incr, cond_name = NULL; - /* Finalize the reduction-phi (set its arguments) and create the epilog reduction code. 
*/ if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node) - { - new_temp = gimple_assign_lhs (*vec_stmt); - vect_defs[0] = new_temp; - - /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) - which is updated with the current index of the loop for every match of - the original loop's cond_expr (VEC_STMT). This results in a vector - containing the last time the condition passed for that vector lane. - The first match will be a 1 to allow 0 to be used for non-matching - indexes. If there are no matches at all then the vector will be all - zeroes. */ - if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION) - { - int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); - int k; - - gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR); - - /* First we create a simple vector induction variable which starts - with the values {1,2,3,...} (SERIES_VECT) and increments by the - vector size (STEP). */ - - /* Create a {1,2,3,...} vector. */ - tree *vtemp = XALLOCAVEC (tree, nunits_out); - for (k = 0; k < nunits_out; ++k) - vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1); - tree series_vect = build_vector (cr_index_vector_type, vtemp); - - /* Create a vector of the step value. */ - tree step = build_int_cst (cr_index_scalar_type, nunits_out); - tree vec_step = build_vector_from_val (cr_index_vector_type, step); - - /* Create an induction variable. */ - gimple_stmt_iterator incr_gsi; - bool insert_after; - standard_iv_increment_position (loop, &incr_gsi, &insert_after); - create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, - insert_after, &indx_before_incr, &indx_after_incr); - - /* Next create a new phi node vector (NEW_PHI_TREE) which starts - filled with zeros (VEC_ZERO). */ - - /* Create a vector of 0s. */ - tree zero = build_zero_cst (cr_index_scalar_type); - tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); - - /* Create a vector phi node. */ - tree new_phi_tree = make_ssa_name (cr_index_vector_type); - new_phi = create_phi_node (new_phi_tree, loop->header); - set_vinfo_for_stmt (new_phi, - new_stmt_vec_info (new_phi, loop_vinfo)); - add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop), - UNKNOWN_LOCATION); - - /* Now take the condition from the loops original cond_expr - (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for - every match uses values from the induction variable - (INDEX_BEFORE_INCR) otherwise uses values from the phi node - (NEW_PHI_TREE). - Finally, we update the phi (NEW_PHI_TREE) to take the value of - the new cond_expr (INDEX_COND_EXPR). */ - - /* Duplicate the condition from vec_stmt. */ - tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt)); - - /* Create a conditional, where the condition is taken from vec_stmt - (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and - else is the phi (NEW_PHI_TREE). */ - tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type, - ccompare, indx_before_incr, - new_phi_tree); - cond_name = make_ssa_name (cr_index_vector_type); - gimple *index_condition = gimple_build_assign (cond_name, - index_cond_expr); - gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT); - stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition, - loop_vinfo); - STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; - set_vinfo_for_stmt (index_condition, index_vec_info); - - /* Update the phi with the vec cond. 
*/ - add_phi_arg (new_phi, cond_name, loop_latch_edge (loop), - UNKNOWN_LOCATION); - } - } + vect_defs[0] = gimple_assign_lhs (*vec_stmt); - vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies, - epilog_reduc_code, phis, reduc_index, - double_reduc, slp_node, cond_name); + vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt, + epilog_copies, + epilog_reduc_code, phis, + double_reduc, slp_node, slp_node_instance); return true; } @@ -6912,28 +7314,28 @@ scale_profile_for_vect_loop (struct loop *loop, unsigned vf) } if (freq_h > 0) { - gcov_type scale; + profile_probability p; /* Avoid dropping loop body profile counter to 0 because of zero count in loop's preheader. */ if (!(freq_e > profile_count::from_gcov_type (1))) freq_e = profile_count::from_gcov_type (1); - /* This should not overflow. */ - scale = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); - scale_loop_frequencies (loop, scale, REG_BR_PROB_BASE); + p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h); + scale_loop_frequencies (loop, p); } basic_block exit_bb = single_pred (loop->latch); edge exit_e = single_exit (loop); exit_e->count = loop_preheader_edge (loop)->count; - exit_e->probability = REG_BR_PROB_BASE / (new_est_niter + 1); + exit_e->probability = profile_probability::always () + .apply_scale (1, new_est_niter + 1); edge exit_l = single_pred_edge (loop->latch); - int prob = exit_l->probability; - exit_l->probability = REG_BR_PROB_BASE - exit_e->probability; + profile_probability prob = exit_l->probability; + exit_l->probability = exit_e->probability.invert (); exit_l->count = exit_bb->count - exit_e->count; - if (prob > 0) - scale_bbs_frequencies_int (&loop->latch, 1, exit_l->probability, prob); + if (prob.initialized_p () && exit_l->probability.initialized_p ()) + scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob); } /* Function vect_transform_loop. @@ -6971,7 +7373,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) run at least the vectorization factor number of times checking is pointless, too. */ th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); - if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 + if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { if (dump_enabled_p ()) @@ -7077,7 +7479,9 @@ vect_transform_loop (loop_vec_info loop_vinfo) && dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def + if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) && ! PURE_SLP_STMT (stmt_info)) { if (dump_enabled_p ()) @@ -7456,9 +7860,9 @@ optimize_mask_stores (struct loop *loop) e->flags = EDGE_TRUE_VALUE; efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE); /* Put STORE_BB to likely part. */ - efalse->probability = PROB_UNLIKELY; + efalse->probability = profile_probability::unlikely (); store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse); - make_edge (store_bb, join_bb, EDGE_FALLTHRU); + make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU); if (dom_info_available_p (CDI_DOMINATORS)) set_immediate_dominator (CDI_DOMINATORS, store_bb, bb); if (dump_enabled_p ()) |
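A closing note on the cost-model adjustments visible in this patch: LOOP_VINFO_COST_MODEL_THRESHOLD now holds the smallest iteration count for which the vector loop should be entered, so the known-niters and estimated-niters guards compare with '<' on the unbiased value instead of '<=' on a threshold that had been pre-decremented. A simplified, illustrative restatement of that guard (names and types are not GCC's, and this is not code from the patch):

/* Hypothetical sketch of the new guard semantics: TH is the minimum
   profitable iteration count, so vectorization is rejected only when the
   iteration count is strictly below the larger of TH and the static
   profitability estimate.  The previous code stored TH - 1 and used '<='.  */
static int
rejected_by_cost_model (long niters, long th, long min_profitable_estimate)
{
  long limit = th > min_profitable_estimate ? th : min_profitable_estimate;
  return niters < limit;
}

In the same spirit, min_profitable_iters is no longer decremented and is clamped to at least the vectorization factor plus the prologue and epilogue peel iterations, so the vectorized loop body executes at least once whenever the guard passes.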