Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog        |  30
-rw-r--r-- | gcc/config/c6x/c6x.c |   2
-rw-r--r-- | gcc/doc/invoke.texi  |   5
-rw-r--r-- | gcc/haifa-sched.c    | 338
-rw-r--r-- | gcc/params.def       |   7
-rw-r--r-- | gcc/sched-int.h      |   6
6 files changed, 356 insertions, 32 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 75303db9860..483b3641f45 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,33 @@ +2011-09-30 Bernd Schmidt <bernds@codesourcery.com> + + * haifa-sched.c (modulo_ii, modulo_max_states, modulo_n_insns, + modulo_insns_scheduled, modulo_iter0_max_uid, modulo_backtracks_left, + modulo_last_stage): New static variables. + (set_modulo_params, discard_delay_pairs_above): New functions. + (struct delay_pair): New member stages. + (htab_i2_traverse, htab_i1_traverse): New static functions. + (record_delay_slot_pair): New arg stages. All callers changed. + Record it. + (pair_delay): Take stages into account. + (add_delay_dependencies): Don't do so for stage pairs. + (struct sched_block_state): New member modulo_epilogue. + (save_backtrack_point): Don't set SHADOW_P for stage pairs. + (unschedule_insns_until): Decrease modulo_insns_scheduled. + Set HARD_DEP without using or. + (resolve_dependencies): New static function. + (prune_ready_list): New arg modulo_epilogue_p. All callers changed. + If it is true, allow only insns with INSN_EXACT_TICK set. + (schedule_block): Return bool, always true for normal scheduling, + true or false depending on modulo scheduling success otherwise. + Add bookkeeping for modulo scheduling, and call resolve_dependencies + on everything left over after a modulo schedule. + (haifa_sched_init): Remove check_cfg call. Clear modulo_ii. + * sched-int.h (schedule_block, record_delay_slot_pair): Adjust + declarations. + (set_modulo_params, discard_delay_pairs_above): Declare. + * params.def (PARAM_MAX_MODULO_BACKTRACK_ATTEMPS): New. + * doc/invoke.texi (--param): Document it. + 2011-09-30 Richard Guenther <rguenther@suse.de> PR middle-end/50574 diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c index f2ab0b88aa9..98c65b56cc8 100644 --- a/gcc/config/c6x/c6x.c +++ b/gcc/config/c6x/c6x.c @@ -4810,7 +4810,7 @@ split_delayed_branch (rtx insn) i1 = emit_insn_before (pat, insn); PATTERN (insn) = newpat; INSN_CODE (insn) = -1; - record_delay_slot_pair (i1, insn, 5); + record_delay_slot_pair (i1, insn, 5, 0); } /* Split every insn (i.e. jumps and calls) which can have delay slots into diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 0ce15ff4ebc..8e43f823583 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -8474,6 +8474,11 @@ before flushing the current state and starting over. Large functions with few branches or calls can create excessively large lists which needlessly consume memory and resources. +@item max-modulo-backtrack-attempts +The maximum number of backtrack attempts the scheduler should make +when modulo scheduling a loop. Larger values can exponentially increase +compile time. + @item max-inline-insns-single Several parameters control the tree inliner used in gcc. This number sets the maximum number of instructions (counted in GCC's diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index 34a692fd712..a5b4aebefee 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -163,6 +163,31 @@ int issue_rate; enable a DCE pass. */ bool sched_no_dce; +/* The current initiation interval used when modulo scheduling. */ +static int modulo_ii; + +/* The maximum number of stages we are prepared to handle. */ +static int modulo_max_stages; + +/* The number of insns that exist in each iteration of the loop. We use this + to detect when we've scheduled all insns from the first iteration. 
*/ +static int modulo_n_insns; + +/* The current count of insns in the first iteration of the loop that have + already been scheduled. */ +static int modulo_insns_scheduled; + +/* The maximum uid of insns from the first iteration of the loop. */ +static int modulo_iter0_max_uid; + +/* The number of times we should attempt to backtrack when modulo scheduling. + Decreased each time we have to backtrack. */ +static int modulo_backtracks_left; + +/* The stage in which the last insn from the original loop was + scheduled. */ +static int modulo_last_stage; + /* sched-verbose controls the amount of debugging output the scheduler prints. It is controlled by -fsched-verbose=N: N>0 and no -DSR : the output is directed to stderr. @@ -507,6 +532,29 @@ haifa_classify_insn (const_rtx insn) { return haifa_classify_rtx (PATTERN (insn)); } + +/* After the scheduler initialization function has been called, this function + can be called to enable modulo scheduling. II is the initiation interval + we should use, it affects the delays for delay_pairs that were recorded as + separated by a given number of stages. + + MAX_STAGES provides us with a limit + after which we give up scheduling; the caller must have unrolled at least + as many copies of the loop body and recorded delay_pairs for them. + + INSNS is the number of real (non-debug) insns in one iteration of + the loop. MAX_UID can be used to test whether an insn belongs to + the first iteration of the loop; all of them have a uid lower than + MAX_UID. */ +void +set_modulo_params (int ii, int max_stages, int insns, int max_uid) +{ + modulo_ii = ii; + modulo_max_stages = max_stages; + modulo_n_insns = insns; + modulo_iter0_max_uid = max_uid; + modulo_backtracks_left = PARAM_VALUE (PARAM_MAX_MODULO_BACKTRACK_ATTEMPTS); +} /* A structure to record a pair of insns where the first one is a real insn that has delay slots, and the second is its delayed shadow. @@ -518,6 +566,10 @@ struct delay_pair struct delay_pair *next_same_i1; rtx i1, i2; int cycles; + /* When doing modulo scheduling, we a delay_pair can also be used to + show that I1 and I2 are the same insn in a different stage. If that + is the case, STAGES will be nonzero. */ + int stages; }; /* Two hash tables to record delay_pairs, one indexed by I1 and the other @@ -525,6 +577,62 @@ struct delay_pair static htab_t delay_htab; static htab_t delay_htab_i2; +/* Called through htab_traverse. Walk the hashtable using I2 as + index, and delete all elements involving an UID higher than + that pointed to by *DATA. */ +static int +htab_i2_traverse (void **slot, void *data) +{ + int maxuid = *(int *)data; + struct delay_pair *p = *(struct delay_pair **)slot; + if (INSN_UID (p->i2) >= maxuid || INSN_UID (p->i1) >= maxuid) + { + htab_clear_slot (delay_htab_i2, slot); + } + return 1; +} + +/* Called through htab_traverse. Walk the hashtable using I2 as + index, and delete all elements involving an UID higher than + that pointed to by *DATA. 
*/ +static int +htab_i1_traverse (void **slot, void *data) +{ + int maxuid = *(int *)data; + struct delay_pair **pslot = (struct delay_pair **)slot; + struct delay_pair *p, *first, **pprev; + + if (INSN_UID ((*pslot)->i1) >= maxuid) + { + htab_clear_slot (delay_htab, slot); + return 1; + } + pprev = &first; + for (p = *pslot; p; p = p->next_same_i1) + { + if (INSN_UID (p->i2) < maxuid) + { + *pprev = p; + pprev = &p->next_same_i1; + } + } + *pprev = NULL; + if (first == NULL) + htab_clear_slot (delay_htab, slot); + else + *pslot = first; + return 1; +} + +/* Discard all delay pairs which involve an insn with an UID higher + than MAX_UID. */ +void +discard_delay_pairs_above (int max_uid) +{ + htab_traverse (delay_htab, htab_i1_traverse, &max_uid); + htab_traverse (delay_htab_i2, htab_i2_traverse, &max_uid); +} + /* Returns a hash value for X (which really is a delay_pair), based on hashing just I1. */ static hashval_t @@ -555,18 +663,24 @@ delay_i2_eq (const void *x, const void *y) return ((const struct delay_pair *) x)->i2 == y; } -/* This function can be called by a port just before it starts the - final scheduling pass. It records the fact that an instruction - with delay slots has been split into two insns, I1 and I2. The - first one will be scheduled normally and initiates the operation. - The second one is a shadow which must follow a specific number of - CYCLES after I1; its only purpose is to show the side effect that - occurs at that cycle in the RTL. If a JUMP_INSN or a CALL_INSN has - been split, I1 should be a normal INSN, while I2 retains the - original insn type. */ +/* This function can be called by a port just before it starts the final + scheduling pass. It records the fact that an instruction with delay + slots has been split into two insns, I1 and I2. The first one will be + scheduled normally and initiates the operation. The second one is a + shadow which must follow a specific number of cycles after I1; its only + purpose is to show the side effect that occurs at that cycle in the RTL. + If a JUMP_INSN or a CALL_INSN has been split, I1 should be a normal INSN, + while I2 retains the original insn type. + + There are two ways in which the number of cycles can be specified, + involving the CYCLES and STAGES arguments to this function. If STAGES + is zero, we just use the value of CYCLES. Otherwise, STAGES is a factor + which is multiplied by MODULO_II to give the number of cycles. This is + only useful if the caller also calls set_modulo_params to enable modulo + scheduling. 
*/ void -record_delay_slot_pair (rtx i1, rtx i2, int cycles) +record_delay_slot_pair (rtx i1, rtx i2, int cycles, int stages) { struct delay_pair *p = XNEW (struct delay_pair); struct delay_pair **slot; @@ -574,6 +688,7 @@ record_delay_slot_pair (rtx i1, rtx i2, int cycles) p->i1 = i1; p->i2 = i2; p->cycles = cycles; + p->stages = stages; if (!delay_htab) { @@ -596,7 +711,10 @@ record_delay_slot_pair (rtx i1, rtx i2, int cycles) static int pair_delay (struct delay_pair *p) { - return p->cycles; + if (p->stages == 0) + return p->cycles; + else + return p->stages * modulo_ii; } /* Given an insn INSN, add a dependence on its delayed shadow if it @@ -619,6 +737,8 @@ add_delay_dependencies (rtx insn) if (!pair) return; add_dependence (insn, pair->i1, REG_DEP_ANTI); + if (pair->stages) + return; FOR_EACH_DEP (pair->i2, SD_LIST_BACK, sd_it, dep) { @@ -626,7 +746,7 @@ add_delay_dependencies (rtx insn) struct delay_pair *other_pair = (struct delay_pair *)htab_find_with_hash (delay_htab_i2, pro, htab_hash_pointer (pro)); - if (!other_pair) + if (!other_pair || other_pair->stages) continue; if (pair_delay (other_pair) >= pair_delay (pair)) { @@ -1851,6 +1971,9 @@ struct sched_block_state /* True if a shadow insn has been scheduled in the current cycle, which means that no more normal insns can be issued. */ bool shadows_only_p; + /* True if we're winding down a modulo schedule, which means that we only + issue insns with INSN_EXACT_TICK set. */ + bool modulo_epilogue; /* Initialized with the machine's issue rate every cycle, and updated by calls to the variable_issue hook. */ int can_issue_more; @@ -2223,7 +2346,7 @@ save_backtrack_point (struct delay_pair *pair, mark_backtrack_feeds (pair->i2, 1); INSN_TICK (pair->i2) = INVALID_TICK; INSN_EXACT_TICK (pair->i2) = clock_var + pair_delay (pair); - SHADOW_P (pair->i2) = true; + SHADOW_P (pair->i2) = pair->stages == 0; pair = pair->next_same_i1; } } @@ -2249,11 +2372,14 @@ unschedule_insns_until (rtx insn) if (last != insn) INSN_TICK (last) = INVALID_TICK; + if (modulo_ii > 0 && INSN_UID (last) < modulo_iter0_max_uid) + modulo_insns_scheduled--; + for (sd_it = sd_iterator_start (last, SD_LIST_RES_FORW); sd_iterator_cond (&sd_it, &dep);) { rtx con = DEP_CON (dep); - TODO_SPEC (con) |= HARD_DEP; + TODO_SPEC (con) = HARD_DEP; INSN_TICK (con) = INVALID_TICK; sd_unresolve_dep (sd_it); } @@ -2467,6 +2593,56 @@ estimate_shadow_tick (struct delay_pair *p) return 0; } +/* If INSN has no unresolved backwards dependencies, add it to the schedule and + recursively resolve all its forward dependencies. */ +static void +resolve_dependencies (rtx insn) +{ + sd_iterator_def sd_it; + dep_t dep; + + /* Don't use sd_lists_empty_p; it ignores debug insns. */ + if (DEPS_LIST_FIRST (INSN_HARD_BACK_DEPS (insn)) != NULL + || DEPS_LIST_FIRST (INSN_SPEC_BACK_DEPS (insn)) != NULL) + return; + + if (sched_verbose >= 4) + fprintf (sched_dump, ";;\tquickly resolving %d\n", INSN_UID (insn)); + + if (QUEUE_INDEX (insn) >= 0) + queue_remove (insn); + + VEC_safe_push (rtx, heap, scheduled_insns, insn); + + /* Update dependent instructions. */ + for (sd_it = sd_iterator_start (insn, SD_LIST_FORW); + sd_iterator_cond (&sd_it, &dep);) + { + rtx next = DEP_CON (dep); + + if (sched_verbose >= 4) + fprintf (sched_dump, ";;\t\tdep %d against %d\n", INSN_UID (insn), + INSN_UID (next)); + + /* Resolve the dependence between INSN and NEXT. + sd_resolve_dep () moves current dep to another list thus + advancing the iterator. 
*/ + sd_resolve_dep (sd_it); + + if (!IS_SPECULATION_BRANCHY_CHECK_P (insn)) + { + resolve_dependencies (next); + } + else + /* Check always has only one forward dependence (to the first insn in + the recovery block), therefore, this will be executed only once. */ + { + gcc_assert (sd_lists_empty_p (insn, SD_LIST_FORW)); + } + } +} + + /* Return the head and tail pointers of ebb starting at BEG and ending at END. */ void @@ -3448,15 +3624,12 @@ commit_schedule (rtx prev_head, rtx tail, basic_block *target_bb) issue an asm statement. If SHADOWS_ONLY_P is true, we eliminate all real insns and only - leave those for which SHADOW_P is true. - - Return the number of cycles we must - advance to find the next ready instruction, or zero if there remain - insns on the ready list. */ + leave those for which SHADOW_P is true. If MODULO_EPILOGUE is true, + we only leave insns which have an INSN_EXACT_TICK. */ static void prune_ready_list (state_t temp_state, bool first_cycle_insn_p, - bool shadows_only_p) + bool shadows_only_p, bool modulo_epilogue_p) { int i; @@ -3467,6 +3640,12 @@ prune_ready_list (state_t temp_state, bool first_cycle_insn_p, int cost = 0; const char *reason = "resource conflict"; + if (modulo_epilogue_p && !DEBUG_INSN_P (insn) + && INSN_EXACT_TICK (insn) == INVALID_TICK) + { + cost = max_insn_queue_index; + reason = "not an epilogue insn"; + } if (shadows_only_p && !DEBUG_INSN_P (insn) && !SHADOW_P (insn)) { cost = 1; @@ -3580,10 +3759,11 @@ verify_shadows (void) TARGET_BB, possibly bringing insns from subsequent blocks in the same region. */ -void +bool schedule_block (basic_block *target_bb) { int i; + bool success = modulo_ii == 0; struct sched_block_state ls; state_t temp_state = NULL; /* It is used for multipass scheduling. */ int sort_p, advance, start_clock_var; @@ -3704,6 +3884,9 @@ schedule_block (basic_block *target_bb) gcc_assert (VEC_length (rtx, scheduled_insns) == 0); sort_p = TRUE; must_backtrack = false; + modulo_insns_scheduled = 0; + + ls.modulo_epilogue = false; /* Loop until all the insns in BB are scheduled. */ while ((*current_sched_info->schedule_more_p) ()) @@ -3733,8 +3916,41 @@ schedule_block (basic_block *target_bb) } while (advance > 0); - if (ready.n_ready > 0) - prune_ready_list (temp_state, true, false); + if (ls.modulo_epilogue) + { + int stage = clock_var / modulo_ii; + if (stage > modulo_last_stage * 2 + 2) + { + if (sched_verbose >= 2) + fprintf (sched_dump, + ";;\t\tmodulo scheduled succeeded at II %d\n", + modulo_ii); + success = true; + goto end_schedule; + } + } + else if (modulo_ii > 0) + { + int stage = clock_var / modulo_ii; + if (stage > modulo_max_stages) + { + if (sched_verbose >= 2) + fprintf (sched_dump, + ";;\t\tfailing schedule due to excessive stages\n"); + goto end_schedule; + } + if (modulo_n_insns == modulo_insns_scheduled + && stage > modulo_last_stage) + { + if (sched_verbose >= 2) + fprintf (sched_dump, + ";;\t\tfound kernel after %d stages, II %d\n", + stage, modulo_ii); + ls.modulo_epilogue = true; + } + } + + prune_ready_list (temp_state, true, false, ls.modulo_epilogue); if (ready.n_ready == 0) continue; if (must_backtrack) @@ -3912,6 +4128,11 @@ schedule_block (basic_block *target_bb) /* DECISION is made. 
*/ + if (modulo_ii > 0 && INSN_UID (insn) < modulo_iter0_max_uid) + { + modulo_insns_scheduled++; + modulo_last_stage = clock_var / modulo_ii; + } if (TODO_SPEC (insn) & SPECULATIVE) generate_recovery_code (insn); @@ -3964,7 +4185,8 @@ schedule_block (basic_block *target_bb) ls.first_cycle_insn_p = false; if (ready.n_ready > 0) - prune_ready_list (temp_state, false, ls.shadows_only_p); + prune_ready_list (temp_state, false, ls.shadows_only_p, + ls.modulo_epilogue); } do_backtrack: @@ -3979,6 +4201,12 @@ schedule_block (basic_block *target_bb) break; } } + if (must_backtrack && modulo_ii > 0) + { + if (modulo_backtracks_left == 0) + goto end_schedule; + modulo_backtracks_left--; + } while (must_backtrack) { struct haifa_saved_data *failed; @@ -4012,7 +4240,48 @@ schedule_block (basic_block *target_bb) } } } + if (ls.modulo_epilogue) + success = true; end_schedule: + if (modulo_ii > 0) + { + /* Once again, debug insn suckiness: they can be on the ready list + even if they have unresolved dependencies. To make our view + of the world consistent, remove such "ready" insns. */ + restart_debug_insn_loop: + for (i = ready.n_ready - 1; i >= 0; i--) + { + rtx x; + + x = ready_element (&ready, i); + if (DEPS_LIST_FIRST (INSN_HARD_BACK_DEPS (x)) != NULL + || DEPS_LIST_FIRST (INSN_SPEC_BACK_DEPS (x)) != NULL) + { + ready_remove (&ready, i); + goto restart_debug_insn_loop; + } + } + for (i = ready.n_ready - 1; i >= 0; i--) + { + rtx x; + + x = ready_element (&ready, i); + resolve_dependencies (x); + } + for (i = 0; i <= max_insn_queue_index; i++) + { + rtx link; + while ((link = insn_queue[i]) != NULL) + { + rtx x = XEXP (link, 0); + insn_queue[i] = XEXP (link, 1); + QUEUE_INDEX (x) = QUEUE_NOWHERE; + free_INSN_LIST_node (link); + resolve_dependencies (x); + } + } + } + /* Debug info. */ if (sched_verbose) { @@ -4020,11 +4289,11 @@ schedule_block (basic_block *target_bb) debug_ready_list (&ready); } - if (current_sched_info->queue_must_finish_empty) + if (modulo_ii == 0 && current_sched_info->queue_must_finish_empty) /* Sanity check -- queue must be empty now. Meaningless if region has multiple bbs. */ gcc_assert (!q_size && !ready.n_ready && !ready.n_debug); - else + else if (modulo_ii == 0) { /* We must maintain QUEUE_INDEX between blocks in region. */ for (i = ready.n_ready - 1; i >= 0; i--) @@ -4052,9 +4321,16 @@ schedule_block (basic_block *target_bb) } } - commit_schedule (prev_head, tail, target_bb); - if (sched_verbose) - fprintf (sched_dump, ";; total time = %d\n", clock_var); + if (success) + { + commit_schedule (prev_head, tail, target_bb); + if (sched_verbose) + fprintf (sched_dump, ";; total time = %d\n", clock_var); + } + else + last_scheduled_insn = tail; + + VEC_truncate (rtx, scheduled_insns, 0); if (!current_sched_info->queue_must_finish_empty || haifa_recovery_bb_recently_added_p) @@ -4092,6 +4368,8 @@ schedule_block (basic_block *target_bb) current_sched_info->tail = tail; free_backtrack_queue (); + + return success; } /* Set_priorities: compute priority of each insn in the block. */ @@ -4302,6 +4580,8 @@ haifa_sched_init (void) nr_begin_data = nr_begin_control = nr_be_in_data = nr_be_in_control = 0; before_recovery = 0; after_recovery = 0; + + modulo_ii = 0; } /* Finish work with the data specific to the Haifa scheduler. 
 */
diff --git a/gcc/params.def b/gcc/params.def
index a795c38aefc..5e49c48f7b9 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -165,6 +165,13 @@ DEFPARAM(PARAM_MAX_PENDING_LIST_LENGTH,
 	 "The maximum length of scheduling's pending operations list",
 	 32, 0, 0)
 
+/* This parameter limits the number of backtracking attempts when using the
+   haifa scheduler for modulo scheduling.  */
+DEFPARAM(PARAM_MAX_MODULO_BACKTRACK_ATTEMPTS,
+	 "max-modulo-backtrack-attempts",
+	 "The maximum number of backtrack attempts the scheduler should make when modulo scheduling a loop",
+	 40, 0, 0)
+
 DEFPARAM(PARAM_LARGE_FUNCTION_INSNS,
 	 "large-function-insns",
 	 "The size of function body to be considered large",
diff --git a/gcc/sched-int.h b/gcc/sched-int.h
index 6797397b9bb..5e90cd1da53 100644
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -1257,7 +1257,7 @@ extern int dep_cost (dep_t);
 extern int set_priorities (rtx, rtx);
 
 extern void sched_setup_bb_reg_pressure_info (basic_block, rtx);
-extern void schedule_block (basic_block *);
+extern bool schedule_block (basic_block *);
 
 extern int cycle_issued_insns;
 extern int issue_rate;
@@ -1330,7 +1330,9 @@ extern int current_blocks;
 extern int target_bb;
 extern bool sched_no_dce;
 
-extern void record_delay_slot_pair (rtx, rtx, int);
+extern void set_modulo_params (int, int, int, int);
+extern void record_delay_slot_pair (rtx, rtx, int, int);
+extern void discard_delay_pairs_above (int);
 extern void free_delay_pairs (void);
 extern void add_delay_dependencies (rtx);
 extern bool sched_is_disabled_for_current_region_p (void);
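The hunks above are easiest to follow from the caller's side, so here is a minimal sketch (not part of this commit) of how a port might drive the new interface: record stage-separated delay pairs for the unrolled copies of a loop body, switch on modulo scheduling with set_modulo_params, and act on the new boolean result of schedule_block. Only set_modulo_params, record_delay_slot_pair, discard_delay_pairs_above and schedule_block come from the patch; the function try_modulo_schedule and the names loop_bb, copy_of, n_body_insns and iter0_max_uid are illustrative assumptions, and the code is written as if it lived in a port file such as config/c6x/c6x.c where the usual GCC internal headers are already included.

/* Hypothetical caller of the new modulo-scheduling hooks; a sketch only.
   copy_of[s][i] is assumed to point at the copy of body insn I that belongs
   to unrolled stage S, with copy_of[0][i] being the original first-iteration
   insn (UID < ITER0_MAX_UID).  */

static bool
try_modulo_schedule (basic_block loop_bb, int ii, int max_stages,
		     int n_body_insns, int iter0_max_uid, rtx **copy_of)
{
  int s, i;

  /* Tie every unrolled copy to the corresponding first-iteration insn.
     A nonzero STAGES argument makes pair_delay compute the required
     distance as STAGES * II instead of using the CYCLES value.  */
  for (s = 1; s <= max_stages; s++)
    for (i = 0; i < n_body_insns; i++)
      record_delay_slot_pair (copy_of[0][i], copy_of[s][i],
			      /* cycles */ 0, /* stages */ s);

  /* Enable the modulo bookkeeping in schedule_block: the initiation
     interval, the stage limit after which it gives up, the number of real
     insns per iteration, and the UID boundary that identifies
     first-iteration insns.  */
  set_modulo_params (ii, max_stages, n_body_insns, iter0_max_uid);

  /* schedule_block now reports whether a kernel and epilogue were found
     within the stage and backtrack limits.  */
  if (!schedule_block (&loop_bb))
    {
      /* Failed attempt: if the caller deletes the unrolled copies and
	 falls back to normal scheduling, the delay pairs referring to
	 them have to be discarded first.  */
      discard_delay_pairs_above (iter0_max_uid);
      return false;
    }
  return true;
}

Because the stage distance is multiplied by the current II in pair_delay, a pair recorded with stages = 2 means an eight-cycle separation at II = 4 and ten cycles at II = 5, so a retry at a larger II can reuse the recorded pairs as long as the copies are kept. The number of backtracking attempts per schedule is capped by the new max-modulo-backtrack-attempts parameter (default 40). The sketch calls schedule_block directly only to show its new return value; in practice the call would normally reach it through the EBB or region scheduler.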