diff options
author | mkuvyrkov <mkuvyrkov@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-01-17 01:06:43 +0000 |
---|---|---|
committer | mkuvyrkov <mkuvyrkov@138bc75d-0d04-0410-961f-82ee72b054a4> | 2015-01-17 01:06:43 +0000 |
commit | 34aaed439380226950d65f37c02a549dfdebb16c (patch) | |
tree | b58630fa9c7bde3cf786f46d244c041a2357c3bd /gcc/haifa-sched.c | |
parent | b61d7819c8262a330e8feeb9a83b7c3c3e742c08 (diff) | |
download | gcc-34aaed439380226950d65f37c02a549dfdebb16c.tar.gz |
Model cache auto-prefetcher in scheduler
* config/arm/arm-protos.h (struct tune_params): New field
sched_autopref_queue_depth.
* config/arm/arm.c (sched-int.h): Include header.
(arm_first_cycle_multipass_dfa_lookahead_guard,)
(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
Specify sched_autopref_queue_depth value. Enabled for A15 and A57.
* config/arm/t-arm (arm.o): Update.
* haifa-sched.c (update_insn_after_change): Update.
(rank_for_schedule): Use auto-prefetcher model, if requested.
(autopref_multipass_init): New static function.
(autopref_rank_for_schedule): New rank_for_schedule heuristic.
(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
variable for debug dumps.
(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
(autopref_multipass_dfa_lookahead_guard): New global function that
implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
(init_h_i_d): Update.
* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
* sched-int.h (enum autopref_multipass_data_status): New const enum.
(autopref_multipass_data_): Structure for auto-prefetcher data.
(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
(struct _haifa_insn_data:autopref_multipass_data): New field.
(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
(autopref_multipass_dfa_lookahead_guard): Declare.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@219789 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/haifa-sched.c')
-rw-r--r-- | gcc/haifa-sched.c | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index 98cb9e4ba56..795ff79e898 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn) /* Forward declarations. */ static int priority (rtx_insn *); +static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *); static int rank_for_schedule (const void *, const void *); static void swap_sort (rtx_insn **, int); static void queue_insn (rtx_insn *, int, const char *); @@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn) INSN_COST (insn) = -1; /* Invalidate INSN_TICK, so it'll be recalculated. */ INSN_TICK (insn) = INVALID_TICK; + + /* Invalidate autoprefetch data entry. */ + INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; } @@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y) if (flag_sched_critical_path_heuristic && priority_val) return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2); + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0) + { + int autopref = autopref_rank_for_schedule (tmp, tmp2); + if (autopref != 0) + return autopref; + } + /* Prefer speculative insn with greater dependencies weakness. */ if (flag_sched_spec_insn_heuristic && spec_info) { @@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn) return false; } +/* Functions to model cache auto-prefetcher. + + Some of the CPUs have cache auto-prefetcher, which /seems/ to initiate + memory prefetches if it sees instructions with consecutive memory accesses + in the instruction stream. Details of such hardware units are not published, + so we can only guess what exactly is going on there. + In the scheduler, we model abstract auto-prefetcher. 
If there are memory + insns in the ready list (or the queue) that have same memory base, but + different offsets, then we delay the insns with larger offsets until insns + with smaller offsets get scheduled. If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH + is "1", then we look at the ready list; if it is N>1, then we also look + through N-1 queue entries. + If the param is N>=0, then rank_for_schedule will consider auto-prefetching + among its heuristics. + Param value of "-1" disables modelling of the auto-prefetcher. */ + +/* Initialize autoprefetcher model data for INSN. */ +static void +autopref_multipass_init (const rtx_insn *insn, int write) +{ + autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write]; + + gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED); + data->base = NULL_RTX; + data->offset = 0; + /* Set insn entry initialized, but not relevant for auto-prefetcher. */ + data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT; + + rtx set = single_set (insn); + if (set == NULL_RTX) + return; + + rtx mem = write ? SET_DEST (set) : SET_SRC (set); + if (!MEM_P (mem)) + return; + + struct address_info info; + decompose_mem_address (&info, mem); + + /* TODO: Currently only (base+const) addressing is supported. */ + if (info.base == NULL || !REG_P (*info.base) + || (info.disp != NULL && !CONST_INT_P (*info.disp))) + return; + + /* This insn is relevant for auto-prefetcher. */ + data->base = *info.base; + data->offset = info.disp ? INTVAL (*info.disp) : 0; + data->status = AUTOPREF_MULTIPASS_DATA_NORMAL; +} + +/* Helper function for rank_for_schedule sorting. 
*/ +static int +autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2) +{ + for (int write = 0; write < 2; ++write) + { + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + autopref_multipass_data_t data2 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write]; + + if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn1, write); + if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn2, write); + if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (!rtx_equal_p (data1->base, data2->base)) + continue; + + return data1->offset - data2->offset; + } + + return 0; +} + +/* True if header of debug dump was printed. */ +static bool autopref_multipass_dfa_lookahead_guard_started_dump_p; + +/* Helper for autopref_multipass_dfa_lookahead_guard. + Return "1" if INSN1 should be delayed in favor of INSN2. 
*/ +static int +autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1, + const rtx_insn *insn2, int write) +{ + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + autopref_multipass_data_t data2 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write]; + + if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn2, write); + if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + return 0; + + if (rtx_equal_p (data1->base, data2->base) + && data1->offset > data2->offset) + { + if (sched_verbose >= 2) + { + if (!autopref_multipass_dfa_lookahead_guard_started_dump_p) + { + fprintf (sched_dump, + ";;\t\tnot trying in max_issue due to autoprefetch " + "model: "); + autopref_multipass_dfa_lookahead_guard_started_dump_p = true; + } + + fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2)); + } + + return 1; + } + + return 0; +} + +/* General note: + + We could have also hooked autoprefetcher model into + first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks + to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle + (e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets + unblocked). We don't bother about this yet because target of interest + (ARM Cortex-A15) can issue only 1 memory operation per cycle. */ + +/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook. + Return "1" if INSN1 should not be considered in max_issue due to + auto-prefetcher considerations. 
*/ +int +autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index) +{ + int r = 0; + + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) <= 0) + return 0; + + if (sched_verbose >= 2 && ready_index == 0) + autopref_multipass_dfa_lookahead_guard_started_dump_p = false; + + for (int write = 0; write < 2; ++write) + { + autopref_multipass_data_t data1 + = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write]; + + if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED) + autopref_multipass_init (insn1, write); + if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT) + continue; + + if (ready_index == 0 + && data1->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY) + /* We allow only a single delay on privileged instructions. + Doing otherwise would cause infinite loop. */ + { + if (sched_verbose >= 2) + { + if (!autopref_multipass_dfa_lookahead_guard_started_dump_p) + { + fprintf (sched_dump, + ";;\t\tnot trying in max_issue due to autoprefetch " + "model: "); + autopref_multipass_dfa_lookahead_guard_started_dump_p = true; + } + + fprintf (sched_dump, " *%d*", INSN_UID (insn1)); + } + continue; + } + + for (int i2 = 0; i2 < ready.n_ready; ++i2) + { + rtx_insn *insn2 = get_ready_element (i2); + if (insn1 == insn2) + continue; + r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, write); + if (r) + { + if (ready_index == 0) + { + r = -1; + data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY; + } + goto finish; + } + } + + if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) == 1) + continue; + + /* Everything from the current queue slot should have been moved to + the ready list. 
*/ + gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX); + + int n_stalls = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) - 1; + if (n_stalls > max_insn_queue_index) + n_stalls = max_insn_queue_index; + + for (int stalls = 1; stalls <= n_stalls; ++stalls) + { + for (rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)]; + link != NULL_RTX; + link = link->next ()) + { + rtx_insn *insn2 = link->insn (); + r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, + write); + if (r) + { + /* Queue INSN1 until INSN2 can issue. */ + r = -stalls; + if (ready_index == 0) + data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY; + goto finish; + } + } + } + } + + finish: + if (sched_verbose >= 2 + && autopref_multipass_dfa_lookahead_guard_started_dump_p + && (ready_index == ready.n_ready - 1 || r < 0)) + /* This does not /always/ trigger. We don't output EOL if the last + insn is not recognized (INSN_CODE < 0) and lookahead_guard is not + called. We can live with this. */ + fprintf (sched_dump, "\n"); + + return r; +} + /* Define type for target data used in multipass scheduling. */ #ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T # define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int @@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn) INSN_EXACT_TICK (insn) = INVALID_TICK; INTER_TICK (insn) = INVALID_TICK; TODO_SPEC (insn) = HARD_DEP; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; + INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status + = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED; } } |