author    | rakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4> | 2006-11-12 20:05:49 +0000
committer | rakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4> | 2006-11-12 20:05:49 +0000
commit    | 53d4d5ccad6817ab0c190f407b57fa14b9ec6db8 (patch)
tree      | 872aef0e893c395cc6fdd9d9fe625854bf2281f4 /gcc/tree-ssa-loop-prefetch.c
parent    | faa56cf9fb2c67f8b284c4d9fcdc641a7728a4a9 (diff)
download  | gcc-53d4d5ccad6817ab0c190f407b57fa14b9ec6db8.tar.gz
* tree-ssa-loop-prefetch.c (schedule_prefetches): Clean up and improve
comments.
(issue_prefetch_ref): Move assignment to write_p out of loop.
(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
SIMULTANEOUS_PREFETCHES into account.
(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
* lambda-code.c (lcm): Renamed to ...
(least_common_multiple): ... and exported.
* tree-flow.h (least_common_multiple): Declare.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@118730 138bc75d-0d04-0410-961f-82ee72b054a4
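The scheduling change above replaces the old per-loop prefetch cap with a
slot budget: each prefetch stays live for AHEAD iterations of the original
loop, i.e. roughly AHEAD / UNROLL_FACTOR iterations of the unrolled loop,
so it occupies that many of the hardware's simultaneous-prefetch slots.
The following standalone sketch (not GCC code; the SIMULTANEOUS_PREFETCHES
value and the four-reference walk are assumed purely for illustration)
shows how the new accounting behaves:

#include <stdio.h>

#define SIMULTANEOUS_PREFETCHES 8	/* illustrative value */

int
main (void)
{
  unsigned ahead = 10, unroll_factor = 4;
  unsigned remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* Rounded division, as in the patch: each prefetch occupies about
     AHEAD / UNROLL_FACTOR slots, here (10 + 2) / 4 = 3.  */
  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;

  /* Pretend we see four references, each needing a single prefetch
     instruction, i.e. slots_per_prefetch slots.  */
  for (unsigned i = 0; i < 4; i++)
    {
      unsigned prefetch_slots = 1 * slots_per_prefetch;

      /* Skip the reference if more than half of its prefetches would be
	 lost anyway.  */
      if (2 * remaining_prefetch_slots < prefetch_slots)
	continue;

      printf ("issue prefetch %u (budget %u -> ", i, remaining_prefetch_slots);
      if (remaining_prefetch_slots <= prefetch_slots)
	remaining_prefetch_slots = 0;	/* budget exhausted; the real pass stops here */
      else
	remaining_prefetch_slots -= prefetch_slots;
      printf ("%u)\n", remaining_prefetch_slots);
    }
  return 0;
}

With these numbers the first three references are issued (the budget drops
8 -> 5 -> 2 -> 0) and the fourth is skipped by the "more than half would be
lost" test.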
Diffstat (limited to 'gcc/tree-ssa-loop-prefetch.c')
-rw-r--r-- | gcc/tree-ssa-loop-prefetch.c | 103
1 file changed, 50 insertions, 53 deletions
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index 41ada264a04..b40fc0f6617 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -744,19 +744,21 @@ static bool
 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
 		     unsigned ahead)
 {
-  unsigned max_prefetches, n_prefetches;
+  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
+  unsigned slots_per_prefetch;
   struct mem_ref *ref;
   bool any = false;
 
-  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
-  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
-    max_prefetches = SIMULTANEOUS_PREFETCHES;
+  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
+  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
 
+  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
+     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each
+     iteration, it will need a prefetch slot.  */
+  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
-
-  if (!max_prefetches)
-    return false;
+    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
+	     slots_per_prefetch);
 
   /* For now we just take memory references one by one and issue
      prefetches for as many as possible.  The groups are sorted
@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
       if (!should_issue_prefetch_p (ref))
 	continue;
 
-      ref->issue_prefetch_p = true;
-
-      /* If prefetch_mod is less then unroll_factor, we need to insert
-	 several prefetches for the reference.  */
+      /* If we need to prefetch the reference each PREFETCH_MOD iterations,
+	 and we unroll the loop UNROLL_FACTOR times, we need to insert
+	 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
+	 iteration.  */
       n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		      / ref->prefetch_mod);
-      if (max_prefetches <= n_prefetches)
-	return true;
+      prefetch_slots = n_prefetches * slots_per_prefetch;
+
+      /* If more than half of the prefetches would be lost anyway, do not
+	 issue the prefetch.  */
+      if (2 * remaining_prefetch_slots < prefetch_slots)
+	continue;
+
+      ref->issue_prefetch_p = true;
 
-      max_prefetches -= n_prefetches;
+      if (remaining_prefetch_slots <= prefetch_slots)
+	return true;
+      remaining_prefetch_slots -= prefetch_slots;
       any = true;
     }
 
@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
		  / ref->prefetch_mod);
   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
+  write_p = ref->write_p ? integer_one_node : integer_zero_node;
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
       addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
 
       /* Create the prefetch instruction.  */
-      write_p = ref->write_p ? integer_one_node : integer_zero_node;
       params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));
-
+
       prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
       bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
 
 static unsigned
 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
-			 unsigned ahead, unsigned ninsns,
-			 struct tree_niter_desc *desc)
+			 unsigned ninsns, struct tree_niter_desc *desc)
 {
-  unsigned upper_bound, size_factor, constraint_factor;
-  unsigned factor, max_mod_constraint, ahead_factor;
+  unsigned upper_bound;
+  unsigned nfactor, factor, mod_constraint;
   struct mem_ref_group *agp;
   struct mem_ref *ref;
 
-  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
-
-  /* First check whether the loop is not too large to unroll.  */
-  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
-  if (size_factor <= 1)
+  /* First check whether the loop is not too large to unroll.  We ignore
+     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us from
+     unrolling them enough to make exactly one cache line covered by each
+     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent us
+     from unrolling the loops too many times in cases where we only expect
+     gains from better scheduling and decreasing loop overhead, which is
+     not the case here.  */
+  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+  if (upper_bound <= 1)
     return 1;
 
-  if (size_factor < upper_bound)
-    upper_bound = size_factor;
-
-  max_mod_constraint = 1;
+  /* Choose the factor so that we may prefetch each cache just once,
+     but bound the unrolling by UPPER_BOUND.  */
+  factor = 1;
   for (agp = refs; agp; agp = agp->next)
     for (ref = agp->refs; ref; ref = ref->next)
-      if (should_issue_prefetch_p (ref)
-	  && ref->prefetch_mod > max_mod_constraint)
-	max_mod_constraint = ref->prefetch_mod;
-
-  /* Set constraint_factor as large as needed to be able to satisfy the
-     largest modulo constraint.  */
-  constraint_factor = max_mod_constraint;
-
-  /* If ahead is too large in comparison with the number of available
-     prefetches, unroll the loop as much as needed to be able to prefetch
-     at least partially some of the references in the loop.  */
-  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
-		  / SIMULTANEOUS_PREFETCHES);
-
-  /* Unroll as much as useful, but bound the code size growth.  */
-  if (constraint_factor < ahead_factor)
-    factor = ahead_factor;
-  else
-    factor = constraint_factor;
-  if (factor > upper_bound)
-    factor = upper_bound;
+      if (should_issue_prefetch_p (ref))
+	{
+	  mod_constraint = ref->prefetch_mod;
+	  nfactor = least_common_multiple (mod_constraint, factor);
+	  if (nfactor <= upper_bound)
+	    factor = nfactor;
+	}
 
   if (!should_unroll_loop_p (loop, desc, factor))
     return 1;
@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
      instructions executed per iteration of the loop.  */
   ninsns = tree_num_loop_insns (loop);
   ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
-  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
-					   &desc);
+  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
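The unroll-factor change is driven by least_common_multiple, the helper this
commit renames from lcm in lambda-code.c and exports through tree-flow.h.
Below is a compilable sketch of the idea outside of GCC; the gcd helper and
the example prefetch_mod and upper_bound values are illustrative, not taken
from the GCC sources:

#include <stdio.h>

static unsigned
gcd (unsigned a, unsigned b)
{
  while (b)
    {
      unsigned t = a % b;
      a = b;
      b = t;
    }
  return a;
}

static unsigned
least_common_multiple (unsigned a, unsigned b)
{
  return (a / gcd (a, b)) * b;
}

int
main (void)
{
  /* Example: references with prefetch_mod 2, 3 and 8, and an
     upper_bound of 16 derived from PARAM_MAX_UNROLLED_INSNS.  */
  unsigned mods[] = { 2, 3, 8 };
  unsigned upper_bound = 16;
  unsigned factor = 1;
  unsigned i, nfactor;

  for (i = 0; i < 3; i++)
    {
      nfactor = least_common_multiple (mods[i], factor);
      /* Only accept the larger factor while unrolling that much still
	 fits within the code-growth bound.  */
      if (nfactor <= upper_bound)
	factor = nfactor;
    }
  /* lcm(1,2) = 2, lcm(2,3) = 6, lcm(6,8) = 24 > 16, so factor stays 6.  */
  printf ("unroll factor: %u\n", factor);
  return 0;
}

Growing the factor through the LCM keeps every accepted reference's
prefetch_mod dividing the final factor, so each such reference needs its
prefetch in exactly one iteration of the unrolled body, while the
nfactor <= upper_bound test caps code growth, replacing the old
PARAM_MAX_UNROLL_TIMES and SIMULTANEOUS_PREFETCHES based bounds.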