author:    rakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4>  2006-11-12 20:05:49 +0000
committer: rakdver <rakdver@138bc75d-0d04-0410-961f-82ee72b054a4>  2006-11-12 20:05:49 +0000
commit:    53d4d5ccad6817ab0c190f407b57fa14b9ec6db8 (patch)
tree:      872aef0e893c395cc6fdd9d9fe625854bf2281f4 /gcc/tree-ssa-loop-prefetch.c
parent:    faa56cf9fb2c67f8b284c4d9fcdc641a7728a4a9 (diff)
* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
comments.
(issue_prefetch_ref): Move assignment to write_p out of loop.
(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
SIMULTANEOUS_PREFETCHES into account.
(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
* lambda-code.c (lcm): Renamed to ...
(least_common_multiple): ... and exported.
* tree-flow.h (least_common_multiple): Declare.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@118730 138bc75d-0d04-0410-961f-82ee72b054a4
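
The lcm rename in lambda-code.c makes the helper usable from this pass for
the new unroll-factor computation. A minimal sketch of such a helper, built
on Euclid's gcd; the unsigned types here are an assumption, and the real
declaration lives in tree-flow.h:

/* Sketch of a least-common-multiple helper in the spirit of the one
   exported from lambda-code.c; not the exact GCC implementation.  */
static unsigned
gcd (unsigned a, unsigned b)
{
  while (b != 0)
    {
      unsigned t = a % b;
      a = b;
      b = t;
    }
  return a;
}

unsigned
least_common_multiple (unsigned a, unsigned b)
{
  /* Divide before multiplying to reduce the risk of overflow.  */
  return a / gcd (a, b) * b;
}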
Diffstat (limited to 'gcc/tree-ssa-loop-prefetch.c')
-rw-r--r--  gcc/tree-ssa-loop-prefetch.c | 103
1 file changed, 50 insertions(+), 53 deletions(-)
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index 41ada264a04..b40fc0f6617 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -744,19 +744,21 @@ static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
unsigned ahead)
{
- unsigned max_prefetches, n_prefetches;
+ unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
+ unsigned slots_per_prefetch;
struct mem_ref *ref;
bool any = false;
- max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
- if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
- max_prefetches = SIMULTANEOUS_PREFETCHES;
+ /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
+ remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
+ /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
+ AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
+ it will need a prefetch slot. */
+ slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
-
- if (!max_prefetches)
- return false;
+ fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
+ slots_per_prefetch);
/* For now we just take memory references one by one and issue
prefetches for as many as possible. The groups are sorted
@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
if (!should_issue_prefetch_p (ref))
continue;
- ref->issue_prefetch_p = true;
-
- /* If prefetch_mod is less then unroll_factor, we need to insert
- several prefetches for the reference. */
+ /* If we need to prefetch the reference each PREFETCH_MOD iterations,
+ and we unroll the loop UNROLL_FACTOR times, we need to insert
+ ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
+ iteration. */
n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
/ ref->prefetch_mod);
- if (max_prefetches <= n_prefetches)
- return true;
+ prefetch_slots = n_prefetches * slots_per_prefetch;
+
+ /* If more than half of the prefetches would be lost anyway, do not
+ issue the prefetch. */
+ if (2 * remaining_prefetch_slots < prefetch_slots)
+ continue;
+
+ ref->issue_prefetch_p = true;
- max_prefetches -= n_prefetches;
+ if (remaining_prefetch_slots <= prefetch_slots)
+ return true;
+ remaining_prefetch_slots -= prefetch_slots;
any = true;
}
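
The effect of the new slot accounting above is easier to see outside the
pass. The following standalone sketch uses hypothetical values for ahead,
unroll_factor and the per-reference prefetch_mod fields; only the arithmetic
mirrors the patch:

/* Standalone sketch of the slot-accounting heuristic; the constants and
   the simplified ref list are hypothetical, not GCC code.  */
#include <stdio.h>

#define SIMULTANEOUS_PREFETCHES 4	/* assumed target parameter */

int
main (void)
{
  unsigned ahead = 12, unroll_factor = 4;
  unsigned prefetch_mods[] = { 1, 2, 8 };	/* hypothetical refs */
  unsigned remaining = SIMULTANEOUS_PREFETCHES;
  unsigned i;

  /* Each prefetch stays live for AHEAD original iterations, i.e.
     AHEAD / UNROLL_FACTOR unrolled iterations, rounded to nearest.  */
  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;

  for (i = 0; i < 3; i++)
    {
      unsigned n_prefetches
	= (unroll_factor + prefetch_mods[i] - 1) / prefetch_mods[i];
      unsigned slots = n_prefetches * slots_per_prefetch;

      /* Skip the ref if more than half of its prefetches would be lost.  */
      if (2 * remaining < slots)
	continue;
      printf ("ref %u: issue %u prefetches (%u slots)\n",
	      i, n_prefetches, slots);
      if (remaining <= slots)
	break;
      remaining -= slots;
    }
  return 0;
}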
@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/ ref->prefetch_mod);
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
+ write_p = ref->write_p ? integer_one_node : integer_zero_node;
for (ap = 0; ap < n_prefetches; ap++)
{
@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
/* Create the prefetch instruction. */
- write_p = ref->write_p ? integer_one_node : integer_zero_node;
params = tree_cons (NULL_TREE, addr,
tree_cons (NULL_TREE, write_p, NULL_TREE));
-
+
prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
params);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
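
At the source level, the call built from BUILT_IN_PREFETCH corresponds to
something like the following; the array, trip count and prefetch distance
are hypothetical, and the second argument is the write_p value that the
patch now computes once instead of once per emitted prefetch:

void
example (double *a, long n)
{
  long i;

  for (i = 0; i < n; i++)
    {
      /* Second argument is write_p: 1 because this reference stores.
	 The distance of 64 elements is an assumed AHEAD * step.  */
      __builtin_prefetch (&a[i + 64], 1);
      a[i] *= 2.0;
    }
}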
@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
- unsigned ahead, unsigned ninsns,
- struct tree_niter_desc *desc)
+ unsigned ninsns, struct tree_niter_desc *desc)
{
- unsigned upper_bound, size_factor, constraint_factor;
- unsigned factor, max_mod_constraint, ahead_factor;
+ unsigned upper_bound;
+ unsigned nfactor, factor, mod_constraint;
struct mem_ref_group *agp;
struct mem_ref *ref;
- upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
-
- /* First check whether the loop is not too large to unroll. */
- size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
- if (size_factor <= 1)
+ /* First check whether the loop is not too large to unroll. We ignore
+ PARAM_MAX_UNROLL_TIMES, because for small loops it prevented us
+ from unrolling them enough for each iteration to cover exactly one
+ cache line. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
+ us from unrolling loops too many times in cases where we only expect
+ gains from better scheduling and decreased loop overhead, which is not
+ the case here. */
+ upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+ if (upper_bound <= 1)
return 1;
- if (size_factor < upper_bound)
- upper_bound = size_factor;
-
- max_mod_constraint = 1;
+ /* Choose the factor so that we may prefetch each cache line just once,
+ but bound the unrolling by UPPER_BOUND. */
+ factor = 1;
for (agp = refs; agp; agp = agp->next)
for (ref = agp->refs; ref; ref = ref->next)
- if (should_issue_prefetch_p (ref)
- && ref->prefetch_mod > max_mod_constraint)
- max_mod_constraint = ref->prefetch_mod;
-
- /* Set constraint_factor as large as needed to be able to satisfy the
- largest modulo constraint. */
- constraint_factor = max_mod_constraint;
-
- /* If ahead is too large in comparison with the number of available
- prefetches, unroll the loop as much as needed to be able to prefetch
- at least partially some of the references in the loop. */
- ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
- / SIMULTANEOUS_PREFETCHES);
-
- /* Unroll as much as useful, but bound the code size growth. */
- if (constraint_factor < ahead_factor)
- factor = ahead_factor;
- else
- factor = constraint_factor;
- if (factor > upper_bound)
- factor = upper_bound;
+ if (should_issue_prefetch_p (ref))
+ {
+ mod_constraint = ref->prefetch_mod;
+ nfactor = least_common_multiple (mod_constraint, factor);
+ if (nfactor <= upper_bound)
+ factor = nfactor;
+ }
if (!should_unroll_loop_p (loop, desc, factor))
return 1;
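
Extracted from the pass, the new selection loop amounts to growing the
factor by least common multiples while the result stays within the size
bound. A sketch, where MODS stands in for the prefetch_mod values of the
references and least_common_multiple is as outlined after the commit
message:

extern unsigned least_common_multiple (unsigned, unsigned);

static unsigned
choose_factor (const unsigned *mods, unsigned n, unsigned upper_bound)
{
  unsigned factor = 1, nfactor, i;

  for (i = 0; i < n; i++)
    {
      /* Only grow the factor if the LCM still fits the size bound;
	 otherwise keep the previous (still valid) factor.  */
      nfactor = least_common_multiple (mods[i], factor);
      if (nfactor <= upper_bound)
	factor = nfactor;
    }
  return factor;
}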
@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
instructions executed per iteration of the loop. */
ninsns = tree_num_loop_insns (loop);
ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
- unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
- &desc);
+ unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
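
For reference, ahead is the prefetch latency converted into loop iterations
with the usual round-up division; a tiny check with assumed values
PREFETCH_LATENCY = 200 and ninsns = 30:

/* AHEAD = ceil (PREFETCH_LATENCY / ninsns); the values are hypothetical.  */
#include <assert.h>

int
main (void)
{
  unsigned prefetch_latency = 200, ninsns = 30;
  unsigned ahead = (prefetch_latency + ninsns - 1) / ninsns;

  assert (ahead == 7);	/* 200 / 30 == 6.67, rounded up.  */
  return 0;
}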