Commit 519cac4a by Jan Hubicka Committed by Jan Hubicka

cfgloopanal.c (get_loop_hot_path): New function.

	* cfgloopanal.c (get_loop_hot_path): New function.
	* tree-ssa-loop-ivcanon.c (struct loop_size): Add CONSTANT_IV,
	NUM_NON_PURE_CALLS_ON_HOT_PATH, NUM_PURE_CALLS_ON_HOT_PATH,
	NUM_BRANCHES_ON_HOT_PATH.
	(tree_estimate_loop_size): Compute the new values.
	(try_unroll_loop_completely): Disable unrolling of loops with only
	calls or too many branches.
	(tree_unroll_loops_completely): Deal also with outer loops of hot loops.
	* cfgloop.h (get_loop_hot_path): Declare.
	* params.def (PARAM_MAX_PEEL_BRANCHES): New parameter.
	* invoke.texi (max-peel-branches): Document.

	* gcc.dg/tree-ssa/loop-1.c: Make it still look like a good unrolling candidate.
	* gcc.dg/tree-ssa/loop-23.c: Likewise.
	* gcc.dg/tree-ssa/cunroll-1.c: Unrolling now happens early.
	* gcc.dg/tree-prof/unroll-1.c: Remove confused dg-options.

From-SVN: r193246
parent 425b784f
......@@ -714,6 +714,7 @@ extern void doloop_optimize_loops (void);
extern void move_loop_invariants (void);
extern bool finite_loop_p (struct loop *);
extern void scale_loop_profile (struct loop *loop, int scale, int iteration_bound);
extern VEC (basic_block, heap) * get_loop_hot_path (const struct loop *loop);
/* Returns the outermost loop of the loop nest that contains LOOP.*/
static inline struct loop *
......
......@@ -483,3 +483,36 @@ single_likely_exit (struct loop *loop)
VEC_free (edge, heap, exits);
return found;
}
/* Return the most likely ("hot") path through LOOP as a vector of basic
   blocks, starting at the loop header.  From each block we greedily follow
   the highest-probability successor edge that neither exits the loop nor
   revisits an already collected block, stopping when no such edge remains
   or when the best edge would close the cycle back to the header.
   The caller owns the returned vector and must VEC_free it.  */
VEC (basic_block, heap) *
get_loop_hot_path (const struct loop *loop)
{
  basic_block cur = loop->header;
  VEC (basic_block, heap) *hot_path = NULL;
  bitmap seen = BITMAP_ALLOC (NULL);

  for (;;)
    {
      edge e;
      edge_iterator ei;
      edge pref = NULL;

      VEC_safe_push (basic_block, heap, hot_path, cur);
      bitmap_set_bit (seen, cur->index);

      /* Pick the most probable successor staying inside the loop that we
	 have not walked yet.  */
      FOR_EACH_EDGE (e, ei, cur->succs)
	{
	  if (loop_exit_edge_p (loop, e)
	      || bitmap_bit_p (seen, e->dest->index))
	    continue;
	  if (pref == NULL || e->probability > pref->probability)
	    pref = e;
	}

      /* Stop at a dead end or once the path would wrap around.  */
      if (pref == NULL || pref->dest == loop->header)
	break;
      cur = pref->dest;
    }

  BITMAP_FREE (seen);
  return hot_path;
}
......@@ -9085,6 +9085,9 @@ the loop code is peeled.
@item max-peel-times
The maximum number of peelings of a single loop.
@item max-peel-branches
The maximum number of branches on the hot path through the peeled sequence.
@item max-completely-peeled-insns
The maximum number of insns of a completely peeled loop.
......
......@@ -291,6 +291,11 @@ DEFPARAM(PARAM_MAX_PEEL_TIMES,
"max-peel-times",
"The maximum number of peelings of a single loop",
16, 0, 0)
/* The maximum number of peelings of a single loop that is peeled completely. */
DEFPARAM(PARAM_MAX_PEEL_BRANCHES,
"max-peel-branches",
"The maximum number of branches on the path through the peeled sequence",
32, 0, 0)
/* The maximum number of insns of a peeled loop. */
DEFPARAM(PARAM_MAX_COMPLETELY_PEELED_INSNS,
"max-completely-peeled-insns",
......
2012-11-06 Jan Hubicka <jh@suse.cz>
	* gcc.dg/tree-ssa/loop-1.c: Make it still look like a good unrolling candidate.
* gcc.dg/tree-ssa/loop-23.c: Likewise.
* gcc.dg/tree-ssa/cunroll-1.c: Unrolling now happens early.
* gcc.dg/tree-prof/unroll-1.c: Remove confused dg-options.
2012-11-06 David Edelsohn <dje.gcc@gmail.com>
* const-uniq-1.c: Expand regex to match AIX XCOFF labels.
......
......@@ -21,4 +21,3 @@ main()
}
/* { dg-final-use { scan-rtl-dump "Considering unrolling loop with constant number of iterations" "loop2_unroll" } } */
/* { dg-final-use { cleanup-rtl-dump "Not unrolling loop, doesn't roll" } } */
/* { dg-options "-O3 -fdump-rtl-loop2_unroll -funroll-loops -fno-peel-loops" } */
/* { dg-do compile } */
/* { dg-options "-O3 -fdump-tree-cunroll-details" } */
/* { dg-options "-O3 -fdump-tree-cunrolli-details" } */
int a[2];
test(int c)
{
......@@ -10,4 +10,4 @@ test(int c)
/* Array bounds says the loop will not roll much. */
/* { dg-final { scan-tree-dump "Unrolled loop 1 completely .duplicated 1 times.." "cunroll"} } */
/* { dg-final { scan-tree-dump "Last iteration exit edge was proved true." "cunroll"} } */
/* { dg-final { cleanup-tree-dump "cunroll" } } */
/* { dg-final { cleanup-tree-dump "cunrolli" } } */
......@@ -17,13 +17,16 @@
to the load from the GOT this also contains the name of the function so for
each call the function name would appear twice. */
/* { dg-options "-O1 -ftree-loop-ivcanon -funroll-loops -fdump-tree-ivcanon-details -fdump-tree-cunroll-details -fdump-tree-optimized -mno-relax-pic-calls" { target mips*-*-* } } */
void xxx(void)
__attribute__ ((pure))
int foo (int x);
int xxx(void)
{
int x = 45;
int sum;
while (x >>= 1)
foo ();
sum += foo (x) * 2;
return sum;
}
/* We should be able to find out that the loop iterates four times and unroll it completely. */
......
/* { dg-do compile } */
/* { dg-options "-O2 -funroll-loops -fdump-tree-cunroll-details" } */
void bla(int);
__attribute__ ((pure))
int bla(int);
void foo(void)
int foo(void)
{
int i;
int sum;
/* This loop used to appear to be too large for unrolling. */
for (i = 0; i < 4; i++)
{
bla (i);
bla (2*i);
bla (3*i);
bla (4*i);
bla (5*i);
bla (6*i);
bla (7*i);
bla (8*i);
sum += bla (i);
sum += bla (2*i);
sum += bla (3*i);
sum += bla (4*i);
sum += bla (5*i);
sum += bla (6*i);
sum += bla (7*i);
sum += bla (8*i);
}
return sum;
}
/* { dg-final { scan-tree-dump-times "Unrolled loop 1 completely" 1 "cunroll" } } */
......
......@@ -140,6 +140,20 @@ struct loop_size
instructions after exit are not executed. */
int last_iteration;
int last_iteration_eliminated_by_peeling;
/* If some IV computation will become constant. */
bool constant_iv;
/* Number of call stmts that are not a builtin and are pure or const
present on the hot path. */
int num_pure_calls_on_hot_path;
/* Number of call stmts that are not a builtin and are not pure nor const
present on the hot path. */
int num_non_pure_calls_on_hot_path;
/* Number of statements other than calls in the loop. */
int non_call_stmts_on_hot_path;
/* Number of branches seen on the hot path. */
int num_branches_on_hot_path;
};
/* Return true if OP in STMT will be constant after peeling LOOP. */
......@@ -188,7 +202,11 @@ constant_after_peeling (tree op, gimple stmt, struct loop *loop)
return true;
}
/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS.
/* Computes an estimated number of insns in LOOP.
EXIT (if non-NULL) is an exit edge that will be eliminated in all but the last
iteration of the loop.
EDGE_TO_CANCEL (if non-NULL) is a non-exit edge eliminated in the last iteration
of the loop.
Return results in SIZE, estimate benefits for complete unrolling exiting by EXIT. */
static void
......@@ -198,11 +216,17 @@ tree_estimate_loop_size (struct loop *loop, edge exit, edge edge_to_cancel, stru
gimple_stmt_iterator gsi;
unsigned int i;
bool after_exit;
VEC (basic_block, heap) *path = get_loop_hot_path (loop);
size->overall = 0;
size->eliminated_by_peeling = 0;
size->last_iteration = 0;
size->last_iteration_eliminated_by_peeling = 0;
size->num_pure_calls_on_hot_path = 0;
size->num_non_pure_calls_on_hot_path = 0;
size->non_call_stmts_on_hot_path = 0;
size->num_branches_on_hot_path = 0;
size->constant_iv = 0;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Estimating sizes for loop %i\n", loop->num);
......@@ -221,6 +245,8 @@ tree_estimate_loop_size (struct loop *loop, edge exit, edge edge_to_cancel, stru
gimple stmt = gsi_stmt (gsi);
int num = estimate_num_insns (stmt, &eni_size_weights);
bool likely_eliminated = false;
bool likely_eliminated_last = false;
bool likely_eliminated_peeled = false;
if (dump_file && (dump_flags & TDF_DETAILS))
{
......@@ -231,11 +257,21 @@ tree_estimate_loop_size (struct loop *loop, edge exit, edge edge_to_cancel, stru
/* Look for reasons why we might optimize this stmt away. */
/* Exit conditional. */
if (exit && body[i] == exit->src && stmt == last_stmt (exit->src))
if (exit && body[i] == exit->src
&& stmt == last_stmt (exit->src))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " Exit condition will be eliminated.\n");
likely_eliminated = true;
fprintf (dump_file, " Exit condition will be eliminated "
"in peeled copies.\n");
likely_eliminated_peeled = true;
}
else if (edge_to_cancel && body[i] == edge_to_cancel->src
&& stmt == last_stmt (edge_to_cancel->src))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " Exit condition will be eliminated "
"in last copy.\n");
likely_eliminated_last = true;
}
/* Sets of IV variables */
else if (gimple_code (stmt) == GIMPLE_ASSIGN
......@@ -249,19 +285,22 @@ tree_estimate_loop_size (struct loop *loop, edge exit, edge edge_to_cancel, stru
/* Assignments of IV variables. */
else if (gimple_code (stmt) == GIMPLE_ASSIGN
&& TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME
&& constant_after_peeling (gimple_assign_rhs1 (stmt), stmt,loop)
&& constant_after_peeling (gimple_assign_rhs1 (stmt), stmt, loop)
&& (gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS
|| constant_after_peeling (gimple_assign_rhs2 (stmt),
stmt, loop)))
{
size->constant_iv = true;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " Constant expression will be folded away.\n");
likely_eliminated = true;
}
/* Conditionals. */
else if (gimple_code (stmt) == GIMPLE_COND
&& constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
&& constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop))
else if ((gimple_code (stmt) == GIMPLE_COND
&& constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
&& constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop))
|| (gimple_code (stmt) == GIMPLE_SWITCH
&& constant_after_peeling (gimple_switch_index (stmt), stmt, loop)))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, " Constant conditional.\n");
......@@ -269,16 +308,49 @@ tree_estimate_loop_size (struct loop *loop, edge exit, edge edge_to_cancel, stru
}
size->overall += num;
if (likely_eliminated)
if (likely_eliminated || likely_eliminated_peeled)
size->eliminated_by_peeling += num;
if (!after_exit)
{
size->last_iteration += num;
if (likely_eliminated)
if (likely_eliminated || likely_eliminated_last)
size->last_iteration_eliminated_by_peeling += num;
}
}
}
while (VEC_length (basic_block, path))
{
basic_block bb = VEC_pop (basic_block, path);
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple stmt = gsi_stmt (gsi);
if (gimple_code (stmt) == GIMPLE_CALL)
{
int flags = gimple_call_flags (stmt);
tree decl = gimple_call_fndecl (stmt);
if (decl && DECL_IS_BUILTIN (decl)
&& is_inexpensive_builtin (decl))
;
else if (flags & (ECF_PURE | ECF_CONST))
size->num_pure_calls_on_hot_path++;
else
size->num_non_pure_calls_on_hot_path++;
size->num_branches_on_hot_path ++;
}
else if (gimple_code (stmt) != GIMPLE_CALL
&& gimple_code (stmt) != GIMPLE_DEBUG)
size->non_call_stmts_on_hot_path++;
if (((gimple_code (stmt) == GIMPLE_COND
&& (!constant_after_peeling (gimple_cond_lhs (stmt), stmt, loop)
|| constant_after_peeling (gimple_cond_rhs (stmt), stmt, loop)))
|| (gimple_code (stmt) == GIMPLE_SWITCH
&& !constant_after_peeling (gimple_switch_index (stmt), stmt, loop)))
&& (!exit || bb != exit->src))
size->num_branches_on_hot_path++;
}
}
VEC_free (basic_block, heap, path);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "size: %i-%i, last_iteration: %i-%i\n", size->overall,
size->eliminated_by_peeling, size->last_iteration,
......@@ -644,34 +716,85 @@ try_unroll_loop_completely (struct loop *loop,
(int) unr_insns);
}
/* If the code is going to shrink, we don't need to be extra cautious
on guessing if the unrolling is going to be profitable. */
if (unr_insns
/* If there is IV variable that will become constant, we save
one instruction in the loop prologue we do not account
otherwise. */
<= ninsns + (size.constant_iv != false))
;
/* We unroll only inner loops, because we do not consider it profitable
otherwise. We still can cancel the loopback edge of a non-rolling loop;
this is always a good idea. */
if (loop->inner && unr_insns > ninsns)
else if (ul == UL_NO_GROWTH)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
loop->num);
return false;
}
/* Outer loops tend to be less interesting candidates for complete
unrolling unless we can do a lot of propagation into the inner loop
body. For now we disable outer loop unrolling when the code would
grow. */
else if (loop->inner)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d:"
fprintf (dump_file, "Not unrolling loop %d: "
"it is not innermost and code would grow.\n",
loop->num);
return false;
}
if (unr_insns > ninsns
&& (unr_insns
> (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS)))
/* If there is call on a hot path through the loop, then
there is most probably not much to optimize. */
else if (size.num_non_pure_calls_on_hot_path)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d "
"(--param max-completely-peeled-insns limit reached).\n",
fprintf (dump_file, "Not unrolling loop %d: "
"contains call and code would grow.\n",
loop->num);
return false;
}
if (ul == UL_NO_GROWTH
&& unr_insns > ninsns)
/* If there is pure/const call in the function, then we
can still optimize the unrolled loop body if it contains
some other interesting code than the calls and code
storing or cumulating the return value. */
else if (size.num_pure_calls_on_hot_path
/* One IV increment, one test, one ivtmp store
and one useful stmt. That is about the minimal loop
doing a pure call. */
&& (size.non_call_stmts_on_hot_path
<= 3 + size.num_pure_calls_on_hot_path))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d: size would grow.\n",
fprintf (dump_file, "Not unrolling loop %d: "
"contains just pure calls and code would grow.\n",
loop->num);
return false;
}
/* Complete unrolling is a major win when control flow is removed and
one big basic block is created. If the loop contains control flow
the optimization may still be a win because of eliminating the loop
overhead but it also may blow the branch predictor tables.
Limit the number of branches on the hot path through the peeled
sequence. */
else if (size.num_branches_on_hot_path * (int)n_unroll
> PARAM_VALUE (PARAM_MAX_PEEL_BRANCHES))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d: "
" number of branches on hot path in the unrolled sequence"
" reach --param max-peel-branches limit.\n",
loop->num);
return false;
}
else if (unr_insns
> (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Not unrolling loop %d: "
"(--param max-completely-peeled-insns limit reached).\n",
loop->num);
return false;
}
......@@ -689,6 +812,8 @@ try_unroll_loop_completely (struct loop *loop,
{
free_original_copy_tables ();
free (wont_exit);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Failed to duplicate the loop\n");
return false;
}
......@@ -968,7 +1093,7 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
{
struct loop *loop_father = loop_outer (loop);
if (may_increase_size && optimize_loop_for_speed_p (loop)
if (may_increase_size && optimize_loop_nest_for_speed_p (loop)
/* Unroll outermost loops only if asked to do so or they do
not cause code growth. */
&& (unroll_outer || loop_outer (loop_father)))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment