Commit 1cc521f1 by Michael Matz Committed by Michael Matz

Add unroll and jam pass

	* gimple-loop-jam.c: New file.
	* Makefile.in (OBJS): Add gimple-loop-jam.o.
	* common.opt (funroll-and-jam): New option.
	* opts.c (default_options_table): Add unroll-and-jam at -O3.
	* params.def (PARAM_UNROLL_JAM_MIN_PERCENT): New param.
	(PARAM_UNROLL_JAM_MAX_UNROLL): Ditto.
	* passes.def: Add pass_loop_jam.
	* timevar.def (TV_LOOP_JAM): Add.
	* tree-pass.h (make_pass_loop_jam): Declare.
	* cfgloop.c (flow_loop_tree_node_add): Add AFTER argument.
	* cfgloop.h (flow_loop_tree_node_add): Adjust declaration.
	* cfgloopmanip.c (duplicate_loop): Add AFTER argument, adjust call
	to flow_loop_tree_node_add.
	(duplicate_subloops, copy_loops_to): Append to sibling list.
	* cfgloopmanip.h: (duplicate_loop): Adjust declaration.
	* doc/invoke.texi (-funroll-and-jam): Document new option.
	(unroll-jam-min-percent, unroll-jam-max-unroll): Document new params.

testsuite/
	* gcc.dg/unroll-and-jam.c: New test.

From-SVN: r255467
parent 5a40ae3c
2017-12-07 Michael Matz <matz@suse.de>
Add unroll and jam pass
* gimple-loop-jam.c: New file.
* Makefile.in (OBJS): Add gimple-loop-jam.o.
* common.opt (funroll-and-jam): New option.
* opts.c (default_options_table): Add unroll-and-jam at -O3.
* params.def (PARAM_UNROLL_JAM_MIN_PERCENT): New param.
(PARAM_UNROLL_JAM_MAX_UNROLL): Ditto.
* passes.def: Add pass_loop_jam.
* timevar.def (TV_LOOP_JAM): Add.
* tree-pass.h (make_pass_loop_jam): Declare.
* cfgloop.c (flow_loop_tree_node_add): Add AFTER argument.
* cfgloop.h (flow_loop_tree_node_add): Adjust declaration.
* cfgloopmanip.c (duplicate_loop): Add AFTER argument, adjust call
to flow_loop_tree_node_add.
(duplicate_subloops, copy_loops_to): Append to sibling list.
* cfgloopmanip.h: (duplicate_loop): Adjust declaration.
* doc/invoke.texi (-funroll-and-jam): Document new option.
(unroll-jam-min-percent, unroll-jam-max-unroll): Document new params.
2017-12-07 Richard Biener <rguenther@suse.de> 2017-12-07 Richard Biener <rguenther@suse.de>
PR tree-optimization/83296 PR tree-optimization/83296
...@@ -1302,6 +1302,7 @@ OBJS = \ ...@@ -1302,6 +1302,7 @@ OBJS = \
gimple-iterator.o \ gimple-iterator.o \
gimple-fold.o \ gimple-fold.o \
gimple-laddress.o \ gimple-laddress.o \
gimple-loop-jam.o \
gimple-low.o \ gimple-low.o \
gimple-pretty-print.o \ gimple-pretty-print.o \
gimple-ssa-backprop.o \ gimple-ssa-backprop.o \
......
...@@ -296,13 +296,25 @@ establish_preds (struct loop *loop, struct loop *father) ...@@ -296,13 +296,25 @@ establish_preds (struct loop *loop, struct loop *father)
/* Add LOOP to the loop hierarchy tree where FATHER is father of the /* Add LOOP to the loop hierarchy tree where FATHER is father of the
added loop. If LOOP has some children, take care of that their added loop. If LOOP has some children, take care of that their
pred field will be initialized correctly. */ pred field will be initialized correctly. If AFTER is non-null
then it's expected it's a pointer into FATHERs inner sibling
list and LOOP is added behind AFTER, otherwise it's added in front
of FATHERs siblings. */
void void
flow_loop_tree_node_add (struct loop *father, struct loop *loop) flow_loop_tree_node_add (struct loop *father, struct loop *loop,
struct loop *after)
{ {
loop->next = father->inner; if (after)
father->inner = loop; {
loop->next = after->next;
after->next = loop;
}
else
{
loop->next = father->inner;
father->inner = loop;
}
establish_preds (loop, father); establish_preds (loop, father);
} }
......
...@@ -342,7 +342,8 @@ void rescan_loop_exit (edge, bool, bool); ...@@ -342,7 +342,8 @@ void rescan_loop_exit (edge, bool, bool);
void sort_sibling_loops (function *); void sort_sibling_loops (function *);
/* Loop data structure manipulation/querying. */ /* Loop data structure manipulation/querying. */
extern void flow_loop_tree_node_add (struct loop *, struct loop *); extern void flow_loop_tree_node_add (struct loop *, struct loop *,
struct loop * = NULL);
extern void flow_loop_tree_node_remove (struct loop *); extern void flow_loop_tree_node_remove (struct loop *);
extern bool flow_loop_nested_p (const struct loop *, const struct loop *); extern bool flow_loop_nested_p (const struct loop *, const struct loop *);
extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block); extern bool flow_bb_inside_loop_p (const struct loop *, const_basic_block);
......
...@@ -1000,9 +1000,11 @@ copy_loop_info (struct loop *loop, struct loop *target) ...@@ -1000,9 +1000,11 @@ copy_loop_info (struct loop *loop, struct loop *target)
} }
/* Copies copy of LOOP as subloop of TARGET loop, placing newly /* Copies copy of LOOP as subloop of TARGET loop, placing newly
created loop into loops structure. */ created loop into loops structure. If AFTER is non-null
the new loop is added at AFTER->next, otherwise in front of TARGETs
sibling list. */
struct loop * struct loop *
duplicate_loop (struct loop *loop, struct loop *target) duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
{ {
struct loop *cloop; struct loop *cloop;
cloop = alloc_loop (); cloop = alloc_loop ();
...@@ -1014,36 +1016,46 @@ duplicate_loop (struct loop *loop, struct loop *target) ...@@ -1014,36 +1016,46 @@ duplicate_loop (struct loop *loop, struct loop *target)
set_loop_copy (loop, cloop); set_loop_copy (loop, cloop);
/* Add it to target. */ /* Add it to target. */
flow_loop_tree_node_add (target, cloop); flow_loop_tree_node_add (target, cloop, after);
return cloop; return cloop;
} }
/* Copies structure of subloops of LOOP into TARGET loop, placing /* Copies structure of subloops of LOOP into TARGET loop, placing
newly created loops into loop tree. */ newly created loops into loop tree at the end of TARGETs sibling
list in the original order. */
void void
duplicate_subloops (struct loop *loop, struct loop *target) duplicate_subloops (struct loop *loop, struct loop *target)
{ {
struct loop *aloop, *cloop; struct loop *aloop, *cloop, *tail;
for (tail = target->inner; tail && tail->next; tail = tail->next)
;
for (aloop = loop->inner; aloop; aloop = aloop->next) for (aloop = loop->inner; aloop; aloop = aloop->next)
{ {
cloop = duplicate_loop (aloop, target); cloop = duplicate_loop (aloop, target, tail);
tail = cloop;
gcc_assert(!tail->next);
duplicate_subloops (aloop, cloop); duplicate_subloops (aloop, cloop);
} }
} }
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS, /* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
into TARGET loop, placing newly created loops into loop tree. */ into TARGET loop, placing newly created loops into loop tree adding
them to TARGETs sibling list at the end in order. */
static void static void
copy_loops_to (struct loop **copied_loops, int n, struct loop *target) copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
{ {
struct loop *aloop; struct loop *aloop, *tail;
int i; int i;
for (tail = target->inner; tail && tail->next; tail = tail->next)
;
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
{ {
aloop = duplicate_loop (copied_loops[i], target); aloop = duplicate_loop (copied_loops[i], target, tail);
tail = aloop;
gcc_assert(!tail->next);
duplicate_subloops (copied_loops[i], aloop); duplicate_subloops (copied_loops[i], aloop);
} }
} }
...@@ -1072,14 +1084,15 @@ can_duplicate_loop_p (const struct loop *loop) ...@@ -1072,14 +1084,15 @@ can_duplicate_loop_p (const struct loop *loop)
} }
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating /* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
loop structure and dominators. E's destination must be LOOP header for loop structure and dominators (order of inner subloops is retained).
this to work, i.e. it must be entry or latch edge of this loop; these are E's destination must be LOOP header for this to work, i.e. it must be entry
unique, as the loops must have preheaders for this function to work or latch edge of this loop; these are unique, as the loops must have
correctly (in case E is latch, the function unrolls the loop, if E is entry preheaders for this function to work correctly (in case E is latch, the
edge, it peels the loop). Store edges created by copying ORIG edge from function unrolls the loop, if E is entry edge, it peels the loop). Store
copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to edges created by copying ORIG edge from copies corresponding to set bits in
original LOOP body, the other copies are numbered in order given by control WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
flow through them) into TO_REMOVE array. Returns false if duplication is are numbered in order given by control flow through them) into TO_REMOVE
array. Returns false if duplication is
impossible. */ impossible. */
bool bool
......
...@@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge, ...@@ -47,7 +47,8 @@ extern struct loop *loopify (edge, edge,
profile_probability, profile_probability); profile_probability, profile_probability);
extern void unloop (struct loop *, bool *, bitmap); extern void unloop (struct loop *, bool *, bitmap);
extern void copy_loop_info (struct loop *loop, struct loop *target); extern void copy_loop_info (struct loop *loop, struct loop *target);
extern struct loop * duplicate_loop (struct loop *, struct loop *); extern struct loop * duplicate_loop (struct loop *, struct loop *,
struct loop * = NULL);
extern void duplicate_subloops (struct loop *, struct loop *); extern void duplicate_subloops (struct loop *, struct loop *);
extern bool can_duplicate_loop_p (const struct loop *loop); extern bool can_duplicate_loop_p (const struct loop *loop);
extern bool duplicate_loop_to_header_edge (struct loop *, edge, extern bool duplicate_loop_to_header_edge (struct loop *, edge,
......
...@@ -2695,6 +2695,10 @@ fsplit-loops ...@@ -2695,6 +2695,10 @@ fsplit-loops
Common Report Var(flag_split_loops) Optimization Common Report Var(flag_split_loops) Optimization
Perform loop splitting. Perform loop splitting.
funroll-and-jam
Common Report Var(flag_unroll_jam) Optimization
Perform unroll-and-jam on loops.
funwind-tables funwind-tables
Common Report Var(flag_unwind_tables) Optimization Common Report Var(flag_unwind_tables) Optimization
Just generate unwind tables for exception handling. Just generate unwind tables for exception handling.
......
...@@ -437,7 +437,7 @@ Objective-C and Objective-C++ Dialects}. ...@@ -437,7 +437,7 @@ Objective-C and Objective-C++ Dialects}.
-ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol
-ftree-switch-conversion -ftree-tail-merge @gol -ftree-switch-conversion -ftree-tail-merge @gol
-ftree-ter -ftree-vectorize -ftree-vrp -funconstrained-commons @gol -ftree-ter -ftree-vectorize -ftree-vrp -funconstrained-commons @gol
-funit-at-a-time -funroll-all-loops -funroll-loops @gol -funit-at-a-time -funroll-all-loops -funroll-loops -funroll-and-jam @gol
-funsafe-math-optimizations -funswitch-loops @gol -funsafe-math-optimizations -funswitch-loops @gol
-fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol
-fweb -fwhole-program -fwpa -fuse-linker-plugin @gol -fweb -fwhole-program -fwpa -fuse-linker-plugin @gol
...@@ -9771,6 +9771,12 @@ for one side of the iteration space and false for the other. ...@@ -9771,6 +9771,12 @@ for one side of the iteration space and false for the other.
Move branches with loop invariant conditions out of the loop, with duplicates Move branches with loop invariant conditions out of the loop, with duplicates
of the loop on both branches (modified according to result of the condition). of the loop on both branches (modified according to result of the condition).
@item -funroll-and-jam
@opindex funroll-and-jam
Apply unroll and jam transformations on feasible loops. In a loop
nest this unrolls the outer loop by some factor and fuses the resulting
multiple inner loops.
@item -ffunction-sections @item -ffunction-sections
@itemx -fdata-sections @itemx -fdata-sections
@opindex ffunction-sections @opindex ffunction-sections
...@@ -10838,6 +10844,14 @@ we may be able to devirtualize speculatively. ...@@ -10838,6 +10844,14 @@ we may be able to devirtualize speculatively.
@item max-vrp-switch-assertions @item max-vrp-switch-assertions
The maximum number of assertions to add along the default edge of a switch The maximum number of assertions to add along the default edge of a switch
statement during VRP. The default is 10. statement during VRP. The default is 10.
@item unroll-jam-min-percent
The minimum percentage of memory references that must be optimized
away for the unroll-and-jam transformation to be considered profitable.
@item unroll-jam-max-unroll
The maximum number of times the outer loop should be unrolled by
the unroll-and-jam transformation.
@end table @end table
@end table @end table
...@@ -535,6 +535,7 @@ static const struct default_options default_options_table[] = ...@@ -535,6 +535,7 @@ static const struct default_options default_options_table[] =
{ OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fsplit_loops, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_funroll_and_jam, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_loop_vectorize, NULL, 1 },
{ OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_ftree_slp_vectorize, NULL, 1 },
......
...@@ -1293,6 +1293,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK, ...@@ -1293,6 +1293,16 @@ DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
"Enable loop epilogue vectorization using smaller vector size.", "Enable loop epilogue vectorization using smaller vector size.",
0, 0, 1) 0, 0, 1)
DEFPARAM(PARAM_UNROLL_JAM_MIN_PERCENT,
"unroll-jam-min-percent",
"Minimum percentage of memrefs that must go away for unroll-and-jam to be considered profitable.",
1, 0, 100)
DEFPARAM(PARAM_UNROLL_JAM_MAX_UNROLL,
"unroll-jam-max-unroll",
"Maximum unroll factor for the unroll-and-jam transformation.",
4, 0, 0)
/* /*
Local variables: Local variables:
......
...@@ -273,6 +273,7 @@ along with GCC; see the file COPYING3. If not see ...@@ -273,6 +273,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_tree_unswitch); NEXT_PASS (pass_tree_unswitch);
NEXT_PASS (pass_scev_cprop); NEXT_PASS (pass_scev_cprop);
NEXT_PASS (pass_loop_split); NEXT_PASS (pass_loop_split);
NEXT_PASS (pass_loop_jam);
/* All unswitching, final value replacement and splitting can expose /* All unswitching, final value replacement and splitting can expose
empty loops. Remove them now. */ empty loops. Remove them now. */
NEXT_PASS (pass_cd_dce); NEXT_PASS (pass_cd_dce);
......
2017-12-07 Michael Matz <matz@suse.de>
* gcc.dg/unroll-and-jam.c: New test.
2017-12-07 Richard Biener <rguenther@suse.de> 2017-12-07 Richard Biener <rguenther@suse.de>
PR tree-optimization/83296 PR tree-optimization/83296
......
/* { dg-do run } */
/* { dg-options "-O3 -funroll-and-jam --param unroll-jam-min-percent=0 -fdump-tree-unrolljam-details" } */
/* { dg-require-effective-target int32plus } */
#include <stdio.h>
extern unsigned int a[];
extern unsigned int b[];
extern unsigned int aa[][1024];
unsigned int checksum;
/* Fold the contents of the global array aa into the global checksum.
   Starting from 1, accumulates aa[j][i]*31+47 in unsigned arithmetic for
   i in [0,1024) and j in [0,16), then sets
   checksum = checksum * 27 + sum.  Called after each loop nest under
   test so that differing array contents lead to differing checksums.  */
void checkaa(void)
{
unsigned sum = 1;
unsigned long i, j;
/* Note the traversal order: i is the outer index but the second
   subscript of aa.  */
for (i = 0; i < 1024; i++) {
for (j = 0; j < 16; j++) {
sum += aa[j][i]*31+47;
}
}
checksum = checksum * 27 + sum;
/* Debugging aid, disabled.  */
//printf(" %d\n", sum);
}
/* Fold the contents of the global array b into the global checksum.
   Starting from 1, accumulates b[i]*31+47 in unsigned arithmetic for
   i in [0,1024), then sets checksum = checksum * 27 + sum.  */
void checkb(void)
{
  unsigned sum = 1;
  unsigned long i;
  for (i = 0; i < 1024; i++) {
    sum += b[i]*31+47;
  }
  checksum = checksum * 27 + sum;
}
/* Define two copies of the same two-deep loop nest around BODY:
     NAME       - compiled with the options this file is built with;
     NAMEnoopt  - same source, but forced to -O1 through the optimize
                  attribute; RUN uses its checksum as the reference.
   Both take the inner bound N and the outer bound M, run i over [1, m)
   in the outer loop and j over [1, n) in the inner loop, executing BODY
   each iteration, and evaluate TEST once after the nest.  Both are
   marked noinline and noclone.  */
#define TEST(name, body, test) \
static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
{ \
unsigned long i, j; \
for (i = 1; i < m; i++) { \
for (j = 1; j < n; j++) { \
body; \
} \
} \
test; \
} \
static void __attribute__((noinline,noclone,optimize("O1"))) name ## noopt (unsigned long n, unsigned long m) \
{ \
unsigned long i, j; \
for (i = 1; i < m; i++) { \
for (j = 1; j < n; j++) { \
body; \
} \
} \
test; \
}
/* Instantiate the loop nests under test.  Each line carries the author's
   trailing annotation: "ok"/"notok" presumably states whether
   unroll-and-jam is expected to be applied to that nest (the five "ok"
   entries other than foo8 match the count checked by the dg-final
   directive), and the two numbers look like dependence distance
   components -- TODO confirm their exact meaning against the pass.  */
TEST(foo1, aa[i+1][j+1]=aa[i][j] * aa[i][j] / 2, checkaa()) //ok, -1,-1
TEST(foo2, aa[i][j+1]=3*aa[i+1][j], checkaa()) //notok, 1,-1
TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
/* foo8 should work as well, but currently doesn't because the distance
vectors we compute are too pessimistic. We compute
(0,1), (1,1) and (1,-1)
and the last one causes us to lose. */
TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
unsigned int a[1024];
unsigned int b[1024];
unsigned int aa[16][1024];
/* Reset all test data to a fixed, reproducible state.  For i in [0,1024)
   and j in [0,16) this stores ((j+1)*2+i+1) % 17 into aa[j][i],
   ((i+1)*31) % 19 into a[i] and ((i+1)*47) % 23 into b[i], and finally
   sets checksum to 1.  */
void init(void)
{
unsigned long i,j;
for (i = 0; i < 1024; i++) {
for (j = 0; j < 16; j++) {
aa[j][i] = ((j+1)*2+i+1) % 17;
}
a[i] = ((i+1)*31) % 19;
b[i] = ((i+1)*47) % 23;
}
checksum = 1;
}
/* Run the -O1 reference variant NAMEnoopt and then the optimized variant
   NAME, four times each on freshly initialized data, and compare the
   resulting checksums.  A mismatch is both printed and recorded in FAIL
   so that the execution test actually fails on a miscompile instead of
   only logging it.  Expects i, checka and fail to be declared in the
   enclosing scope.  Wrapped in do/while (0) so that the expansion is a
   single statement.  */
#define RUN(name) \
  do { \
    printf(" %s\n", #name); \
    init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
    init();for(i=0;i<4;i++)name(32,8); \
    if (checka != checksum) fail = 1; \
    printf("%sok %s\n", checka != checksum ? "NOT " : "", #name); \
  } while (0)

/* Exercise every loop nest and return non-zero if any optimized variant
   produced a checksum different from its -O1 reference.  */
int main(void)
{
  int fail = 0;
  int i;
  unsigned checka;
  RUN(foo1);
  RUN(foo2);
  RUN(foo3);
  RUN(foo4);
  RUN(foo5);
  RUN(foo6);
  RUN(foo7);
  RUN(foo8);
  RUN(foo9);
  RUN(foo10);
  return fail;
}
/* Five loops should be unroll-jammed (actually six, but see above). */
/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
...@@ -188,6 +188,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "tree canonical iv") ...@@ -188,6 +188,7 @@ DEFTIMEVAR (TV_TREE_LOOP_IVCANON , "tree canonical iv")
DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop") DEFTIMEVAR (TV_SCEV_CONST , "scev constant prop")
DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching") DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching")
DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting") DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting")
DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam")
DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling") DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
......
...@@ -370,6 +370,7 @@ extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt); ...@@ -370,6 +370,7 @@ extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt); extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_split (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_loop_jam (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt); extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt); extern gimple_opt_pass *make_pass_scev_cprop (gcc::context *ctxt);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment