Commit dea61d92 by Sebastian Pop Committed by Sebastian Pop

invoke.texi: Document -ftree-loop-distribution.

	* doc/invoke.texi: Document -ftree-loop-distribution.
	* tree-loop-distribution.c: New.
	* tree-pass.h (pass_loop_distribution): New.
	* graphds.h (struct graph): Add htab_t indices.
	* timevar.def (TV_TREE_LOOP_DISTRIBUTION): New.
	* tree-vectorizer.c (rename_variables_in_loop): Extern.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Init PENDING_STMT to NULL.
	* tree-vectorizer.h (tree_duplicate_loop_on_edge): Declared.
	* tree-data-ref.c (debug_data_dependence_relations): New.
	(dump_data_dependence_relation): Also print data references.
	(free_data_ref): Extern.
	(same_access_functions): Moved...
	(find_vertex_for_stmt): Renamed rdg_vertex_for_stmt.
	(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
	debug_rdg_component, dump_rdg, debug_rdg, dot_rdg_1, dot_rdg,
	struct rdg_vertex_info, rdg_vertex_for_stmt): New.
	(create_rdg_edge_for_ddr, create_rdg_vertices): Cleaned up.
	(stmts_from_loop): Skip LABEL_EXPR.
	(hash_stmt_vertex_info, eq_stmt_vertex_info, hash_stmt_vertex_del): New.
	(build_rdg): Initialize rdg->indices htab.
	(free_rdg, stores_from_loop, ref_base_address,
	rdg_defs_used_in_other_loops_p, have_similar_memory_accesses,
	have_similar_memory_accesses_1, ref_base_address_1,
	remove_similar_memory_refs): New.
	* tree-data-ref.h: Depend on tree-chrec.h.
	(debug_data_dependence_relations, free_data_ref): Declared.
	(same_access_functions): ... here.
	(ddr_is_anti_dependent, ddrs_have_anti_deps, ddr_dependence_level): New.
	(struct rdg_vertex): Add has_mem_write and has_mem_reads.
	(RDGV_HAS_MEM_WRITE, RDGV_HAS_MEM_READS, RDG_STMT,
	RDG_MEM_WRITE_STMT, RDG_MEM_READS_STMT): New.
	(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
	debug_rdg_component, dump_rdg, debug_rdg, dot_rdg,
	rdg_vertex_for_stmt): Declared.
	(struct rdg_edge): Add level.
	(RDGE_LEVEL): New.
	(free_rdg, stores_from_loop, remove_similar_memory_refs,
	rdg_defs_used_in_other_loops_p, have_similar_memory_accesses): Declared.
	(rdg_has_similar_memory_accesses): New.
	* tree-vect-analyze.c: Remove unused static decls.
	* lambda.h (dependence_level): New.
	* common.opt (ftree-loop-distribution): New.
	* tree-flow.h (mark_virtual_ops_in_bb, 
	slpeel_tree_duplicate_loop_to_edge_cfg,
	rename_variables_in_loop): Declared.
	* Makefile.in (TREE_DATA_REF_H): Depend on tree-chrec.h.
	(OBJS-common): Add tree-loop-distribution.o.
	(tree-loop-distribution.o): New rule.
	* tree-cfg.c (mark_virtual_ops_in_bb): New.
	(mark_virtual_ops_in_region): Use mark_virtual_ops_in_bb.
	* passes.c (init_optimization_passes): Schedule pass_loop_distribution.

	* testsuite/gcc.dg/tree-ssa/ldist-{1..12}.c: New.

From-SVN: r132745
parent dde75838
2008-02-28 Sebastian Pop <sebastian.pop@amd.com>
* doc/invoke.texi: Document -ftree-loop-distribution.
* tree-loop-distribution.c: New.
* tree-pass.h (pass_loop_distribution): New.
* graphds.h (struct graph): Add htab_t indices.
* timevar.def (TV_TREE_LOOP_DISTRIBUTION): New.
* tree-vectorizer.c (rename_variables_in_loop): Extern.
(slpeel_tree_duplicate_loop_to_edge_cfg): Init PENDING_STMT to NULL.
* tree-vectorizer.h (tree_duplicate_loop_on_edge): Declared.
* tree-data-ref.c (debug_data_dependence_relations): New.
(dump_data_dependence_relation): Also print data references.
(free_data_ref): Extern.
(same_access_functions): Moved...
(find_vertex_for_stmt): Renamed rdg_vertex_for_stmt.
(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
debug_rdg_component, dump_rdg, debug_rdg, dot_rdg_1, dot_rdg,
struct rdg_vertex_info, rdg_vertex_for_stmt): New.
(create_rdg_edge_for_ddr, create_rdg_vertices): Cleaned up.
(stmts_from_loop): Skip LABEL_EXPR.
(hash_stmt_vertex_info, eq_stmt_vertex_info, hash_stmt_vertex_del): New.
(build_rdg): Initialize rdg->indices htab.
(free_rdg, stores_from_loop, ref_base_address,
rdg_defs_used_in_other_loops_p, have_similar_memory_accesses,
have_similar_memory_accesses_1, ref_base_address_1,
remove_similar_memory_refs): New.
* tree-data-ref.h: Depend on tree-chrec.h.
(debug_data_dependence_relations, free_data_ref): Declared.
(same_access_functions): ... here.
(ddr_is_anti_dependent, ddrs_have_anti_deps, ddr_dependence_level): New.
(struct rdg_vertex): Add has_mem_write and has_mem_reads.
(RDGV_HAS_MEM_WRITE, RDGV_HAS_MEM_READS, RDG_STMT,
RDG_MEM_WRITE_STMT, RDG_MEM_READS_STMT): New.
(dump_rdg_vertex, debug_rdg_vertex, dump_rdg_component,
debug_rdg_component, dump_rdg, debug_rdg, dot_rdg,
rdg_vertex_for_stmt): Declared.
(struct rdg_edge): Add level.
(RDGE_LEVEL): New.
(free_rdg, stores_from_loop, remove_similar_memory_refs,
rdg_defs_used_in_other_loops_p, have_similar_memory_accesses): Declared.
(rdg_has_similar_memory_accesses): New.
* tree-vect-analyze.c: Remove unused static decls.
* lambda.h (dependence_level): New.
* common.opt (ftree-loop-distribution): New.
* tree-flow.h (mark_virtual_ops_in_bb,
slpeel_tree_duplicate_loop_to_edge_cfg,
rename_variables_in_loop): Declared.
* Makefile.in (TREE_DATA_REF_H): Depend on tree-chrec.h.
(OBJS-common): Add tree-loop-distribution.o.
(tree-loop-distribution.o): New rule.
* tree-cfg.c (mark_virtual_ops_in_bb): New.
(mark_virtual_ops_in_region): Use mark_virtual_ops_in_bb.
* passes.c (init_optimization_passes): Schedule pass_loop_distribution.
2008-02-28 Joseph Myers <joseph@codesourcery.com> 2008-02-28 Joseph Myers <joseph@codesourcery.com>
PR target/33963 PR target/33963
......
...@@ -838,7 +838,7 @@ DIAGNOSTIC_H = diagnostic.h diagnostic.def $(PRETTY_PRINT_H) options.h ...@@ -838,7 +838,7 @@ DIAGNOSTIC_H = diagnostic.h diagnostic.def $(PRETTY_PRINT_H) options.h
C_PRETTY_PRINT_H = c-pretty-print.h $(PRETTY_PRINT_H) $(C_COMMON_H) $(TREE_H) C_PRETTY_PRINT_H = c-pretty-print.h $(PRETTY_PRINT_H) $(C_COMMON_H) $(TREE_H)
SCEV_H = tree-scalar-evolution.h $(GGC_H) tree-chrec.h $(PARAMS_H) SCEV_H = tree-scalar-evolution.h $(GGC_H) tree-chrec.h $(PARAMS_H)
LAMBDA_H = lambda.h $(TREE_H) vec.h $(GGC_H) LAMBDA_H = lambda.h $(TREE_H) vec.h $(GGC_H)
TREE_DATA_REF_H = tree-data-ref.h $(LAMBDA_H) omega.h graphds.h TREE_DATA_REF_H = tree-data-ref.h $(LAMBDA_H) omega.h graphds.h tree-chrec.h
VARRAY_H = varray.h $(MACHMODE_H) $(SYSTEM_H) coretypes.h $(TM_H) VARRAY_H = varray.h $(MACHMODE_H) $(SYSTEM_H) coretypes.h $(TM_H)
TREE_INLINE_H = tree-inline.h $(VARRAY_H) pointer-set.h TREE_INLINE_H = tree-inline.h $(VARRAY_H) pointer-set.h
REAL_H = real.h $(MACHMODE_H) REAL_H = real.h $(MACHMODE_H)
...@@ -1156,6 +1156,7 @@ OBJS-common = \ ...@@ -1156,6 +1156,7 @@ OBJS-common = \
tree-if-conv.o \ tree-if-conv.o \
tree-into-ssa.o \ tree-into-ssa.o \
tree-iterator.o \ tree-iterator.o \
tree-loop-distribution.o \
tree-loop-linear.o \ tree-loop-linear.o \
tree-nested.o \ tree-nested.o \
tree-nrv.o \ tree-nrv.o \
...@@ -2283,6 +2284,11 @@ tree-loop-linear.o: tree-loop-linear.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ ...@@ -2283,6 +2284,11 @@ tree-loop-linear.o: tree-loop-linear.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) $(LAMBDA_H) \ tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) $(LAMBDA_H) \
$(TARGET_H) tree-chrec.h $(OBSTACK_H) $(TARGET_H) tree-chrec.h $(OBSTACK_H)
tree-loop-distribution.o: tree-loop-distribution.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
$(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) \
$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
tree-pass.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) \
$(TARGET_H) tree-chrec.h tree-vectorizer.h
tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \ tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
$(TREE_FLOW_H) $(TREE_H) $(RTL_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) $(GGC_H) \ $(TREE_FLOW_H) $(TREE_H) $(RTL_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) $(GGC_H) \
$(DIAGNOSTIC_H) tree-pass.h $(SCEV_H) langhooks.h gt-tree-parloops.h \ $(DIAGNOSTIC_H) tree-pass.h $(SCEV_H) langhooks.h gt-tree-parloops.h \
......
...@@ -1098,6 +1098,10 @@ ftree-fre ...@@ -1098,6 +1098,10 @@ ftree-fre
Common Report Var(flag_tree_fre) Optimization Common Report Var(flag_tree_fre) Optimization
Enable Full Redundancy Elimination (FRE) on trees Enable Full Redundancy Elimination (FRE) on trees
ftree-loop-distribution
Common Report Var(flag_tree_loop_distribution)
Enable loop distribution on trees
ftree-loop-im ftree-loop-im
Common Report Var(flag_tree_loop_im) Init(1) Optimization Common Report Var(flag_tree_loop_im) Init(1) Optimization
Enable loop invariant motion on trees Enable loop invariant motion on trees
......
...@@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}. ...@@ -354,6 +354,7 @@ Objective-C and Objective-C++ Dialects}.
-fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer -ftree-ccp @gol -fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer -ftree-ccp @gol
-ftree-ch -ftree-copy-prop -ftree-copyrename -ftree-dce @gol -ftree-ch -ftree-copy-prop -ftree-copyrename -ftree-dce @gol
-ftree-dominator-opts -ftree-dse -ftree-fre -ftree-loop-im @gol -ftree-dominator-opts -ftree-dse -ftree-fre -ftree-loop-im @gol
-ftree-loop-distribution @gol
-ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol
-ftree-parallelize-loops=@var{n} -ftree-pre -ftree-reassoc -ftree-salias @gol -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-reassoc -ftree-salias @gol
-ftree-sink -ftree-sra -ftree-store-ccp -ftree-ter @gol -ftree-sink -ftree-sra -ftree-store-ccp -ftree-ter @gol
...@@ -5928,6 +5929,11 @@ performance and allow further loop optimizations to take place. ...@@ -5928,6 +5929,11 @@ performance and allow further loop optimizations to take place.
Compare the results of several data dependence analyzers. This option Compare the results of several data dependence analyzers. This option
is used for debugging the data dependence analyzers. is used for debugging the data dependence analyzers.
@item -ftree-loop-distribution
Perform loop distribution. This flag can improve cache performance on
big loop bodies and allow further loop optimizations, like
parallelization or vectorization, to take place.
@item -ftree-loop-im @item -ftree-loop-im
@opindex ftree-loop-im @opindex ftree-loop-im
Perform loop invariant motion on trees. This pass moves only invariants that Perform loop invariant motion on trees. This pass moves only invariants that
......
...@@ -47,6 +47,7 @@ struct graph ...@@ -47,6 +47,7 @@ struct graph
int n_vertices; /* Number of vertices. */ int n_vertices; /* Number of vertices. */
struct vertex *vertices; struct vertex *vertices;
/* The vertices. */ /* The vertices. */
htab_t indices; /* Fast lookup for indices. */
}; };
struct graph *new_graph (int); struct graph *new_graph (int);
......
...@@ -469,5 +469,22 @@ build_linear_expr (tree type, lambda_vector coefs, VEC (tree, heap) *ivs) ...@@ -469,5 +469,22 @@ build_linear_expr (tree type, lambda_vector coefs, VEC (tree, heap) *ivs)
return expr; return expr;
} }
/* Returns the dependence level for a vector DIST of size LENGTH.
LEVEL = 0 means a lexicographic dependence, i.e. a dependence due
to the sequence of statements, not carried by any loop. */
static inline unsigned
dependence_level (lambda_vector dist_vect, int length)
{
int i;
for (i = 0; i < length; i++)
if (dist_vect[i] != 0)
return i + 1;
return 0;
}
#endif /* LAMBDA_H */ #endif /* LAMBDA_H */
...@@ -625,6 +625,7 @@ init_optimization_passes (void) ...@@ -625,6 +625,7 @@ init_optimization_passes (void)
NEXT_PASS (pass_empty_loop); NEXT_PASS (pass_empty_loop);
NEXT_PASS (pass_record_bounds); NEXT_PASS (pass_record_bounds);
NEXT_PASS (pass_check_data_deps); NEXT_PASS (pass_check_data_deps);
NEXT_PASS (pass_loop_distribution);
NEXT_PASS (pass_linear_transform); NEXT_PASS (pass_linear_transform);
NEXT_PASS (pass_iv_canon); NEXT_PASS (pass_iv_canon);
NEXT_PASS (pass_if_conversion); NEXT_PASS (pass_if_conversion);
......
2008-02-28 Sebastian Pop <sebastian.pop@amd.com>
* testsuite/gcc.dg/tree-ssa/ldist-1.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-1a.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-2.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-3.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-4.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-5.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-6.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-7.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-8.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-9.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-10.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-11.c: New.
* testsuite/gcc.dg/tree-ssa/ldist-12.c: New.
* testsuite/gfortran.dg/ldist-1.f90: New.
2008-02-28 Uros Bizjak <ubizjak@gmail.com> 2008-02-28 Uros Bizjak <ubizjak@gmail.com>
* gcc.dg/pr34351.c: Compile for x86 targets only. Use %ebx register. * gcc.dg/pr34351.c: Compile for x86 targets only. Use %ebx register.
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
void foo (int * __restrict__ ia,
int * __restrict__ ib,
int * __restrict__ oxa,
int * __restrict__ oxb,
int * __restrict__ oya,
int * __restrict__ oyb)
{
int i;
long int mya[52];
long int myb[52];
for (i=0; i < 52; i++)
{
mya[i] = ia[i] * oxa[i] + ib[i] * oxb[i];
myb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i];
oya[i] = mya[i] >> 10;
oyb[i] = myb[i] >> 10;
}
/* This loop was distributed, but it is not anymore due to the cost
model changes: the result of a distribution would look like this:
| for (i=0; i < 52; i++)
| oya[i] = ia[i] * oxa[i] + ib[i] * oxb[i] >> 10;
|
| for (i=0; i < 52; i++)
| oyb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i] >> 10;
and in this the array IA is read in both tasks. For maximizing
the cache reuse, ldist does not distributes this loop anymore.
*/
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
int a[1000], b[1000], c[1000];
for (i = 1; i < 1000; i ++)
{
a[i] = c[i]; /* S1 */
b[i] = a[i-1]+1; /* S2 */
}
/* Dependences:
S1->S2 (flow, level 1)
One partition as A is used in both S1 and S2.
*/
return a[1000-2] + b[1000-1] + c[1000-2];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
void foo (int * __restrict__ ia,
int * __restrict__ ib,
int * __restrict__ oxa,
int * __restrict__ oxb,
int * __restrict__ oya,
int * __restrict__ oyb)
{
int i;
long int mya[52];
long int myb[52];
for (i=0; i < 52; i++)
{
mya[i] = ia[i] * oxa[i] + ib[i] * oxb[i];
myb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i];
oya[i] = 0;
oyb[i] = myb[i] >> 10;
}
/* This loop should be distributed, and the result should look like
this:
| memset (oya, 0, 208);
| for (i=0; i < 52; i++)
| oyb[i] = -ia[i] * oxb[i] + ib[i] * oxa[i] >> 10;
*/
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int foo (int * __restrict__ ia,
int * __restrict__ ib,
int * __restrict__ oxa,
int * __restrict__ oxb)
{
int i;
int oya[52], oyb[52];
for (i=0; i < 52; i++)
{
oya[i] = (ia[i] * oxa[i]) >> 10;
oyb[i] = (ib[i] * oxb[i]) >> 10;
}
return oya[22] + oyb[21];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int foo (int * __restrict__ ia,
int * __restrict__ ib,
int * __restrict__ oxa,
int * __restrict__ oxb)
{
int i;
int oya[52], oyb[52];
for (i=0; i < 52; i++)
{
oya[i] = (ia[i] * oxa[i] + ib[i] * oxb[i]) >> 10;
oyb[i] = (-ia[i] * oxb[i] + ib[i] * oxa[i]) >> 10;
}
return oya[22] + oyb[21];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
void foo (int * __restrict__ a,
int * __restrict__ b,
int * __restrict__ c)
{
int i;
for (i=1; i < 10; i++)
{
a[i] += c[i];
b[i] = a[i - 1] + 1;
}
/* This loop is not distributed because the cost of spliting it:
| for (i=1; i < N; i++)
| a[i] += c[i];
|
| for (i=1; i < N; i++)
| b[i] = a[i - 1] + 1;
is higher due to data in array A that is written and then read in
another task. The cost model should forbid the transformation in
this case.
*/
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
int a[10000], b[10000], c[10000], d[10000];
a[0] = k; a[3] = k*2;
c[1] = k+1;
for (i = 2; i < (10000-1); i ++)
{
a[i] = k * i; /* S1 */
b[i] = a[i-2] + k; /* S2 */
c[i] = b[i] + a[i+1]; /* S3 */
d[i] = c[i-1] + k + i; /* S4 */
}
/*
Dependences:
S1 -> S2 (flow, level 1)
S1 -> S3 (anti, level 1)
S2 -> S3 (flow, level 0)
S3 -> S4 (flow, level 1)
There are three partitions: {S1, S3}, {S2} and {S4}.
The cost model should fuse together all the partitions, as they
are reusing the same data, ending on a single partition.
*/
return a[10000-2] + b[10000-1] + c[10000-2] + d[10000-2];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 3 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
unsigned int j;
int a[100], b[100][100];
a[0] = k;
for (i = 1; i < 100; i ++)
{
for (j = 0; j < 100; j++)
{
a[j] = k * i;
b[i][j] = a[j-1] + k;
}
}
return b[100-1][0];
}
/* We used to distribute also innermost loops, but these could produce
too much code in the outer loop, degrading performance of scalar
code. So this test is XFAILed because the cost model of the stand
alone distribution pass has evolved. */
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
unsigned int j;
int a[100][100], b[100][100], c[100][100], d[100][100];
a[0][0] = k;
for (i = 1; i < 100; i ++)
for (j = 1; j < (100-1); j++)
{
a[i][j] = k * i; /* S1 */
b[i][j] = a[i][j-1] + k; /* S2 */
c[i][j] = b[i][j] + a[i][j+1]; /* S3 */
d[i][j] = c[i][j] + k + i; /* S4 */
}
/* Dependences:
S1->S2 (flow, level 2)
S1->S3 (anti, level 2)
S2->S3 (flow, level 0)
S3->S4 (flow, level 0)
*/
return a[100-1][100-1] + b[100-1][100-1] + c[100-1][100-1] + d[100-1][100-1];
}
/* FIXME: This is XFAILed because of a data dependence analysis
problem: the dependence test fails with a "don't know" relation. */
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" { xfail *-*-* } } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
int a[1000], b[1000], c[1000], d[1000];
for (i = 2; i < (1000-1); i ++) {
a[i] = k * i; /* S1 */
b[i] = a[i-2] + k; /* S2 */
c[i] = b[i-1] + a[i+1]; /* S3 */
d[i] = c[i-1] + k + i; /* S4 */
}
/* Dependences:
S1->S2 (flow, level 1)
S2->S3 (flow, level 1)
S3->S1 (anti, level 1)
S3->S4 (flow, level 1)
There are two partitions: {S1, S2, S3} and {S4}.
{S1, S2, S3} have to be in the same partition because:
- S1 (i) has to be executed before S2 (i+2), as S1 produces a[i] that is then consumed 2 iterations later by S2.
- S2 (i) has to be executed before S3 (i+1), as S2 produces b[i] that is then consumed one iteration later by S3,
- S3 (i) has to be executed before S1 (i+1), as a[i+1] has to execute before the update to a[i],
{S4} is the consumer partition: it consumes the values from array "c" produced in S3.
The cost model should fuse all the tasks together as the cost of
fetching data from caches is too high.
*/
return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i, z;
int a[1000], b[1000], c[1000], d[1000];
for (i = 2; i < (1000-1); i ++) {
z = a[i+1]; /* S1 */
a[i] = k * i; /* S2 */
b[i] = a[i-2] + k; /* S3 */
c[i] = b[i-1] + z; /* S4 */
d[i] = c[i-1] + b[i+1] + k + i; /* S5 */
}
/* Dependences:
S1->S2 (anti, level 1)
S1->S4 (flow, level 1, scalar)
S2->S3 (flow, level 1)
S3->S4 (flow, level 1)
S4->S5 (flow, level 1)
S5->S3 (anti, level 1)
There is a single partition: {S1, S2, S3, S4, S5}, because of the
scalar dependence z between the two partitions {S1, S2} and {S3, S4, S5}.
*/
return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2];
}
/* { dg-final { scan-tree-dump-times "distributed" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
int a[1000], b[1000], c[1000], d[1000];
for (i = 2; i < (1000-1); i ++)
{
a[i] = k * i; /* S1 */
b[i] = a[i+1] + k; /* S2 */
c[i] = a[i-1] + b[i-1] + d[i-1]; /* S3 */
d[i] = a[i-1] + b[i+1] + k + i; /* S4 */
}
/* Dependences:
S1->S2 (anti, level 1)
S1->S3 (flow, level 1)
S1->S4 (flow, level 1)
S2->S3 (flow, level 1)
S2->S4 (anti, level 1)
S4->S3 (flow, level 1)
Two partitions: {S1, S2, S4} produce information that is consumed in {S3}.
So that means that the current cost model will also fuse these
two partitions into a single one for avoiding cache misses.
*/
return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" } */
int loop1 (int k)
{
unsigned int i;
int a[1000], b[1000];
for (i = 1; i < (1000-1); i ++) {
a[i] = a[i+1] + a[i-1]; /* S1 */
b[i] = a[i-1] + k; /* S2 */
}
/*
Dependences:
S1->S2 (flow, level 1)
S1->S1 (anti, level 1)
S1->S1 (flow, level 1)
One partition, because of the cost of cache misses.
*/
return a[1000-2] + b[1000-1];
}
/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 0 "ldist" } } */
/* { dg-final { cleanup-tree-dump "ldist" } } */
! { dg-do compile }
! { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-all" }
Subroutine PADEC(DKS,DKDS,HVAR,WM,WG,FN,NS,AN,BN,CN,IT)
IMPLICIT REAL*8 (A-H, O-Z)
DIMENSION DKS(*),DKDS(*),HVAR(*)
COMPLEX*16 WM(*),WG(*),FN(*),AN(*),BN(*),CN(*)
COMPLEX*16 H2,CONST
COMMON/STRCH/ALP,BET,DH,ZH,UG,VG,T1,T2,DT,TOL,ALPHA ,HAMP,BUMP
Parameter (F1 = .8333333333333333D0, F2 = .0833333333333333D0)
SS=DT/(2.0D0)
do J=2,NS
BS=SS*DKS(J)*HVAR(J)*HVAR(J)
AN(J)=F1+2.*BS
BN(J)=F2-BS
CN(J)=F2-BS
H2=WM(J+1)
if(J.EQ.NS) then
CONST=CN(J)*H2
else
CONST=(0.D0,0.D0)
endif
FN(J)=(BS+F2)*(H2)+(F1-2.D0*BS)-CONST
end do
return
end Subroutine PADEC
! { dg-final { scan-tree-dump-times "distributed: split to 4 loops" 1 "ldist" } }
! { dg-final { cleanup-tree-dump "ldist" } }
...@@ -123,6 +123,7 @@ DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling") ...@@ -123,6 +123,7 @@ DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling")
DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear") DEFTIMEVAR (TV_TREE_LINEAR_TRANSFORM , "tree loop linear")
DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution")
DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences")
DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching")
DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization")
......
...@@ -5646,22 +5646,30 @@ move_stmt_r (tree *tp, int *walk_subtrees, void *data) ...@@ -5646,22 +5646,30 @@ move_stmt_r (tree *tp, int *walk_subtrees, void *data)
/* Marks virtual operands of all statements in basic blocks BBS for /* Marks virtual operands of all statements in basic blocks BBS for
renaming. */ renaming. */
static void void
mark_virtual_ops_in_region (VEC (basic_block,heap) *bbs) mark_virtual_ops_in_bb (basic_block bb)
{ {
tree phi; tree phi;
block_stmt_iterator bsi; block_stmt_iterator bsi;
for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
mark_virtual_ops_for_renaming (phi);
for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
mark_virtual_ops_for_renaming (bsi_stmt (bsi));
}
/* Marks virtual operands of all statements in basic blocks BBS for
renaming. */
static void
mark_virtual_ops_in_region (VEC (basic_block,heap) *bbs)
{
basic_block bb; basic_block bb;
unsigned i; unsigned i;
for (i = 0; VEC_iterate (basic_block, bbs, i, bb); i++) for (i = 0; VEC_iterate (basic_block, bbs, i, bb); i++)
{ mark_virtual_ops_in_bb (bb);
for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
mark_virtual_ops_for_renaming (phi);
for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
mark_virtual_ops_for_renaming (bsi_stmt (bsi));
}
} }
/* Move basic block BB from function CFUN to function DEST_FN. The /* Move basic block BB from function CFUN to function DEST_FN. The
......
...@@ -88,7 +88,6 @@ along with GCC; see the file COPYING3. If not see ...@@ -88,7 +88,6 @@ along with GCC; see the file COPYING3. If not see
#include "tree-dump.h" #include "tree-dump.h"
#include "timevar.h" #include "timevar.h"
#include "cfgloop.h" #include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-data-ref.h" #include "tree-data-ref.h"
#include "tree-scalar-evolution.h" #include "tree-scalar-evolution.h"
#include "tree-pass.h" #include "tree-pass.h"
...@@ -157,6 +156,14 @@ dump_data_references (FILE *file, VEC (data_reference_p, heap) *datarefs) ...@@ -157,6 +156,14 @@ dump_data_references (FILE *file, VEC (data_reference_p, heap) *datarefs)
dump_data_reference (file, dr); dump_data_reference (file, dr);
} }
/* Dump to STDERR all the dependence relations from DDRS. */
void
debug_data_dependence_relations (VEC (ddr_p, heap) *ddrs)
{
dump_data_dependence_relations (stderr, ddrs);
}
/* Dump into FILE all the dependence relations from DDRS. */ /* Dump into FILE all the dependence relations from DDRS. */
void void
...@@ -354,6 +361,10 @@ dump_data_dependence_relation (FILE *outf, ...@@ -354,6 +361,10 @@ dump_data_dependence_relation (FILE *outf,
dra = DDR_A (ddr); dra = DDR_A (ddr);
drb = DDR_B (ddr); drb = DDR_B (ddr);
fprintf (outf, "(Data Dep: \n"); fprintf (outf, "(Data Dep: \n");
dump_data_reference (outf, dra);
dump_data_reference (outf, drb);
if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
fprintf (outf, " (don't know)\n"); fprintf (outf, " (don't know)\n");
...@@ -808,7 +819,7 @@ dr_address_invariant_p (struct data_reference *dr) ...@@ -808,7 +819,7 @@ dr_address_invariant_p (struct data_reference *dr)
/* Frees data reference DR. */ /* Frees data reference DR. */
static void void
free_data_ref (data_reference_p dr) free_data_ref (data_reference_p dr)
{ {
BITMAP_FREE (DR_VOPS (dr)); BITMAP_FREE (DR_VOPS (dr));
...@@ -2787,22 +2798,6 @@ build_classic_dist_vector_1 (struct data_dependence_relation *ddr, ...@@ -2787,22 +2798,6 @@ build_classic_dist_vector_1 (struct data_dependence_relation *ddr,
return true; return true;
} }
/* Return true when the DDR contains two data references that have the
same access functions. */
static bool
same_access_functions (const struct data_dependence_relation *ddr)
{
unsigned i;
for (i = 0; i < DDR_NUM_SUBSCRIPTS (ddr); i++)
if (!eq_evolutions_p (DR_ACCESS_FN (DDR_A (ddr), i),
DR_ACCESS_FN (DDR_B (ddr), i)))
return false;
return true;
}
/* Return true when the DDR contains only constant access functions. */ /* Return true when the DDR contains only constant access functions. */
static bool static bool
...@@ -4371,48 +4366,219 @@ free_data_refs (VEC (data_reference_p, heap) *datarefs) ...@@ -4371,48 +4366,219 @@ free_data_refs (VEC (data_reference_p, heap) *datarefs)
/* Returns the index of STMT in RDG. */ /* Dump vertex I in RDG to FILE. */
static int void
find_vertex_for_stmt (const struct graph *rdg, const_tree stmt) dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
struct vertex *v = &(rdg->vertices[i]);
struct graph_edge *e;
fprintf (file, "(vertex %d: (%s%s) (in:", i,
RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
if (v->pred)
for (e = v->pred; e; e = e->pred_next)
fprintf (file, " %d", e->src);
fprintf (file, ") (out:");
if (v->succ)
for (e = v->succ; e; e = e->succ_next)
fprintf (file, " %d", e->dest);
fprintf (file, ") \n");
print_generic_stmt (file, RDGV_STMT (v), TDF_VOPS|TDF_MEMSYMS);
fprintf (file, ")\n");
}
/* Call dump_rdg_vertex on stderr. */
void
debug_rdg_vertex (struct graph *rdg, int i)
{
dump_rdg_vertex (stderr, rdg, i);
}
/* Dump component C of RDG to FILE. If DUMPED is non-null, set the
dumped vertices to that bitmap. */
void dump_rdg_component (FILE *file, struct graph *rdg, int c, bitmap dumped)
{
int i;
fprintf (file, "(%d\n", c);
for (i = 0; i < rdg->n_vertices; i++)
if (rdg->vertices[i].component == c)
{
if (dumped)
bitmap_set_bit (dumped, i);
dump_rdg_vertex (file, rdg, i);
}
fprintf (file, ")\n");
}
/* Call dump_rdg_vertex on stderr. */
void
debug_rdg_component (struct graph *rdg, int c)
{
dump_rdg_component (stderr, rdg, c, NULL);
}
/* Dump the reduced dependence graph RDG to FILE. */
void
dump_rdg (FILE *file, struct graph *rdg)
{ {
int i; int i;
bitmap dumped = BITMAP_ALLOC (NULL);
fprintf (file, "(rdg\n");
for (i = 0; i < rdg->n_vertices; i++) for (i = 0; i < rdg->n_vertices; i++)
if (RDGV_STMT (&(rdg->vertices[i])) == stmt) if (!bitmap_bit_p (dumped, i))
return i; dump_rdg_component (file, rdg, rdg->vertices[i].component, dumped);
gcc_unreachable (); fprintf (file, ")\n");
return 0; BITMAP_FREE (dumped);
} }
/* Creates an edge in RDG for each distance vector from DDR. */ /* Call dump_rdg on stderr. */
void
debug_rdg (struct graph *rdg)
{
dump_rdg (stderr, rdg);
}
static void static void
create_rdg_edge_for_ddr (struct graph *rdg, ddr_p ddr) dot_rdg_1 (FILE *file, struct graph *rdg)
{ {
int va, vb; int i;
data_reference_p dra;
data_reference_p drb; fprintf (file, "digraph RDG {\n");
struct graph_edge *e;
if (DDR_REVERSED_P (ddr)) for (i = 0; i < rdg->n_vertices; i++)
{ {
dra = DDR_B (ddr); struct vertex *v = &(rdg->vertices[i]);
drb = DDR_A (ddr); struct graph_edge *e;
/* Highlight reads from memory. */
if (RDG_MEM_READS_STMT (rdg, i))
fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
/* Highlight stores to memory. */
if (RDG_MEM_WRITE_STMT (rdg, i))
fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
if (v->succ)
for (e = v->succ; e; e = e->succ_next)
switch (RDGE_TYPE (e))
{
case input_dd:
fprintf (file, "%d -> %d [label=input] \n", i, e->dest);
break;
case output_dd:
fprintf (file, "%d -> %d [label=output] \n", i, e->dest);
break;
case flow_dd:
/* These are the most common dependences: don't print these. */
fprintf (file, "%d -> %d \n", i, e->dest);
break;
case anti_dd:
fprintf (file, "%d -> %d [label=anti] \n", i, e->dest);
break;
default:
gcc_unreachable ();
}
} }
else
fprintf (file, "}\n\n");
}
/* Display SCOP using dotty. */
void
dot_rdg (struct graph *rdg)
{
FILE *file = fopen ("/tmp/rdg.dot", "w");
gcc_assert (file != NULL);
dot_rdg_1 (file, rdg);
fclose (file);
system ("dotty /tmp/rdg.dot");
}
/* This structure is used for recording the mapping statement index in
the RDG. */
struct rdg_vertex_info GTY(())
{
tree stmt;
int index;
};
/* Returns the index of STMT in RDG. */
int
rdg_vertex_for_stmt (struct graph *rdg, tree stmt)
{
struct rdg_vertex_info rvi, *slot;
rvi.stmt = stmt;
slot = (struct rdg_vertex_info *) htab_find (rdg->indices, &rvi);
if (!slot)
return -1;
return slot->index;
}
/* Creates an edge in RDG for each distance vector from DDR. The
order that we keep track of in the RDG is the order in which
statements have to be executed. */
static void
create_rdg_edge_for_ddr (struct graph *rdg, ddr_p ddr)
{
struct graph_edge *e;
int va, vb;
data_reference_p dra = DDR_A (ddr);
data_reference_p drb = DDR_B (ddr);
unsigned level = ddr_dependence_level (ddr);
/* For non scalar dependences, when the dependence is REVERSED,
statement B has to be executed before statement A. */
if (level > 0
&& !DDR_REVERSED_P (ddr))
{ {
dra = DDR_A (ddr); data_reference_p tmp = dra;
drb = DDR_B (ddr); dra = drb;
drb = tmp;
} }
va = find_vertex_for_stmt (rdg, DR_STMT (dra)); va = rdg_vertex_for_stmt (rdg, DR_STMT (dra));
vb = find_vertex_for_stmt (rdg, DR_STMT (drb)); vb = rdg_vertex_for_stmt (rdg, DR_STMT (drb));
if (va < 0 || vb < 0)
return;
e = add_edge (rdg, va, vb); e = add_edge (rdg, va, vb);
e->data = XNEW (struct rdg_edge); e->data = XNEW (struct rdg_edge);
RDGE_LEVEL (e) = level;
/* Determines the type of the data dependence. */ /* Determines the type of the data dependence. */
if (DR_IS_READ (dra) && DR_IS_READ (drb)) if (DR_IS_READ (dra) && DR_IS_READ (drb))
RDGE_TYPE (e) = input_dd; RDGE_TYPE (e) = input_dd;
...@@ -4435,9 +4601,13 @@ create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef) ...@@ -4435,9 +4601,13 @@ create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def) FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
{ {
int use = find_vertex_for_stmt (rdg, USE_STMT (imm_use_p)); struct graph_edge *e;
struct graph_edge *e = add_edge (rdg, idef, use); int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
if (use < 0)
continue;
e = add_edge (rdg, idef, use);
e->data = XNEW (struct rdg_edge); e->data = XNEW (struct rdg_edge);
RDGE_TYPE (e) = flow_dd; RDGE_TYPE (e) = flow_dd;
} }
...@@ -4458,8 +4628,8 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs) ...@@ -4458,8 +4628,8 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs)
create_rdg_edge_for_ddr (rdg, ddr); create_rdg_edge_for_ddr (rdg, ddr);
for (i = 0; i < rdg->n_vertices; i++) for (i = 0; i < rdg->n_vertices; i++)
FOR_EACH_PHI_OR_STMT_DEF (def_p, RDGV_STMT (&(rdg->vertices[i])), FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
iter, SSA_OP_ALL_DEFS) iter, SSA_OP_DEF)
create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i); create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
} }
...@@ -4468,19 +4638,50 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs) ...@@ -4468,19 +4638,50 @@ create_rdg_edges (struct graph *rdg, VEC (ddr_p, heap) *ddrs)
static void static void
create_rdg_vertices (struct graph *rdg, VEC (tree, heap) *stmts) create_rdg_vertices (struct graph *rdg, VEC (tree, heap) *stmts)
{ {
int i; int i, j;
tree s; tree stmt;
for (i = 0; VEC_iterate (tree, stmts, i, s); i++) for (i = 0; VEC_iterate (tree, stmts, i, stmt); i++)
{ {
VEC (data_ref_loc, heap) *references;
data_ref_loc *ref;
struct vertex *v = &(rdg->vertices[i]); struct vertex *v = &(rdg->vertices[i]);
struct rdg_vertex_info *rvi = XNEW (struct rdg_vertex_info);
struct rdg_vertex_info **slot;
rvi->stmt = stmt;
rvi->index = i;
slot = (struct rdg_vertex_info **) htab_find_slot (rdg->indices, rvi, INSERT);
if (!*slot)
*slot = rvi;
else
free (rvi);
v->data = XNEW (struct rdg_vertex); v->data = XNEW (struct rdg_vertex);
RDGV_STMT (v) = s; RDG_STMT (rdg, i) = stmt;
RDG_MEM_WRITE_STMT (rdg, i) = false;
RDG_MEM_READS_STMT (rdg, i) = false;
if (TREE_CODE (stmt) == PHI_NODE)
continue;
get_references_in_stmt (stmt, &references);
for (j = 0; VEC_iterate (data_ref_loc, references, j, ref); j++)
if (!ref->is_read)
RDG_MEM_WRITE_STMT (rdg, i) = true;
else
RDG_MEM_READS_STMT (rdg, i) = true;
VEC_free (data_ref_loc, heap, references);
} }
} }
/* Initialize STMTS with all the statements and PHI nodes of LOOP. */ /* Initialize STMTS with all the statements of LOOP. When
INCLUDE_PHIS is true, include also the PHI nodes. The order in
which we discover statements is important as
generate_loops_for_partition is using the same traversal for
identifying statements. */
static void static void
stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts)
...@@ -4490,7 +4691,7 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) ...@@ -4490,7 +4691,7 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts)
for (i = 0; i < loop->num_nodes; i++) for (i = 0; i < loop->num_nodes; i++)
{ {
tree phi; tree phi, stmt;
basic_block bb = bbs[i]; basic_block bb = bbs[i];
block_stmt_iterator bsi; block_stmt_iterator bsi;
...@@ -4498,7 +4699,8 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts) ...@@ -4498,7 +4699,8 @@ stmts_from_loop (struct loop *loop, VEC (tree, heap) **stmts)
VEC_safe_push (tree, heap, *stmts, phi); VEC_safe_push (tree, heap, *stmts, phi);
for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi)) for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
VEC_safe_push (tree, heap, *stmts, bsi_stmt (bsi)); if (TREE_CODE (stmt = bsi_stmt (bsi)) != LABEL_EXPR)
VEC_safe_push (tree, heap, *stmts, stmt);
} }
free (bbs); free (bbs);
...@@ -4519,8 +4721,39 @@ known_dependences_p (VEC (ddr_p, heap) *dependence_relations) ...@@ -4519,8 +4721,39 @@ known_dependences_p (VEC (ddr_p, heap) *dependence_relations)
return true; return true;
} }
/* Build a Reduced Dependence Graph with one vertex per statement of the /* Computes a hash function for element ELT. */
loop nest and one edge per data dependence or scalar dependence. */
static hashval_t
hash_stmt_vertex_info (const void *elt)
{
struct rdg_vertex_info *rvi = (struct rdg_vertex_info *) elt;
tree stmt = rvi->stmt;
return htab_hash_pointer (stmt);
}
/* Compares database elements E1 and E2. */
static int
eq_stmt_vertex_info (const void *e1, const void *e2)
{
const struct rdg_vertex_info *elt1 = (const struct rdg_vertex_info *) e1;
const struct rdg_vertex_info *elt2 = (const struct rdg_vertex_info *) e2;
return elt1->stmt == elt2->stmt;
}
/* Free the element E. */
static void
hash_stmt_vertex_del (void *e)
{
free (e);
}
/* Build the Reduced Dependence Graph (RDG) with one vertex per
statement of the loop nest, and one edge per data dependence or
scalar dependence. */
struct graph * struct graph *
build_rdg (struct loop *loop) build_rdg (struct loop *loop)
...@@ -4529,7 +4762,7 @@ build_rdg (struct loop *loop) ...@@ -4529,7 +4762,7 @@ build_rdg (struct loop *loop)
struct graph *rdg = NULL; struct graph *rdg = NULL;
VEC (ddr_p, heap) *dependence_relations; VEC (ddr_p, heap) *dependence_relations;
VEC (data_reference_p, heap) *datarefs; VEC (data_reference_p, heap) *datarefs;
VEC (tree, heap) *stmts = VEC_alloc (tree, heap, 10); VEC (tree, heap) *stmts = VEC_alloc (tree, heap, nb_data_refs);
dependence_relations = VEC_alloc (ddr_p, heap, nb_data_refs * nb_data_refs) ; dependence_relations = VEC_alloc (ddr_p, heap, nb_data_refs * nb_data_refs) ;
datarefs = VEC_alloc (data_reference_p, heap, nb_data_refs); datarefs = VEC_alloc (data_reference_p, heap, nb_data_refs);
...@@ -4537,12 +4770,15 @@ build_rdg (struct loop *loop) ...@@ -4537,12 +4770,15 @@ build_rdg (struct loop *loop)
false, false,
&datarefs, &datarefs,
&dependence_relations); &dependence_relations);
if (!known_dependences_p (dependence_relations)) if (!known_dependences_p (dependence_relations))
goto end_rdg; goto end_rdg;
stmts_from_loop (loop, &stmts); stmts_from_loop (loop, &stmts);
rdg = new_graph (VEC_length (tree, stmts)); rdg = new_graph (VEC_length (tree, stmts));
rdg->indices = htab_create (nb_data_refs, hash_stmt_vertex_info,
eq_stmt_vertex_info, hash_stmt_vertex_del);
create_rdg_vertices (rdg, stmts); create_rdg_vertices (rdg, stmts);
create_rdg_edges (rdg, dependence_relations); create_rdg_edges (rdg, dependence_relations);
...@@ -4553,3 +4789,197 @@ build_rdg (struct loop *loop) ...@@ -4553,3 +4789,197 @@ build_rdg (struct loop *loop)
return rdg; return rdg;
} }
/* Free the reduced dependence graph RDG. */
void
free_rdg (struct graph *rdg)
{
int i;
for (i = 0; i < rdg->n_vertices; i++)
free (rdg->vertices[i].data);
htab_delete (rdg->indices);
free_graph (rdg);
}
/* Initialize STMTS with all the statements of LOOP that contain a
store to memory. */
void
stores_from_loop (struct loop *loop, VEC (tree, heap) **stmts)
{
unsigned int i;
basic_block *bbs = get_loop_body_in_dom_order (loop);
for (i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
block_stmt_iterator bsi;
for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
if (!ZERO_SSA_OPERANDS (bsi_stmt (bsi), SSA_OP_VDEF))
VEC_safe_push (tree, heap, *stmts, bsi_stmt (bsi));
}
free (bbs);
}
/* For a data reference REF, return the declaration of its base
address or NULL_TREE if the base is not determined. */
static inline tree
ref_base_address (tree stmt, data_ref_loc *ref)
{
tree base = NULL_TREE;
tree base_address;
struct data_reference *dr = XCNEW (struct data_reference);
DR_STMT (dr) = stmt;
DR_REF (dr) = *ref->pos;
dr_analyze_innermost (dr);
base_address = DR_BASE_ADDRESS (dr);
if (!base_address)
goto end;
switch (TREE_CODE (base_address))
{
case ADDR_EXPR:
base = TREE_OPERAND (base_address, 0);
break;
default:
base = base_address;
break;
}
end:
free_data_ref (dr);
return base;
}
/* Determines whether the statement from vertex V of the RDG has a
definition used outside the loop that contains this statement. */
bool
rdg_defs_used_in_other_loops_p (struct graph *rdg, int v)
{
tree stmt = RDG_STMT (rdg, v);
struct loop *loop = loop_containing_stmt (stmt);
use_operand_p imm_use_p;
imm_use_iterator iterator;
ssa_op_iter it;
def_operand_p def_p;
if (!loop)
return true;
FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, it, SSA_OP_DEF)
{
FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, DEF_FROM_PTR (def_p))
{
if (loop_containing_stmt (USE_STMT (imm_use_p)) != loop)
return true;
}
}
return false;
}
/* Determines whether statements S1 and S2 access to similar memory
locations. Two memory accesses are considered similar when they
have the same base address declaration, i.e. when their
ref_base_address is the same. */
bool
have_similar_memory_accesses (tree s1, tree s2)
{
bool res = false;
unsigned i, j;
VEC (data_ref_loc, heap) *refs1, *refs2;
data_ref_loc *ref1, *ref2;
get_references_in_stmt (s1, &refs1);
get_references_in_stmt (s2, &refs2);
for (i = 0; VEC_iterate (data_ref_loc, refs1, i, ref1); i++)
{
tree base1 = ref_base_address (s1, ref1);
if (base1)
for (j = 0; VEC_iterate (data_ref_loc, refs2, j, ref2); j++)
if (base1 == ref_base_address (s2, ref2))
{
res = true;
goto end;
}
}
end:
VEC_free (data_ref_loc, heap, refs1);
VEC_free (data_ref_loc, heap, refs2);
return res;
}
/* Helper function for the hashtab. */
static int
have_similar_memory_accesses_1 (const void *s1, const void *s2)
{
return have_similar_memory_accesses ((tree) s1, (tree) s2);
}
/* Helper function for the hashtab. */
static hashval_t
ref_base_address_1 (const void *s)
{
tree stmt = (tree) s;
unsigned i;
VEC (data_ref_loc, heap) *refs;
data_ref_loc *ref;
hashval_t res = 0;
get_references_in_stmt (stmt, &refs);
for (i = 0; VEC_iterate (data_ref_loc, refs, i, ref); i++)
if (!ref->is_read)
{
res = htab_hash_pointer (ref_base_address (stmt, ref));
break;
}
VEC_free (data_ref_loc, heap, refs);
return res;
}
/* Try to remove duplicated write data references from STMTS. */
void
remove_similar_memory_refs (VEC (tree, heap) **stmts)
{
unsigned i;
tree stmt;
htab_t seen = htab_create (VEC_length (tree, *stmts), ref_base_address_1,
have_similar_memory_accesses_1, NULL);
for (i = 0; VEC_iterate (tree, *stmts, i, stmt); )
{
void **slot;
slot = htab_find_slot (seen, stmt, INSERT);
if (*slot)
VEC_ordered_remove (tree, *stmts, i);
else
{
*slot = (void *) stmt;
i++;
}
}
htab_delete (seen);
}
...@@ -24,6 +24,7 @@ along with GCC; see the file COPYING3. If not see ...@@ -24,6 +24,7 @@ along with GCC; see the file COPYING3. If not see
#include "graphds.h" #include "graphds.h"
#include "lambda.h" #include "lambda.h"
#include "omega.h" #include "omega.h"
#include "tree-chrec.h"
/* /*
innermost_loop_behavior describes the evolution of the address of the memory innermost_loop_behavior describes the evolution of the address of the memory
...@@ -38,6 +39,7 @@ along with GCC; see the file COPYING3. If not see ...@@ -38,6 +39,7 @@ along with GCC; see the file COPYING3. If not see
Example 1 Example 2 Example 1 Example 2
data-ref a[j].b[i][j] *(p + x + 16B + 4B * j) data-ref a[j].b[i][j] *(p + x + 16B + 4B * j)
innermost_loop_behavior innermost_loop_behavior
base_address &a p base_address &a p
offset i * D_i x offset i * D_i x
...@@ -319,26 +321,107 @@ extern void debug_data_dependence_relation (struct data_dependence_relation *); ...@@ -319,26 +321,107 @@ extern void debug_data_dependence_relation (struct data_dependence_relation *);
extern void dump_data_dependence_relation (FILE *, extern void dump_data_dependence_relation (FILE *,
struct data_dependence_relation *); struct data_dependence_relation *);
extern void dump_data_dependence_relations (FILE *, VEC (ddr_p, heap) *); extern void dump_data_dependence_relations (FILE *, VEC (ddr_p, heap) *);
extern void debug_data_dependence_relations (VEC (ddr_p, heap) *);
extern void dump_data_dependence_direction (FILE *, extern void dump_data_dependence_direction (FILE *,
enum data_dependence_direction); enum data_dependence_direction);
extern void free_dependence_relation (struct data_dependence_relation *); extern void free_dependence_relation (struct data_dependence_relation *);
extern void free_dependence_relations (VEC (ddr_p, heap) *); extern void free_dependence_relations (VEC (ddr_p, heap) *);
extern void free_data_ref (data_reference_p);
extern void free_data_refs (VEC (data_reference_p, heap) *); extern void free_data_refs (VEC (data_reference_p, heap) *);
struct data_reference *create_data_ref (struct loop *, tree, tree, bool); struct data_reference *create_data_ref (struct loop *, tree, tree, bool);
bool find_loop_nest (struct loop *, VEC (loop_p, heap) **); bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
void compute_all_dependences (VEC (data_reference_p, heap) *, void compute_all_dependences (VEC (data_reference_p, heap) *,
VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool); VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool);
/* Return true when the DDR contains two data references that have the
same access functions. */
static inline bool
same_access_functions (const struct data_dependence_relation *ddr)
{
unsigned i;
for (i = 0; i < DDR_NUM_SUBSCRIPTS (ddr); i++)
if (!eq_evolutions_p (DR_ACCESS_FN (DDR_A (ddr), i),
DR_ACCESS_FN (DDR_B (ddr), i)))
return false;
return true;
}
/* Return true when DDR is an anti-dependence relation. */
static inline bool
ddr_is_anti_dependent (ddr_p ddr)
{
return (DDR_ARE_DEPENDENT (ddr) == NULL_TREE
&& DR_IS_READ (DDR_A (ddr))
&& !DR_IS_READ (DDR_B (ddr))
&& !same_access_functions (ddr));
}
/* Return true when DEPENDENCE_RELATIONS contains an anti-dependence. */
static inline bool
ddrs_have_anti_deps (VEC (ddr_p, heap) *dependence_relations)
{
unsigned i;
ddr_p ddr;
for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++)
if (ddr_is_anti_dependent (ddr))
return true;
return false;
}
/* Return the dependence level for the DDR relation. */
static inline unsigned
ddr_dependence_level (ddr_p ddr)
{
unsigned vector;
unsigned level = 0;
if (DDR_DIST_VECTS (ddr))
level = dependence_level (DDR_DIST_VECT (ddr, 0), DDR_NB_LOOPS (ddr));
for (vector = 1; vector < DDR_NUM_DIST_VECTS (ddr); vector++)
level = MIN (level, dependence_level (DDR_DIST_VECT (ddr, vector),
DDR_NB_LOOPS (ddr)));
return level;
}
/* A RDG vertex representing a statement. */ /* A Reduced Dependence Graph (RDG) vertex representing a statement. */
typedef struct rdg_vertex typedef struct rdg_vertex
{ {
/* The statement represented by this vertex. */ /* The statement represented by this vertex. */
tree stmt; tree stmt;
/* True when the statement contains a write to memory. */
bool has_mem_write;
/* True when the statement contains a read from memory. */
bool has_mem_reads;
} *rdg_vertex_p; } *rdg_vertex_p;
#define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt #define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
void dump_rdg_vertex (FILE *, struct graph *, int);
void debug_rdg_vertex (struct graph *, int);
void dump_rdg_component (FILE *, struct graph *, int, bitmap);
void debug_rdg_component (struct graph *, int);
void dump_rdg (FILE *, struct graph *);
void debug_rdg (struct graph *);
void dot_rdg (struct graph *);
int rdg_vertex_for_stmt (struct graph *, tree);
/* Data dependence type. */ /* Data dependence type. */
...@@ -363,11 +446,17 @@ typedef struct rdg_edge ...@@ -363,11 +446,17 @@ typedef struct rdg_edge
{ {
/* Type of the dependence. */ /* Type of the dependence. */
enum rdg_dep_type type; enum rdg_dep_type type;
/* Levels of the dependence: the depth of the loops that
carry the dependence. */
unsigned level;
} *rdg_edge_p; } *rdg_edge_p;
#define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type #define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type
#define RDGE_LEVEL(E) ((struct rdg_edge *) ((E)->data))->level
struct graph *build_rdg (struct loop *); struct graph *build_rdg (struct loop *);
void free_rdg (struct graph *);
/* Return the index of the variable VAR in the LOOP_NEST array. */ /* Return the index of the variable VAR in the LOOP_NEST array. */
...@@ -385,6 +474,21 @@ index_in_loop_nest (int var, VEC (loop_p, heap) *loop_nest) ...@@ -385,6 +474,21 @@ index_in_loop_nest (int var, VEC (loop_p, heap) *loop_nest)
return var_index; return var_index;
} }
void stores_from_loop (struct loop *, VEC (tree, heap) **);
void remove_similar_memory_refs (VEC (tree, heap) **);
bool rdg_defs_used_in_other_loops_p (struct graph *, int);
bool have_similar_memory_accesses (tree, tree);
/* Determines whether RDG vertices V1 and V2 access to similar memory
locations, in which case they have to be in the same partition. */
static inline bool
rdg_has_similar_memory_accesses (struct graph *rdg, int v1, int v2)
{
return have_similar_memory_accesses (RDG_STMT (rdg, v1),
RDG_STMT (rdg, v2));
}
/* In lambda-code.c */ /* In lambda-code.c */
bool lambda_transform_legal_p (lambda_trans_matrix, int, VEC (ddr_p, heap) *); bool lambda_transform_legal_p (lambda_trans_matrix, int, VEC (ddr_p, heap) *);
......
...@@ -792,6 +792,7 @@ extern void end_recording_case_labels (void); ...@@ -792,6 +792,7 @@ extern void end_recording_case_labels (void);
extern basic_block move_sese_region_to_fn (struct function *, basic_block, extern basic_block move_sese_region_to_fn (struct function *, basic_block,
basic_block); basic_block);
void remove_edge_and_dominated_blocks (edge); void remove_edge_and_dominated_blocks (edge);
void mark_virtual_ops_in_bb (basic_block);
/* In tree-cfgcleanup.c */ /* In tree-cfgcleanup.c */
extern bitmap cfgcleanup_altered_bbs; extern bitmap cfgcleanup_altered_bbs;
...@@ -1022,6 +1023,8 @@ bool tree_duplicate_loop_to_header_edge (struct loop *, edge, ...@@ -1022,6 +1023,8 @@ bool tree_duplicate_loop_to_header_edge (struct loop *, edge,
unsigned int, sbitmap, unsigned int, sbitmap,
edge, VEC (edge, heap) **, edge, VEC (edge, heap) **,
int); int);
struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge);
void rename_variables_in_loop (struct loop *);
struct loop *tree_ssa_loop_version (struct loop *, tree, struct loop *tree_ssa_loop_version (struct loop *, tree,
basic_block *); basic_block *);
tree expand_simple_operations (tree); tree expand_simple_operations (tree);
......
/* Loop distribution.
Copyright (C) 2006, 2007 Free Software Foundation, Inc.
Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
and Sebastian Pop <sebastian.pop@amd.com>.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.
GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
/* This pass performs loop distribution: for example, the loop
|DO I = 2, N
| A(I) = B(I) + C
| D(I) = A(I-1)*E
|ENDDO
is transformed to
|DOALL I = 2, N
| A(I) = B(I) + C
|ENDDO
|
|DOALL I = 2, N
| D(I) = A(I-1)*E
|ENDDO
This pass uses an RDG, Reduced Dependence Graph built on top of the
data dependence relations. The RDG is then topologically sorted to
obtain a map of information producers/consumers based on which it
generates the new loops. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"
#include "lambda.h"
#include "langhooks.h"
#include "tree-vectorizer.h"
/* If bit I is not set, it means that this node represents an
operation that has already been performed, and that should not be
performed again. This is the subgraph of remaining important
computations that is passed to the DFS algorithm for avoiding to
include several times the same stores in different loops. */
static bitmap remaining_stmts;
/* A node of the RDG is marked in this bitmap when it has as a
predecessor a node that writes to memory. */
static bitmap upstream_mem_writes;
/* Update the PHI nodes of NEW_LOOP. NEW_LOOP is a duplicate of
ORIG_LOOP. */
static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
tree new_ssa_name;
tree phi_new, phi_orig;
edge orig_loop_latch = loop_latch_edge (orig_loop);
edge orig_entry_e = loop_preheader_edge (orig_loop);
edge new_loop_entry_e = loop_preheader_edge (new_loop);
/* Scan the phis in the headers of the old and new loops
(they are organized in exactly the same order). */
for (phi_new = phi_nodes (new_loop->header),
phi_orig = phi_nodes (orig_loop->header);
phi_new && phi_orig;
phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
{
/* Add the first phi argument for the phi in NEW_LOOP (the one
associated with the entry of NEW_LOOP) */
tree def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
add_phi_arg (phi_new, def, new_loop_entry_e);
/* Add the second phi argument for the phi in NEW_LOOP (the one
associated with the latch of NEW_LOOP) */
def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
if (TREE_CODE (def) == SSA_NAME)
{
new_ssa_name = get_current_def (def);
if (!new_ssa_name)
/* This only happens if there are no definitions inside the
loop. Use the phi_result in this case. */
new_ssa_name = PHI_RESULT (phi_new);
}
else
/* Could be an integer. */
new_ssa_name = def;
add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
}
}
/* Return a copy of LOOP placed before LOOP. */
static struct loop *
copy_loop_before (struct loop *loop)
{
struct loop *res;
edge preheader = loop_preheader_edge (loop);
if (!single_exit (loop))
return NULL;
initialize_original_copy_tables ();
res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
free_original_copy_tables ();
if (!res)
return NULL;
update_phis_for_loop_copy (loop, res);
rename_variables_in_loop (res);
return res;
}
/* Creates an empty basic block after LOOP. */
static void
create_bb_after_loop (struct loop *loop)
{
edge exit = single_exit (loop);
if (!exit)
return;
split_edge (exit);
}
/* Generate code for PARTITION from the code in LOOP. The loop is
copied when COPY_P is true. All the statements not flagged in the
PARTITION bitmap are removed from the loop or from its copy. The
statements are indexed in sequence inside a basic block, and the
basic blocks of a loop are taken in dom order. Returns true when
the code gen succeeded. */
static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
unsigned i, x;
block_stmt_iterator bsi;
basic_block *bbs;
if (copy_p)
{
loop = copy_loop_before (loop);
create_preheader (loop, CP_SIMPLE_PREHEADERS);
create_bb_after_loop (loop);
}
if (loop == NULL)
return false;
/* Remove stmts not in the PARTITION bitmap. The order in which we
visit the phi nodes and the statements is exactly as in
stmts_from_loop. */
bbs = get_loop_body_in_dom_order (loop);
for (x = 0, i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
tree phi, prev = NULL_TREE, next;
for (phi = phi_nodes (bb); phi;)
if (!bitmap_bit_p (partition, x++))
{
next = PHI_CHAIN (phi);
remove_phi_node (phi, prev, true);
phi = next;
}
else
{
prev = phi;
phi = PHI_CHAIN (phi);
}
for (bsi = bsi_start (bb); !bsi_end_p (bsi);)
if (TREE_CODE (bsi_stmt (bsi)) != LABEL_EXPR
&& !bitmap_bit_p (partition, x++))
bsi_remove (&bsi, false);
else
bsi_next (&bsi);
mark_virtual_ops_in_bb (bb);
}
free (bbs);
return true;
}
/* Generate a call to memset. Return true when the operation succeeded. */
static bool
generate_memset_zero (tree stmt, tree op0, tree nb_iter,
block_stmt_iterator bsi)
{
tree s, t, stmts, nb_bytes, addr_base;
bool res = false;
tree stmt_list = NULL_TREE;
tree args [3];
tree fn_call, mem, fndecl, fntype, fn;
tree_stmt_iterator i;
ssa_op_iter iter;
struct data_reference *dr = XCNEW (struct data_reference);
nb_bytes = fold_build2 (MULT_EXPR, TREE_TYPE (nb_iter),
nb_iter, TYPE_SIZE_UNIT (TREE_TYPE (op0)));
nb_bytes = force_gimple_operand (nb_bytes, &stmts, true, NULL);
append_to_statement_list_force (stmts, &stmt_list);
DR_STMT (dr) = stmt;
DR_REF (dr) = op0;
dr_analyze_innermost (dr);
/* Test for a positive stride, iterating over every element. */
if (integer_zerop (fold_build2 (MINUS_EXPR, integer_type_node, DR_STEP (dr),
TYPE_SIZE_UNIT (TREE_TYPE (op0)))))
addr_base = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_BASE_ADDRESS (dr)),
DR_BASE_ADDRESS (dr),
size_binop (PLUS_EXPR,
DR_OFFSET (dr), DR_INIT (dr)));
/* Test for a negative stride, iterating over every element. */
else if (integer_zerop (fold_build2 (PLUS_EXPR, integer_type_node,
TYPE_SIZE_UNIT (TREE_TYPE (op0)),
DR_STEP (dr))))
{
addr_base = size_binop (PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
addr_base = fold_build2 (MINUS_EXPR, sizetype, addr_base, nb_bytes);
addr_base = force_gimple_operand (addr_base, &stmts, true, NULL);
append_to_statement_list_force (stmts, &stmt_list);
addr_base = fold_build2 (POINTER_PLUS_EXPR,
TREE_TYPE (DR_BASE_ADDRESS (dr)),
DR_BASE_ADDRESS (dr), addr_base);
}
else
goto end;
mem = force_gimple_operand (addr_base, &stmts, true, NULL);
append_to_statement_list_force (stmts, &stmt_list);
fndecl = implicit_built_in_decls [BUILT_IN_MEMSET];
fntype = TREE_TYPE (fndecl);
fn = build1 (ADDR_EXPR, build_pointer_type (fntype), fndecl);
args[0] = mem;
args[1] = integer_zero_node;
args[2] = nb_bytes;
fn_call = build_call_array (fntype, fn, 3, args);
append_to_statement_list_force (fn_call, &stmt_list);
for (i = tsi_start (stmt_list); !tsi_end_p (i); tsi_next (&i))
{
s = tsi_stmt (i);
update_stmt_if_modified (s);
FOR_EACH_SSA_TREE_OPERAND (t, s, iter, SSA_OP_VIRTUAL_DEFS)
{
if (TREE_CODE (t) == SSA_NAME)
t = SSA_NAME_VAR (t);
mark_sym_for_renaming (t);
}
}
/* Mark also the uses of the VDEFS of STMT to be renamed. */
FOR_EACH_SSA_TREE_OPERAND (t, stmt, iter, SSA_OP_VIRTUAL_DEFS)
{
if (TREE_CODE (t) == SSA_NAME)
{
imm_use_iterator imm_iter;
FOR_EACH_IMM_USE_STMT (s, imm_iter, t)
update_stmt (s);
t = SSA_NAME_VAR (t);
}
mark_sym_for_renaming (t);
}
bsi_insert_after (&bsi, stmt_list, BSI_CONTINUE_LINKING);
res = true;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "generated memset zero\n");
end:
free_data_ref (dr);
return res;
}
/* Tries to generate a builtin function for the instructions of LOOP
pointed to by the bits set in PARTITION. Returns true when the
operation succeeded. */
static bool
generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
{
bool res = false;
unsigned i, x = 0;
basic_block *bbs;
tree write = NULL_TREE;
tree op0, op1;
block_stmt_iterator bsi;
tree nb_iter = number_of_exit_cond_executions (loop);
if (!nb_iter || nb_iter == chrec_dont_know)
return false;
bbs = get_loop_body_in_dom_order (loop);
for (i = 0; i < loop->num_nodes; i++)
{
basic_block bb = bbs[i];
tree phi;
for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
x++;
for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
{
tree stmt = bsi_stmt (bsi);
if (bitmap_bit_p (partition, x++)
&& TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
&& !is_gimple_reg (GIMPLE_STMT_OPERAND (stmt, 0)))
{
/* Don't generate the builtins when there are more than
one memory write. */
if (write != NULL)
goto end;
write = stmt;
}
}
}
if (!write)
goto end;
op0 = GIMPLE_STMT_OPERAND (write, 0);
op1 = GIMPLE_STMT_OPERAND (write, 1);
if (!(TREE_CODE (op0) == ARRAY_REF
|| TREE_CODE (op0) == INDIRECT_REF))
goto end;
/* The new statements will be placed before LOOP. */
bsi = bsi_last (loop_preheader_edge (loop)->src);
if (integer_zerop (op1) || real_zerop (op1))
res = generate_memset_zero (write, op0, nb_iter, bsi);
/* If this is the last partition for which we generate code, we have
to destroy the loop. */
if (res && !copy_p)
{
unsigned nbbs = loop->num_nodes;
basic_block src = loop_preheader_edge (loop)->src;
basic_block dest = single_exit (loop)->dest;
make_edge (src, dest, EDGE_FALLTHRU);
set_immediate_dominator (CDI_DOMINATORS, dest, src);
cancel_loop_tree (loop);
for (i = 0; i < nbbs; i++)
delete_basic_block (bbs[i]);
}
end:
free (bbs);
return res;
}
/* Generates code for PARTITION. For simple loops, this function can
generate a built-in. */
static bool
generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
if (generate_builtin (loop, partition, copy_p))
return true;
return generate_loops_for_partition (loop, partition, copy_p);
}
/* Returns true if the node V of RDG cannot be recomputed. */
static bool
rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
{
if (RDG_MEM_WRITE_STMT (rdg, v))
return true;
return false;
}
/* Returns true when the vertex V has already been generated in the
current partition (V is in PROCESSED), or when V belongs to another
partition and cannot be recomputed (V is not in REMAINING_STMTS). */
static inline bool
already_processed_vertex_p (bitmap processed, int v)
{
return (bitmap_bit_p (processed, v)
|| !bitmap_bit_p (remaining_stmts, v));
}
/* Returns NULL when there is no anti-dependence among the successors
of vertex V, otherwise returns the edge with the anti-dep. */
static struct graph_edge *
has_anti_dependence (struct vertex *v)
{
struct graph_edge *e;
if (v->succ)
for (e = v->succ; e; e = e->succ_next)
if (RDGE_TYPE (e) == anti_dd)
return e;
return NULL;
}
/* Returns true when V has an anti-dependence edge among its successors. */
static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
struct graph_edge *e;
if (v->pred)
for (e = v->pred; e; e = e->pred_next)
if (bitmap_bit_p (upstream_mem_writes, e->src)
/* Don't consider flow channels: a write to memory followed
by a read from memory. These channels allow the split of
the RDG in different partitions. */
&& !RDG_MEM_WRITE_STMT (rdg, e->src))
return true;
return false;
}
/* Initializes the upstream_mem_writes bitmap following the
information from RDG. */
static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
int v, x;
bitmap seen = BITMAP_ALLOC (NULL);
for (v = rdg->n_vertices - 1; v >= 0; v--)
if (!bitmap_bit_p (seen, v))
{
unsigned i;
VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
bool has_upstream_mem_write_p = false;
graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
for (i = 0; VEC_iterate (int, nodes, i, x); i++)
{
if (bitmap_bit_p (seen, x))
continue;
bitmap_set_bit (seen, x);
if (RDG_MEM_WRITE_STMT (rdg, x)
|| predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
/* In anti dependences the read should occur before
the write, this is why both the read and the write
should be placed in the same partition. */
|| has_anti_dependence (&(rdg->vertices[x])))
{
has_upstream_mem_write_p = true;
bitmap_set_bit (upstream_mem_writes, x);
}
}
VEC_free (int, heap, nodes);
}
}
/* Returns true when vertex u has a memory write node as a predecessor
in RDG. */
static bool
has_upstream_mem_writes (int u)
{
return bitmap_bit_p (upstream_mem_writes, u);
}
static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
bitmap, bool *);
/* Flag all the uses of U. */
static void
rdg_flag_all_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
bitmap processed, bool *part_has_writes)
{
struct graph_edge *e;
for (e = rdg->vertices[u].succ; e; e = e->succ_next)
if (!bitmap_bit_p (processed, e->dest))
{
rdg_flag_vertex_and_dependent (rdg, e->dest, partition, loops,
processed, part_has_writes);
rdg_flag_all_uses (rdg, e->dest, partition, loops, processed,
part_has_writes);
}
}
/* Flag the uses of U stopping following the information from
upstream_mem_writes. */
static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
bitmap processed, bool *part_has_writes)
{
ssa_op_iter iter;
use_operand_p use_p;
struct vertex *x = &(rdg->vertices[u]);
tree stmt = RDGV_STMT (x);
struct graph_edge *anti_dep = has_anti_dependence (x);
/* Keep in the same partition the destination of an antidependence,
because this is a store to the exact same location. Putting this
in another partition is bad for cache locality. */
if (anti_dep)
{
int v = anti_dep->dest;
if (!already_processed_vertex_p (processed, v))
rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
processed, part_has_writes);
}
if (TREE_CODE (stmt) != PHI_NODE)
{
FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_VIRTUAL_USES)
{
tree use = USE_FROM_PTR (use_p);
if (TREE_CODE (use) == SSA_NAME)
{
tree def_stmt = SSA_NAME_DEF_STMT (use);
int v = rdg_vertex_for_stmt (rdg, def_stmt);
if (v >= 0
&& !already_processed_vertex_p (processed, v))
rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
processed, part_has_writes);
}
}
}
if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
&& has_upstream_mem_writes (u))
{
tree op0 = GIMPLE_STMT_OPERAND (stmt, 0);
/* Scalar channels don't have enough space for transmitting data
between tasks, unless we add more storage by privatizing. */
if (is_gimple_reg (op0))
{
use_operand_p use_p;
imm_use_iterator iter;
FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
{
int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));
if (!already_processed_vertex_p (processed, v))
rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
processed, part_has_writes);
}
}
}
}
/* Flag V from RDG as part of PARTITION, and also flag its loop number
in LOOPS. */
static void
rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
bool *part_has_writes)
{
struct loop *loop;
if (bitmap_bit_p (partition, v))
return;
loop = loop_containing_stmt (RDG_STMT (rdg, v));
bitmap_set_bit (loops, loop->num);
bitmap_set_bit (partition, v);
if (rdg_cannot_recompute_vertex_p (rdg, v))
{
*part_has_writes = true;
bitmap_clear_bit (remaining_stmts, v);
}
}
/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
Alse flag their loop number in LOOPS. */
static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
bitmap loops, bitmap processed,
bool *part_has_writes)
{
unsigned i;
VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
int x;
bitmap_set_bit (processed, v);
rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);
for (i = 0; VEC_iterate (int, nodes, i, x); i++)
if (!already_processed_vertex_p (processed, x))
rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
part_has_writes);
VEC_free (int, heap, nodes);
}
/* Initialize CONDS with all the condition statements from the basic
blocks of LOOP. */
static void
collect_condition_stmts (struct loop *loop, VEC (tree, heap) **conds)
{
unsigned i;
edge e;
VEC (edge, heap) *exits = get_loop_exit_edges (loop);
for (i = 0; VEC_iterate (edge, exits, i, e); i++)
{
tree cond = last_stmt (e->src);
if (cond)
VEC_safe_push (tree, heap, *conds, cond);
}
VEC_free (edge, heap, exits);
}
/* Add to PARTITION all the exit condition statements for LOOPS
together with all their dependent statements determined from
RDG. */
static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
bitmap processed, bool *part_has_writes)
{
unsigned i;
bitmap_iterator bi;
VEC (tree, heap) *conds = VEC_alloc (tree, heap, 3);
EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
collect_condition_stmts (get_loop (i), &conds);
while (!VEC_empty (tree, conds))
{
tree cond = VEC_pop (tree, conds);
int v = rdg_vertex_for_stmt (rdg, cond);
bitmap new_loops = BITMAP_ALLOC (NULL);
if (!already_processed_vertex_p (processed, v))
rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
part_has_writes);
EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
if (!bitmap_bit_p (loops, i))
{
bitmap_set_bit (loops, i);
collect_condition_stmts (get_loop (i), &conds);
}
BITMAP_FREE (new_loops);
}
}
/* Strongly connected components of the reduced data dependence graph. */
typedef struct rdg_component
{
int num;
VEC (int, heap) *vertices;
} *rdgc;
DEF_VEC_P (rdgc);
DEF_VEC_ALLOC_P (rdgc, heap);
/* Flag all the nodes of RDG containing memory accesses that could
potentially belong to arrays already accessed in the current
PARTITION. */
static void
rdg_flag_similar_memory_accesses (struct graph *rdg, bitmap partition,
bitmap loops, bitmap processed,
VEC (int, heap) **other_stores)
{
bool foo;
unsigned i, n;
int j, k, kk;
bitmap_iterator ii;
struct graph_edge *e;
EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
if (RDG_MEM_WRITE_STMT (rdg, i)
|| RDG_MEM_READS_STMT (rdg, i))
{
for (j = 0; j < rdg->n_vertices; j++)
if (!bitmap_bit_p (processed, j)
&& (RDG_MEM_WRITE_STMT (rdg, j)
|| RDG_MEM_READS_STMT (rdg, j))
&& rdg_has_similar_memory_accesses (rdg, i, j))
{
/* Flag first the node J itself, and all the nodes that
are needed to compute J. */
rdg_flag_vertex_and_dependent (rdg, j, partition, loops,
processed, &foo);
/* When J is a read, we want to coalesce in the same
PARTITION all the nodes that are using J: this is
needed for better cache locality. */
rdg_flag_all_uses (rdg, j, partition, loops, processed, &foo);
/* Remove from OTHER_STORES the vertex that we flagged. */
if (RDG_MEM_WRITE_STMT (rdg, j))
for (k = 0; VEC_iterate (int, *other_stores, k, kk); k++)
if (kk == j)
{
VEC_unordered_remove (int, *other_stores, k);
break;
}
}
/* If the node I has two uses, then keep these together in the
same PARTITION. */
for (n = 0, e = rdg->vertices[i].succ; e; e = e->succ_next, n++);
if (n > 1)
rdg_flag_all_uses (rdg, i, partition, loops, processed, &foo);
}
}
/* Returns a bitmap in which all the statements needed for computing
the strongly connected component C of the RDG are flagged, also
including the loop exit conditions. */
static bitmap
build_rdg_partition_for_component (struct graph *rdg, rdgc c,
bool *part_has_writes,
VEC (int, heap) **other_stores)
{
int i, v;
bitmap partition = BITMAP_ALLOC (NULL);
bitmap loops = BITMAP_ALLOC (NULL);
bitmap processed = BITMAP_ALLOC (NULL);
for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
if (!already_processed_vertex_p (processed, v))
rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
part_has_writes);
/* Also iterate on the array of stores not in the starting vertices,
and determine those vertices that have some memory affinity with
the current nodes in the component: these are stores to the same
arrays, i.e. we're taking care of cache locality. */
rdg_flag_similar_memory_accesses (rdg, partition, loops, processed,
other_stores);
rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
BITMAP_FREE (processed);
BITMAP_FREE (loops);
return partition;
}
/* Free memory for COMPONENTS. */
static void
free_rdg_components (VEC (rdgc, heap) *components)
{
int i;
rdgc x;
for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
{
VEC_free (int, heap, x->vertices);
free (x);
}
}
/* Build the COMPONENTS vector with the strongly connected components
of RDG in which the STARTING_VERTICES occur. */
static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
VEC (rdgc, heap) **components)
{
int i, v;
bitmap saved_components = BITMAP_ALLOC (NULL);
int n_components = graphds_scc (rdg, NULL);
VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);
for (i = 0; i < n_components; i++)
all_components[i] = VEC_alloc (int, heap, 3);
for (i = 0; i < rdg->n_vertices; i++)
VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);
for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
{
int c = rdg->vertices[v].component;
if (!bitmap_bit_p (saved_components, c))
{
rdgc x = XCNEW (struct rdg_component);
x->num = c;
x->vertices = all_components[c];
VEC_safe_push (rdgc, heap, *components, x);
bitmap_set_bit (saved_components, c);
}
}
for (i = 0; i < n_components; i++)
if (!bitmap_bit_p (saved_components, i))
VEC_free (int, heap, all_components[i]);
free (all_components);
BITMAP_FREE (saved_components);
}
DEF_VEC_P (bitmap);
DEF_VEC_ALLOC_P (bitmap, heap);
/* Aggregate several components into a useful partition that is
registered in the PARTITIONS vector. Partitions will be
distributed in different loops. */
static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
VEC (int, heap) **other_stores,
VEC (bitmap, heap) **partitions, bitmap processed)
{
int i;
rdgc x;
bitmap partition = BITMAP_ALLOC (NULL);
for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
{
bitmap np;
bool part_has_writes = false;
int v = VEC_index (int, x->vertices, 0);
if (bitmap_bit_p (processed, v))
continue;
np = build_rdg_partition_for_component (rdg, x, &part_has_writes,
other_stores);
bitmap_ior_into (partition, np);
bitmap_ior_into (processed, np);
BITMAP_FREE (np);
if (part_has_writes)
{
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "ldist useful partition:\n");
dump_bitmap (dump_file, partition);
}
VEC_safe_push (bitmap, heap, *partitions, partition);
partition = BITMAP_ALLOC (NULL);
}
}
/* Add the nodes from the RDG that were not marked as processed, and
that are used outside the current loop. These are scalar
computations that are not yet part of previous partitions. */
for (i = 0; i < rdg->n_vertices; i++)
if (!bitmap_bit_p (processed, i)
&& rdg_defs_used_in_other_loops_p (rdg, i))
VEC_safe_push (int, heap, *other_stores, i);
/* If there are still statements left in the OTHER_STORES array,
create other components and partitions with these stores and
their dependences. */
if (VEC_length (int, *other_stores) > 0)
{
VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
VEC (int, heap) *foo = VEC_alloc (int, heap, 3);
rdg_build_components (rdg, *other_stores, &comps);
rdg_build_partitions (rdg, comps, &foo, partitions, processed);
VEC_free (int, heap, foo);
free_rdg_components (comps);
}
/* If there is something left in the last partition, save it. */
if (bitmap_count_bits (partition) > 0)
VEC_safe_push (bitmap, heap, *partitions, partition);
else
BITMAP_FREE (partition);
}
/* Dump to FILE the PARTITIONS. */
static void
dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
{
int i;
bitmap partition;
for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
debug_bitmap_file (file, partition);
}
/* Debug PARTITIONS. */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);
void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
dump_rdg_partitions (stderr, partitions);
}
/* Generate code from STARTING_VERTICES in RDG. Returns the number of
distributed loops. */
static int
ldist_gen (struct loop *loop, struct graph *rdg,
VEC (int, heap) *starting_vertices)
{
int i, nbp;
VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
bitmap partition, processed = BITMAP_ALLOC (NULL);
remaining_stmts = BITMAP_ALLOC (NULL);
upstream_mem_writes = BITMAP_ALLOC (NULL);
for (i = 0; i < rdg->n_vertices; i++)
{
bitmap_set_bit (remaining_stmts, i);
/* Save in OTHER_STORES all the memory writes that are not in
STARTING_VERTICES. */
if (RDG_MEM_WRITE_STMT (rdg, i))
{
int v;
unsigned j;
bool found = false;
for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
if (i == v)
{
found = true;
break;
}
if (!found)
VEC_safe_push (int, heap, other_stores, i);
}
}
mark_nodes_having_upstream_mem_writes (rdg);
rdg_build_components (rdg, starting_vertices, &components);
rdg_build_partitions (rdg, components, &other_stores, &partitions,
processed);
BITMAP_FREE (processed);
nbp = VEC_length (bitmap, partitions);
if (nbp <= 1)
goto ldist_done;
if (dump_file && (dump_flags & TDF_DETAILS))
dump_rdg_partitions (dump_file, partitions);
for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
if (!generate_code_for_partition (loop, partition, i < nbp - 1))
goto ldist_done;
rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);
ldist_done:
BITMAP_FREE (remaining_stmts);
BITMAP_FREE (upstream_mem_writes);
for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
BITMAP_FREE (partition);
VEC_free (int, heap, other_stores);
VEC_free (bitmap, heap, partitions);
free_rdg_components (components);
return nbp;
}
/* Distributes the code from LOOP in such a way that producer
statements are placed before consumer statements. When STMTS is
NULL, performs the maximal distribution, if STMTS is not NULL,
tries to separate only these statements from the LOOP's body.
Returns the number of distributed loops. */
static int
distribute_loop (struct loop *loop, VEC (tree, heap) *stmts)
{
bool res = false;
struct graph *rdg;
tree s;
unsigned i;
VEC (int, heap) *vertices;
if (loop->num_nodes > 2)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
loop->num);
return res;
}
rdg = build_rdg (loop);
if (!rdg)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"FIXME: Loop %d not distributed: failed to build the RDG.\n",
loop->num);
return res;
}
vertices = VEC_alloc (int, heap, 3);
if (dump_file && (dump_flags & TDF_DETAILS))
dump_rdg (dump_file, rdg);
for (i = 0; VEC_iterate (tree, stmts, i, s); i++)
{
int v = rdg_vertex_for_stmt (rdg, s);
if (v >= 0)
{
VEC_safe_push (int, heap, vertices, v);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"ldist asked to generate code for vertex %d\n", v);
}
}
res = ldist_gen (loop, rdg, vertices);
VEC_free (int, heap, vertices);
free_rdg (rdg);
return res;
}
/* Distribute all loops in the current function. */
static unsigned int
tree_loop_distribution (void)
{
struct loop *loop;
loop_iterator li;
int nb_generated_loops = 0;
FOR_EACH_LOOP (li, loop, 0)
{
VEC (tree, heap) *work_list = VEC_alloc (tree, heap, 3);
/* With the following working list, we're asking distribute_loop
to separate the stores of the loop: when dependences allow,
it will end on having one store per loop. */
stores_from_loop (loop, &work_list);
/* A simple heuristic for cache locality is to not split stores
to the same array. Without this call, an unrolled loop would
be split into as many loops as unroll factor, each loop
storing in the same array. */
remove_similar_memory_refs (&work_list);
nb_generated_loops = distribute_loop (loop, work_list);
if (dump_file && (dump_flags & TDF_DETAILS))
{
if (nb_generated_loops > 1)
fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
loop->num, nb_generated_loops);
else
fprintf (dump_file, "Loop %d is the same.\n", loop->num);
}
verify_loop_structure ();
VEC_free (tree, heap, work_list);
}
return 0;
}
static bool
gate_tree_loop_distribution (void)
{
return flag_tree_loop_distribution != 0;
}
struct tree_opt_pass pass_loop_distribution =
{
"ldist", /* name */
gate_tree_loop_distribution, /* gate */
tree_loop_distribution, /* execute */
NULL, /* sub */
NULL, /* next */
0, /* static_pass_number */
TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
PROP_cfg | PROP_ssa, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_dump_func | TODO_verify_loops, /* todo_flags_finish */
0 /* letter */
};
...@@ -265,6 +265,7 @@ extern struct tree_opt_pass pass_scev_cprop; ...@@ -265,6 +265,7 @@ extern struct tree_opt_pass pass_scev_cprop;
extern struct tree_opt_pass pass_empty_loop; extern struct tree_opt_pass pass_empty_loop;
extern struct tree_opt_pass pass_record_bounds; extern struct tree_opt_pass pass_record_bounds;
extern struct tree_opt_pass pass_if_conversion; extern struct tree_opt_pass pass_if_conversion;
extern struct tree_opt_pass pass_loop_distribution;
extern struct tree_opt_pass pass_vectorize; extern struct tree_opt_pass pass_vectorize;
extern struct tree_opt_pass pass_complete_unroll; extern struct tree_opt_pass pass_complete_unroll;
extern struct tree_opt_pass pass_parallelize_loops; extern struct tree_opt_pass pass_parallelize_loops;
......
...@@ -41,28 +41,7 @@ along with GCC; see the file COPYING3. If not see ...@@ -41,28 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "toplev.h" #include "toplev.h"
#include "recog.h" #include "recog.h"
/* Main analysis functions. */
static bool vect_analyze_data_refs (loop_vec_info);
static bool vect_mark_stmts_to_be_vectorized (loop_vec_info);
static void vect_analyze_scalar_cycles (loop_vec_info);
static bool vect_analyze_data_ref_accesses (loop_vec_info);
static bool vect_analyze_data_ref_dependences (loop_vec_info);
static bool vect_analyze_data_refs_alignment (loop_vec_info);
static bool vect_compute_data_refs_alignment (loop_vec_info);
static bool vect_enhance_data_refs_alignment (loop_vec_info);
static bool vect_analyze_operations (loop_vec_info);
static bool vect_determine_vectorization_factor (loop_vec_info);
/* Utility functions for the analyses. */
static bool exist_non_indexing_operands_for_use_p (tree, tree);
static tree vect_get_loop_niters (struct loop *, tree *);
static bool vect_analyze_data_ref_dependence
(struct data_dependence_relation *, loop_vec_info);
static bool vect_compute_data_ref_alignment (struct data_reference *);
static bool vect_analyze_data_ref_access (struct data_reference *);
static bool vect_can_advance_ivs_p (loop_vec_info); static bool vect_can_advance_ivs_p (loop_vec_info);
static void vect_update_misalignment_for_peel
(struct data_reference *, struct data_reference *, int npeel);
/* Function vect_determine_vectorization_factor /* Function vect_determine_vectorization_factor
......
...@@ -147,24 +147,8 @@ along with GCC; see the file COPYING3. If not see ...@@ -147,24 +147,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-pass.h" #include "tree-pass.h"
/************************************************************************* /*************************************************************************
Simple Loop Peeling Utilities
*************************************************************************/
static void slpeel_update_phis_for_duplicate_loop
(struct loop *, struct loop *, bool after);
static void slpeel_update_phi_nodes_for_guard1
(edge, struct loop *, bool, basic_block *, bitmap *);
static void slpeel_update_phi_nodes_for_guard2
(edge, struct loop *, bool, basic_block *);
static edge slpeel_add_loop_guard (basic_block, tree, basic_block, basic_block);
static void rename_use_op (use_operand_p);
static void rename_variables_in_bb (basic_block);
static void rename_variables_in_loop (struct loop *);
/*************************************************************************
General Vectorization Utilities General Vectorization Utilities
*************************************************************************/ *************************************************************************/
static void vect_set_dump_settings (void);
/* vect_dump will be set to stderr or dump_file if exist. */ /* vect_dump will be set to stderr or dump_file if exist. */
FILE *vect_dump; FILE *vect_dump;
...@@ -241,7 +225,7 @@ rename_variables_in_bb (basic_block bb) ...@@ -241,7 +225,7 @@ rename_variables_in_bb (basic_block bb)
/* Renames variables in new generated LOOP. */ /* Renames variables in new generated LOOP. */
static void void
rename_variables_in_loop (struct loop *loop) rename_variables_in_loop (struct loop *loop)
{ {
unsigned i; unsigned i;
...@@ -806,7 +790,7 @@ slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters) ...@@ -806,7 +790,7 @@ slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
/* Given LOOP this function generates a new copy of it and puts it /* Given LOOP this function generates a new copy of it and puts it
on E which is either the entry or exit of LOOP. */ on E which is either the entry or exit of LOOP. */
static struct loop * struct loop *
slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
{ {
struct loop *new_loop; struct loop *new_loop;
...@@ -871,6 +855,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) ...@@ -871,6 +855,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
if (at_exit) /* Add the loop copy at exit. */ if (at_exit) /* Add the loop copy at exit. */
{ {
redirect_edge_and_branch_force (e, new_loop->header); redirect_edge_and_branch_force (e, new_loop->header);
PENDING_STMT (e) = NULL;
set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src); set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
if (was_imm_dom) if (was_imm_dom)
set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
...@@ -888,6 +873,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) ...@@ -888,6 +873,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
new_exit_e = EDGE_SUCC (new_loop->header, 1); new_exit_e = EDGE_SUCC (new_loop->header, 1);
redirect_edge_and_branch_force (new_exit_e, loop->header); redirect_edge_and_branch_force (new_exit_e, loop->header);
PENDING_STMT (new_exit_e) = NULL;
set_immediate_dominator (CDI_DOMINATORS, loop->header, set_immediate_dominator (CDI_DOMINATORS, loop->header,
new_exit_e->src); new_exit_e->src);
...@@ -901,6 +887,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) ...@@ -901,6 +887,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
} }
redirect_edge_and_branch_force (entry_e, new_loop->header); redirect_edge_and_branch_force (entry_e, new_loop->header);
PENDING_STMT (entry_e) = NULL;
set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader); set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
} }
......
...@@ -630,6 +630,7 @@ extern struct loop *slpeel_tree_peel_loop_to_edge ...@@ -630,6 +630,7 @@ extern struct loop *slpeel_tree_peel_loop_to_edge
(struct loop *, edge, tree, tree, bool, unsigned int, bool); (struct loop *, edge, tree, tree, bool, unsigned int, bool);
extern void set_prologue_iterations (basic_block, tree, extern void set_prologue_iterations (basic_block, tree,
struct loop *, unsigned int); struct loop *, unsigned int);
struct loop *tree_duplicate_loop_on_edge (struct loop *, edge);
extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree);
extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
#ifdef ENABLE_CHECKING #ifdef ENABLE_CHECKING
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment