Commit 911b3fdb authored by Zdenek Dvorak, committed by Zdenek Dvorak

tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve comments.

	* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
	comments.
	(issue_prefetch_ref): Move assignment to write_p out of loop.
	(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
	SIMULTANEOUS_PREFETCHES into account.
	(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
	* lambda-code.c (lcm): Renamed to ...
	(least_common_multiple): ... and exported.
	* tree-flow.h (least_common_multiple): Declare.

From-SVN: r118730
parent 946e1bc7
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
comments.
(issue_prefetch_ref): Move assignment to write_p out of loop.
(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
SIMULTANEOUS_PREFETCHES into account.
(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
* lambda-code.c (lcm): Renamed to ...
(least_common_multiple): ... and exported.
* tree-flow.h (least_common_multiple): Declare.
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* Makefile.in (tree-data-ref.o): Add langhooks.h dependency.
* tree-ssa-loop-niter.c (derive_constant_upper_bound): Follow
ud-chains. Handle AND_EXPR.
......
...@@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest) ...@@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest)
/* Compute the least common multiple of two numbers A and B . */ /* Compute the least common multiple of two numbers A and B . */
static int int
lcm (int a, int b) least_common_multiple (int a, int b)
{ {
return (abs (a) * abs (b) / gcd (a, b)); return (abs (a) * abs (b) / gcd (a, b));
} }
...@@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size, ...@@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size,
{ {
if (A[k][i] < 0) if (A[k][i] < 0)
{ {
multiple = lcm (A[j][i], A[k][i]); multiple = least_common_multiple (A[j][i], A[k][i]);
f1 = multiple / A[j][i]; f1 = multiple / A[j][i];
f2 = -1 * multiple / A[k][i]; f2 = -1 * multiple / A[k][i];
......
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* gcc.dg/tree-ssa/prefetch-3.c: New test.
2006-11-12 Roger Sayle <roger@eyesopen.com>
PR tree-optimization/13827
/* Prefetching used to prefer nonsensical unroll factor of 5 in this testcase. */
/* { dg-do compile { target i?86-*-* } } */
/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse -fdump-tree-aprefetch-details" } */
#define N 1000000

/* Source operand array; static storage, so it starts zero-filled.  */
double a[N];

/* Accumulate the products of adjacent element pairs a[idx] * a[idx+1],
   stepping two elements per iteration so every element feeds exactly
   one product.  The loop shape is what the aprefetch pass analyzes, so
   it is kept structurally identical to the original.  */
double test(void)
{
unsigned idx;
double acc = 0;
for (idx = 0; idx < N; idx += 2)
acc = acc + a[idx] * a[idx + 1];
return acc;
}
/* { dg-final { scan-tree-dump-times "unroll factor 4" 1 "aprefetch" } } */
/* { dg-final { cleanup-tree-dump "aprefetch" } } */
...@@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *); ...@@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *);
extern void recalculate_used_alone (void); extern void recalculate_used_alone (void);
extern bool updating_used_alone; extern bool updating_used_alone;
int least_common_multiple (int, int);
#endif /* _TREE_FLOW_H */ #endif /* _TREE_FLOW_H */
...@@ -744,19 +744,21 @@ static bool ...@@ -744,19 +744,21 @@ static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor, schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
unsigned ahead) unsigned ahead)
{ {
unsigned max_prefetches, n_prefetches; unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
unsigned slots_per_prefetch;
struct mem_ref *ref; struct mem_ref *ref;
bool any = false; bool any = false;
max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead; /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES) remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
max_prefetches = SIMULTANEOUS_PREFETCHES;
/* The prefetch will run for AHEAD iterations of the original loop, i.e.,
AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
it will need a prefetch slot. */
slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
if (dump_file && (dump_flags & TDF_DETAILS)) if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches); fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
slots_per_prefetch);
if (!max_prefetches)
return false;
/* For now we just take memory references one by one and issue /* For now we just take memory references one by one and issue
prefetches for as many as possible. The groups are sorted prefetches for as many as possible. The groups are sorted
...@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor, ...@@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
if (!should_issue_prefetch_p (ref)) if (!should_issue_prefetch_p (ref))
continue; continue;
ref->issue_prefetch_p = true; /* If we need to prefetch the reference each PREFETCH_MOD iterations,
and we unroll the loop UNROLL_FACTOR times, we need to insert
/* If prefetch_mod is less then unroll_factor, we need to insert ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
several prefetches for the reference. */ iteration. */
n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
/ ref->prefetch_mod); / ref->prefetch_mod);
if (max_prefetches <= n_prefetches) prefetch_slots = n_prefetches * slots_per_prefetch;
return true;
/* If more than half of the prefetches would be lost anyway, do not
issue the prefetch. */
if (2 * remaining_prefetch_slots < prefetch_slots)
continue;
ref->issue_prefetch_p = true;
max_prefetches -= n_prefetches; if (remaining_prefetch_slots <= prefetch_slots)
return true;
remaining_prefetch_slots -= prefetch_slots;
any = true; any = true;
} }
...@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) ...@@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/ ref->prefetch_mod); / ref->prefetch_mod);
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL); addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
write_p = ref->write_p ? integer_one_node : integer_zero_node;
for (ap = 0; ap < n_prefetches; ap++) for (ap = 0; ap < n_prefetches; ap++)
{ {
...@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) ...@@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL); addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
/* Create the prefetch instruction. */ /* Create the prefetch instruction. */
write_p = ref->write_p ? integer_one_node : integer_zero_node;
params = tree_cons (NULL_TREE, addr, params = tree_cons (NULL_TREE, addr,
tree_cons (NULL_TREE, write_p, NULL_TREE)); tree_cons (NULL_TREE, write_p, NULL_TREE));
prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH], prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
params); params);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT); bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
...@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc, ...@@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
static unsigned static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs, determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
unsigned ahead, unsigned ninsns, unsigned ninsns, struct tree_niter_desc *desc)
struct tree_niter_desc *desc)
{ {
unsigned upper_bound, size_factor, constraint_factor; unsigned upper_bound;
unsigned factor, max_mod_constraint, ahead_factor; unsigned nfactor, factor, mod_constraint;
struct mem_ref_group *agp; struct mem_ref_group *agp;
struct mem_ref *ref; struct mem_ref *ref;
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); /* First check whether the loop is not too large to unroll. We ignore
PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
/* First check whether the loop is not too large to unroll. */ from unrolling them enough to make exactly one cache line covered by each
size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns; iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
if (size_factor <= 1) us from unrolling the loops too many times in cases where we only expect
gains from better scheduling and decreasing loop overhead, which is not
the case here. */
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
if (upper_bound <= 1)
return 1; return 1;
if (size_factor < upper_bound) /* Choose the factor so that we may prefetch each cache just once,
upper_bound = size_factor; but bound the unrolling by UPPER_BOUND. */
factor = 1;
max_mod_constraint = 1;
for (agp = refs; agp; agp = agp->next) for (agp = refs; agp; agp = agp->next)
for (ref = agp->refs; ref; ref = ref->next) for (ref = agp->refs; ref; ref = ref->next)
if (should_issue_prefetch_p (ref) if (should_issue_prefetch_p (ref))
&& ref->prefetch_mod > max_mod_constraint) {
max_mod_constraint = ref->prefetch_mod; mod_constraint = ref->prefetch_mod;
nfactor = least_common_multiple (mod_constraint, factor);
/* Set constraint_factor as large as needed to be able to satisfy the if (nfactor <= upper_bound)
largest modulo constraint. */ factor = nfactor;
constraint_factor = max_mod_constraint; }
/* If ahead is too large in comparison with the number of available
prefetches, unroll the loop as much as needed to be able to prefetch
at least partially some of the references in the loop. */
ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
/ SIMULTANEOUS_PREFETCHES);
/* Unroll as much as useful, but bound the code size growth. */
if (constraint_factor < ahead_factor)
factor = ahead_factor;
else
factor = constraint_factor;
if (factor > upper_bound)
factor = upper_bound;
if (!should_unroll_loop_p (loop, desc, factor)) if (!should_unroll_loop_p (loop, desc, factor))
return 1; return 1;
...@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop) ...@@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
instructions executed per iteration of the loop. */ instructions executed per iteration of the loop. */
ninsns = tree_num_loop_insns (loop); ninsns = tree_num_loop_insns (loop);
ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns; ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns, unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
&desc);
if (dump_file && (dump_flags & TDF_DETAILS)) if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor); fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment