Commit 2711355f by Zdenek Dvorak Committed by Zdenek Dvorak

tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll factor by…

tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll factor by the estimated number of iterations.

	* tree-ssa-loop-prefetch.c (determine_unroll_factor):  Bound the unroll
	factor by the estimated number of iterations.
	(loop_prefetch_arrays): Do not prefetch in loops that iterate less than
	prefetch latency.

	* gcc.dg/tree-ssa/prefetch-4.c: New test.

From-SVN: r122435
parent 41dc91a8
2007-03-01 Zdenek Dvorak <dvorakz@suse.cz>
* tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
factor by the estimated number of iterations.
(loop_prefetch_arrays): Do not prefetch in loops that iterate less than
prefetch latency.
2007-03-01 Richard Henderson <rth@redhat.com>
* expr.c (emit_move_complex_push): Export.
......
......@@ -47,6 +47,131 @@ const char *host_detect_local_cpu (int argc, const char **argv);
#define bit_3DNOWP (1 << 30)
#define bit_LM (1 << 29)
/* Returns parameters that describe L1_ASSOC associative cache of size
   L1_SIZEKB with lines of size L1_LINE, as a concat-ed option string
   the caller owns.  Returns NULL when L1_LINE is zero (bogus or
   unsupported CPUID data), avoiding a division by zero below.  */
static char *
describe_cache (unsigned l1_sizekb, unsigned l1_line,
unsigned l1_assoc ATTRIBUTE_UNUSED)
{
char size[1000], line[1000];
unsigned size_in_lines;
/* Defend against bogus CPUID data: detect_caches_amd passes the raw
   low byte of ECX as the line size, and a zero value would make the
   division below undefined behavior.  */
if (l1_line == 0)
return NULL;
/* At the moment, gcc middle-end does not use the information about the
associativity of the cache. */
size_in_lines = (l1_sizekb * 1024) / l1_line;
sprintf (size, "--param l1-cache-size=%u", size_in_lines);
sprintf (line, "--param l1-cache-line-size=%u", l1_line);
return concat (size, " ", line, " ", NULL);
}
/* Returns the description of caches for an AMD processor, or NULL when
   the processor does not support the L1 cache information leaf.  */
static char *
detect_caches_amd (unsigned max_ext_level)
{
unsigned eax, ebx, ecx, edx;
/* Extended leaf 0x80000005 carries the L1 cache and TLB information.  */
if (max_ext_level < 0x80000005)
return NULL;
cpuid (0x80000005, eax, ebx, ecx, edx);
/* ECX layout: bits 31-24 hold the cache size in kB, bits 23-16 the
   associativity, and bits 7-0 the line size in bytes.  */
return describe_cache ((ecx >> 24) & 0xff, ecx & 0xff, (ecx >> 16) & 0xff);
}
/* Stores the size of the L1 cache and cache line, and the associativity
   of the cache according to REG to L1_SIZEKB, L1_LINE and L1_ASSOC.  */
static void
decode_caches_intel (unsigned reg, unsigned *l1_sizekb, unsigned *l1_line,
unsigned *l1_assoc)
{
/* Recognized L1 data cache descriptor bytes and the parameters they
   encode: size in kB, line size in bytes, associativity.  */
static const struct
{
unsigned descriptor;
unsigned sizekb;
unsigned line;
unsigned assoc;
} l1_caches[] = {
{ 0x0a, 8, 32, 2 },
{ 0x0c, 16, 32, 4 },
{ 0x2c, 32, 64, 8 },
{ 0x60, 16, 64, 8 },
{ 0x66, 8, 64, 4 },
{ 0x67, 16, 64, 4 },
{ 0x68, 32, 64, 4 }
};
unsigned byte, entry;
/* If the most significant bit is set, the register does not contain
   valid cache descriptors.  */
if ((reg >> 31) & 1)
return;
/* Each of the four bytes of REG is an independent descriptor; scan
   them least significant first, as the original switch did.  */
for (byte = 0; byte < 4; byte++, reg >>= 8)
{
unsigned val = reg & 0xff;
for (entry = 0; entry < sizeof l1_caches / sizeof l1_caches[0]; entry++)
if (l1_caches[entry].descriptor == val)
{
*l1_sizekb = l1_caches[entry].sizekb;
*l1_line = l1_caches[entry].line;
*l1_assoc = l1_caches[entry].assoc;
break;
}
}
}
/* Returns the description of caches for an intel processor.  Returns
   NULL when leaf 2 is unsupported, and "" when no recognized L1
   descriptor is found.  */
static char *
detect_caches_intel (unsigned max_level)
{
unsigned regs[4];
unsigned i;
unsigned l1_sizekb = 0, l1_line = 0, assoc = 0;
/* Leaf 2 holds the cache descriptor bytes.  */
if (max_level < 2)
return NULL;
cpuid (2, regs[0], regs[1], regs[2], regs[3]);
/* Every byte of every register may hold a descriptor; later matches
   overwrite earlier ones, as in the original unrolled calls.  */
for (i = 0; i < 4; i++)
decode_caches_intel (regs[i], &l1_sizekb, &l1_line, &assoc);
if (!l1_sizekb)
return (char *) "";
return describe_cache (l1_sizekb, l1_line, assoc);
}
/* This will be called by the spec parser in gcc.c when it sees
a %:local_cpu_detect(args) construct. Currently it will be called
with either "arch" or "tune" as argument depending on if -march=native
......@@ -62,6 +187,7 @@ const char *host_detect_local_cpu (int argc, const char **argv);
const char *host_detect_local_cpu (int argc, const char **argv)
{
const char *cpu = NULL;
const char *cache = "";
enum processor_type processor = PROCESSOR_I386;
unsigned int eax, ebx, ecx, edx;
unsigned int max_level;
......@@ -126,6 +252,14 @@ const char *host_detect_local_cpu (int argc, const char **argv)
is_amd = vendor == *(unsigned int*)"Auth";
if (!arch)
{
if (is_amd)
cache = detect_caches_amd (ext_level);
else if (vendor == *(unsigned int*)"Genu")
cache = detect_caches_intel (max_level);
}
if (is_amd)
{
if (has_mmx)
......@@ -283,7 +417,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
}
done:
return concat ("-m", argv[0], "=", cpu, NULL);
return concat (cache, "-m", argv[0], "=", cpu, NULL);
}
#else
/* If we aren't compiling with GCC we just provide a minimal
......
2007-03-01 Zdenek Dvorak <dvorakz@suse.cz>
* gcc.dg/tree-ssa/prefetch-4.c: New test.
2007-03-01 Simon Baldwin <simonb@google.com>
PR c++/23689
/* The loop rolls too little, hence the prefetching would not be useful. */
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target ilp32 } */
/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -fdump-tree-final_cleanup" } */
/* Global array written by the loop below; only 20 elements, so the loop
   is expected to roll too few times for prefetching to pay off.  */
int xxx[20];
/* Stores its index into each of the first N elements of XXX.  The
   dg-final scan below expects no prefetch instructions to be emitted
   for this loop.  */
void foo (int n)
{
int i;
for (i = 0; i < n; i++)
xxx[i] = i;
}
/* { dg-final { scan-tree-dump-times "prefetch" 0 "final_cleanup" } } */
/* { dg-final { cleanup-tree-dump "final_cleanup" } } */
......@@ -885,13 +885,14 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
/* Determine the coefficient by that unroll LOOP, from the information
contained in the list of memory references REFS. Description of
number of iterations of LOOP is stored to DESC. AHEAD is the number
of iterations ahead that we need to prefetch. NINSNS is number of
insns of the LOOP. */
number of iterations of LOOP is stored to DESC. NINSNS is the number of
insns of the LOOP. EST_NITER is the estimated number of iterations of
the loop, or -1 if no estimate is available. */
static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
unsigned ninsns, struct tree_niter_desc *desc)
unsigned ninsns, struct tree_niter_desc *desc,
HOST_WIDE_INT est_niter)
{
unsigned upper_bound;
unsigned nfactor, factor, mod_constraint;
......@@ -906,6 +907,12 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
gains from better scheduling and decreasing loop overhead, which is not
the case here. */
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
/* If we unrolled the loop more times than it iterates, the unrolled version
of the loop would be never entered. */
if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
upper_bound = est_niter;
if (upper_bound <= 1)
return 1;
......@@ -935,7 +942,8 @@ static bool
loop_prefetch_arrays (struct loop *loop)
{
struct mem_ref_group *refs;
unsigned ahead, ninsns, unroll_factor;
unsigned ahead, ninsns, time, unroll_factor;
HOST_WIDE_INT est_niter;
struct tree_niter_desc desc;
bool unrolled = false;
......@@ -950,21 +958,24 @@ loop_prefetch_arrays (struct loop *loop)
/* Step 3: determine the ahead and unroll factor. */
/* FIXME: We should use not size of the loop, but the average number of
instructions executed per iteration of the loop. */
ninsns = tree_num_loop_insns (loop, &eni_time_weights);
ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
/* FIXME: the time should be weighted by the probabilities of the blocks in
the loop body. */
time = tree_num_loop_insns (loop, &eni_time_weights);
ahead = (PREFETCH_LATENCY + time - 1) / time;
est_niter = estimated_loop_iterations_int (loop, false);
/* If the loop rolls less than the required unroll factor, prefetching
is useless. */
if (unroll_factor > 1
&& cst_and_fits_in_hwi (desc.niter)
&& (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
/* The prefetches will run for AHEAD iterations of the original loop. Unless
the loop rolls at least AHEAD times, prefetching the references does not
make sense. */
if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
goto fail;
ninsns = tree_num_loop_insns (loop, &eni_size_weights);
unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
est_niter);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
/* Step 4: what to prefetch? */
if (!schedule_prefetches (refs, unroll_factor, ahead))
goto fail;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment