Commit e70444a8 by Harsha Jagasia Committed by Harsha Jagasia

2007-09-10  Harsha Jagasia <harsha.jagasia@amd.com>

            Jan Sjodin <jan.sjodin@amd.com>
        
        * tree-vect-analyze.c (vect_analyze_operations): Change
        comparison of loop iterations with threshold to less than
        or equal to instead of less than. Reduce
        min_scalar_loop_bound by one.
        * tree-vect-transform.c (vect_estimate_min_profitable_iters): 
        Change prologue and epilogue iterations estimate to vf/2,
        when unknown at compile-time. Change versioning guard
        cost to taken_branch_cost. If peeling for alignment is
        unknown at compile-time, change peel guard costs to one
        taken branch and one not-taken branch per peeled loop.
        If peeling for alignment is known but number of scalar loop
        iterations is unknown at compile-time, change peel guard
        costs to one taken branch per peeled loop. Change the cost
        model equation to consider vector iterations as the loop
        iterations less the prologue and epilogue iterations.
        Change outside vector cost check to less than or equal to
        zero instead of equal to zero.
        (vect_do_peeling_for_loop_bound): Reduce
        min_scalar_loop_bound by one.
        * tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and
        TARG_COND_NOT_TAKEN_BRANCH_COST.        
        * config/i386/i386.h (processor_costs): Add
        scalar_stmt_cost, scalar_load_cost, scalar_store_cost,
        vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost,
        vec_align_load_cost, vec_unalign_load_cost,
        vec_store_cost, cond_taken_branch_cost,
        cond_not_taken_branch_cost.
        Define macros for x86 costs.
        * config/i386/i386.c:
        (size_cost): Set scalar_stmt_cost, scalar_load_cost,
        scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost,
        scalar_to_vec_cost, vec_align_load_cost, 
        vec_unalign_load_cost, vec_store_cost,
        cond_taken_branch_cost, cond_not_taken_branch_cost to one. 
        (i386_cost, i486_cost, pentium_cost, pentiumpro_cost,
        geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost, 
        core2_cost, generic64_cost, generic32_cost): Set to default
        untuned costs.
        (k8_cost, amdfam10_cost): Costs for vectorization tuned.
        (x86_builtin_vectorization_cost): New.

2007-09-10  Harsha Jagasia <harsha.jagasia@amd.com>

        * gcc.dg/vect/costmodel/i386/costmodel-vect-31.c: 
        Change dg-final to expect 1 non-profitable loop and
        3 profitable loops.
        * gcc.dg/vect/costmodel/x86-64/costmodel-vect-31.c:
        Change dg-final to expect 1 non-profitable loop and
        3 profitable loops.
        * gcc.dg/vect/costmodel/x86-64/costmodel-fast-math-vect-pr29925.c:
        Change dg-final to expect 1 profitable loop.
        * gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c:
        Change dg-final to expect 1 profitable loop.


Co-Authored-By: Jan Sjodin <jan.sjodin@amd.com>

From-SVN: r128353
parent 2533577f
2007-09-10 Harsha Jagasia <harsha.jagasia@amd.com>
Jan Sjodin <jan.sjodin@amd.com>
* tree-vect-analyze.c (vect_analyze_operations): Change
comparison of loop iterations with threshold to less than
or equal to instead of less than. Reduce
min_scalar_loop_bound by one.
* tree-vect-transform.c (vect_estimate_min_profitable_iters):
Change prologue and epilogue iterations estimate to vf/2,
when unknown at compile-time. Change versioning guard
cost to taken_branch_cost. If peeling for alignment is
unknown at compile-time, change peel guard costs to one
taken branch and one not-taken branch per peeled loop.
If peeling for alignment is known but number of scalar loop
iterations is unknown at compile-time, change peel guard
costs to one taken branch per peeled loop. Change the cost
model equation to consider vector iterations as the loop
iterations less the prologue and epilogue iterations.
Change outside vector cost check to less than or equal to
zero instead of equal to zero.
(vect_do_peeling_for_loop_bound): Reduce
min_scalar_loop_bound by one.
* tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and
TARG_COND_NOT_TAKEN_BRANCH_COST.
* config/i386/i386.h (processor_costs): Add
scalar_stmt_cost, scalar_load_cost, scalar_store_cost,
vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost,
vec_align_load_cost, vec_unalign_load_cost,
vec_store_cost, cond_taken_branch_cost,
cond_not_taken_branch_cost.
Define macros for x86 costs.
* config/i386/i386.c:
(size_cost): Set scalar_stmt_cost, scalar_load_cost,
scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost,
scalar_to_vec_cost, vec_align_load_cost,
vec_unalign_load_cost, vec_store_cost,
cond_taken_branch_cost, cond_not_taken_branch_cost to one.
(i386_cost, i486_cost, pentium_cost, pentiumpro_cost,
geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost,
core2_cost, generic64_cost, generic32_cost): Set to default
untuned costs.
(k8_cost, amdfam10_cost): Costs for vectorization tuned.
(x86_builtin_vectorization_cost): New.
2007-09-10 Janis Johnson <janis187@us.ibm.com>
Ben Elliston <bje@au.ibm.com>
......@@ -138,6 +138,22 @@ struct processor_costs {
/* Specify what algorithm
to use for stringops on unknown size. */
struct stringop_algs memcpy[2], memset[2];
const int scalar_stmt_cost; /* Cost of any scalar operation, excluding
load and store. */
const int scalar_load_cost; /* Cost of scalar load. */
const int scalar_store_cost; /* Cost of scalar store. */
const int vec_stmt_cost; /* Cost of any vector operation, excluding
load, store, vector-to-scalar and
scalar-to-vector operation. */
const int vec_to_scalar_cost; /* Cost of vect-to-scalar operation. */
const int scalar_to_vec_cost; /* Cost of scalar-to-vector operation. */
const int vec_align_load_cost; /* Cost of aligned vector load. */
const int vec_unalign_load_cost; /* Cost of unaligned vector load. */
const int vec_store_cost; /* Cost of vector store. */
const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer
cost model. */
const int cond_not_taken_branch_cost;/* Cost of not taken branch for
vectorizer cost model. */
};
extern const struct processor_costs *ix86_cost;
......@@ -2460,6 +2476,57 @@ struct machine_function GTY(())
#define SYMBOL_REF_DLLEXPORT_P(X) \
((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0)
/* Model costs for vectorizer. */
/* Cost of conditional branch. */
#undef TARG_COND_BRANCH_COST
#define TARG_COND_BRANCH_COST ix86_cost->branch_cost
/* Cost of any scalar operation, excluding load and store. */
#undef TARG_SCALAR_STMT_COST
#define TARG_SCALAR_STMT_COST ix86_cost->scalar_stmt_cost
/* Cost of scalar load. */
#undef TARG_SCALAR_LOAD_COST
#define TARG_SCALAR_LOAD_COST ix86_cost->scalar_load_cost
/* Cost of scalar store. */
#undef TARG_SCALAR_STORE_COST
#define TARG_SCALAR_STORE_COST ix86_cost->scalar_store_cost
/* Cost of any vector operation, excluding load, store, vector-to-scalar
and scalar-to-vector operation. */
#undef TARG_VEC_STMT_COST
#define TARG_VEC_STMT_COST ix86_cost->vec_stmt_cost
/* Cost of vector to scalar operation. */
#undef TARG_VEC_TO_SCALAR_COST
#define TARG_VEC_TO_SCALAR_COST ix86_cost->vec_to_scalar_cost
/* Cost of scalar to vector operation. */
#undef TARG_SCALAR_TO_VEC_COST
#define TARG_SCALAR_TO_VEC_COST ix86_cost->scalar_to_vec_cost
/* Cost of aligned vector load. */
#undef TARG_VEC_LOAD_COST
#define TARG_VEC_LOAD_COST ix86_cost->vec_align_load_cost
/* Cost of misaligned vector load. */
#undef TARG_VEC_UNALIGNED_LOAD_COST
#define TARG_VEC_UNALIGNED_LOAD_COST ix86_cost->vec_unalign_load_cost
/* Cost of vector store. */
#undef TARG_VEC_STORE_COST
#define TARG_VEC_STORE_COST ix86_cost->vec_store_cost
/* Cost of conditional taken branch for vectorizer cost model. */
#undef TARG_COND_TAKEN_BRANCH_COST
#define TARG_COND_TAKEN_BRANCH_COST ix86_cost->cond_taken_branch_cost
/* Cost of conditional not taken branch for vectorizer cost model. */
#undef TARG_COND_NOT_TAKEN_BRANCH_COST
#define TARG_COND_NOT_TAKEN_BRANCH_COST ix86_cost->cond_not_taken_branch_cost
/*
Local variables:
version-control: t
......
2007-09-10 Harsha Jagasia <harsha.jagasia@amd.com>
* gcc.dg/vect/costmodel/i386/costmodel-vect-31.c:
Change dg-final to expect 1 non-profitable loop and
3 profitable loops.
* gcc.dg/vect/costmodel/x86-64/costmodel-vect-31.c:
Change dg-final to expect 1 non-profitable loop and
3 profitable loops.
* gcc.dg/vect/costmodel/x86-64/costmodel-fast-math-vect-pr29925.c:
Change dg-final to expect 1 profitable loop.
* gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c:
Change dg-final to expect 1 profitable loop.
2007-09-10 Richard Sandiford <richard@codesourcery.com>
* gcc.target/mips/call-saved-1.c: New test.
......@@ -35,6 +35,6 @@ int main()
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -85,7 +85,7 @@ int main (void)
return main1 ();
}
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } }
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } }
*/
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -35,6 +35,6 @@ int main()
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -85,7 +85,7 @@ int main (void)
return main1 ();
}
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } }
/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } }
*/
/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -596,8 +596,8 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
return false;
}
min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
* vectorization_factor;
min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
* vectorization_factor) - 1);
/* Use the cost model only if it is more conservative than user specified
threshold. */
......@@ -609,7 +609,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
th = (unsigned) min_profitable_iters;
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
&& LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
{
if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
fprintf (vect_dump, "not vectorized: vectorization not "
......
......@@ -124,6 +124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
int byte_misalign;
int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;
......@@ -141,7 +142,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
{
vec_outside_cost += TARG_COND_BRANCH_COST;
vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
"versioning.\n");
......@@ -188,7 +189,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
loop.
FORNOW: If we don't know the value of peel_iters for prologue or epilogue
at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
at compile-time - we assume it's vf/2 (the worst would be vf-1).
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
......@@ -197,18 +198,26 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
if (byte_misalign < 0)
{
peel_iters_prologue = (vf - 1)/2;
peel_iters_prologue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
"prologue peel iters set to (vf-1)/2.");
"prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
peel_iters_epilogue = (vf - 1)/2;
peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
"epilogue peel iters set to (vf-1)/2 because "
"epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown .");
/* If peeled iterations are unknown, count a taken branch and a not taken
branch per peeled loop. Even if scalar loop iterations are known,
vector iterations are not known since peeled prologue iterations are
not known. Hence guards remain the same. */
peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ TARG_COND_NOT_TAKEN_BRANCH_COST);
}
else
{
......@@ -226,11 +235,16 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
peel_iters_epilogue = (vf - 1)/2;
peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
"epilogue peel iters set to (vf-1)/2 because "
"epilogue peel iters set to vf/2 because "
"loop iterations are unknown .");
/* If peeled iterations are known but number of scalar loop
iterations are unknown, count a taken branch per peeled loop. */
peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
}
else
{
......@@ -241,33 +255,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
}
}
/* Requires a prologue loop when peeling to handle misalignment. Add cost of
two guards, one for the peeled loop and one for the vector loop. */
if (peel_iters_prologue)
{
vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: Adding cost of checks for "
"prologue.\n");
}
/* Requires an epilogue loop to finish up remaining iterations after vector
loop. Add cost of two guards, one for the peeled loop and one for the
vector loop. */
if (peel_iters_epilogue
|| !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
|| LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
{
vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model : Adding cost of checks for "
"epilogue.\n");
}
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
+ (peel_iters_epilogue * scalar_single_iter_cost);
+ (peel_iters_epilogue * scalar_single_iter_cost)
+ peel_guard_costs;
/* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
information we provide for the target is whether testing against the
......@@ -305,11 +295,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
{
if (vec_outside_cost == 0)
if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
{
min_profitable_iters = (vec_outside_cost * vf)
min_profitable_iters = (vec_outside_cost * vf
- vec_inside_cost * peel_iters_prologue
- vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);
......@@ -344,8 +336,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
min_profitable_iters < vf ? vf : min_profitable_iters);
}
min_profitable_iters =
......@@ -355,6 +345,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, " Profitability threshold = %d\n",
min_profitable_iters);
return min_profitable_iters;
}
......@@ -6452,8 +6447,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
/* Analyze cost to set threshold for vectorized loop. */
min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
* LOOP_VINFO_VECT_FACTOR (loop_vinfo);
min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
* LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
/* Use the cost model only if it is more conservative than user specified
threshold. */
......@@ -6464,8 +6459,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
|| min_profitable_iters > min_scalar_loop_bound))
th = (unsigned) min_profitable_iters;
if (min_profitable_iters
&& !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
|| !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
&& vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vectorization may not be profitable.");
......
......@@ -469,9 +469,14 @@ typedef struct _stmt_vec_info {
/* These are some defines for the initial implementation of the vectorizer's
cost model. These will later be target specific hooks. */
/* Cost of conditional branch. */
#ifndef TARG_COND_BRANCH_COST
#define TARG_COND_BRANCH_COST 3
/* Cost of conditional taken branch. */
#ifndef TARG_COND_TAKEN_BRANCH_COST
#define TARG_COND_TAKEN_BRANCH_COST 3
#endif
/* Cost of conditional not taken branch. */
#ifndef TARG_COND_NOT_TAKEN_BRANCH_COST
#define TARG_COND_NOT_TAKEN_BRANCH_COST 1
#endif
/* Cost of any scalar operation, excluding load and store. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment