Commit b5aeb3bb by Ira Rosen Committed by Ira Rosen

re PR tree-optimization/37027 (SLP loop vectorization missing support for reductions)


	PR tree-optimization/37027
	* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions 
	and macro to access it.
	(vectorizable_reduction): Add argument.
	(vect_get_slp_defs): Likewise.
	* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
	statements for possible use in SLP.
	(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
	(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
	(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
	add new argument.
	(vectorizable_reduction): Likewise.
	* tree-vect-stmts.c (vect_get_vec_defs): Update call to 
	vect_get_slp_defs.
	(vectorizable_type_demotion, vectorizable_type_promotion,
	vectorizable_store): Likewise.
	(vect_analyze_stmt): Update call to vectorizable_reduction.
	(vect_transform_stmt): Likewise.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
	(vect_build_slp_tree): Fix indentation. Check that there are no loads
	from different interleaving chains in same node.
	(vect_slp_rearrange_stmts): New function.
	(vect_supported_load_permutation_p): Allow load permutations for 
	reductions. Call vect_slp_rearrange_stmts() to rearrange statements
	inside SLP nodes if necessary.
	(vect_analyze_slp_instance): Handle reductions.
	(vect_analyze_slp): Try to build SLP instances originating from groups
	of reductions.
	(vect_detect_hybrid_slp_stmts): Skip reduction statements.
	(vect_get_constant_vectors): Create initial vectors for reductions
	according to reduction code. Add new argument.
	(vect_get_slp_defs): Add new argument, pass it to 
	vect_get_constant_vectors.
	(vect_schedule_slp_instance): Remove SLP tree root statements.

From-SVN: r158506
parent 5a2fa9e8
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
and macro to access it.
(vectorizable_reduction): Add argument.
(vect_get_slp_defs): Likewise.
* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
statements for possible use in SLP.
(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
add new argument.
(vectorizable_reduction): Likewise.
* tree-vect-stmts.c (vect_get_vec_defs): Update call to
vect_get_slp_defs.
(vectorizable_type_demotion, vectorizable_type_promotion,
vectorizable_store): Likewise.
(vect_analyze_stmt): Update call to vectorizable_reduction.
(vect_transform_stmt): Likewise.
* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
(vect_build_slp_tree): Fix indentation. Check that there are no loads
from different interleaving chains in same node.
(vect_slp_rearrange_stmts): New function.
(vect_supported_load_permutation_p): Allow load permutations for
reductions. Call vect_slp_rearrange_stmts() to rearrange statements
inside SLP nodes if necessary.
(vect_analyze_slp_instance): Handle reductions.
(vect_analyze_slp): Try to build SLP instances originating from groups
of reductions.
(vect_detect_hybrid_slp_stmts): Skip reduction statements.
(vect_get_constant_vectors): Create initial vectors for reductions
according to reduction code. Add new argument.
(vect_get_slp_defs): Add new argument, pass it to
vect_get_constant_vectors.
(vect_schedule_slp_instance): Remove SLP tree root statements.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* tree.h (ENUM_IS_SCOPED): Define.
......
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* lib/target-supports.exp
(check_effective_target_vect_widen_sum_hi_to_si_pattern): New.
* gcc.dg/vect/pr37027.c: New test.
* gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c,
gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c,
gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c,
gcc.dg/vect/vect-complex-6.c: Likewise.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* g++.dg/debug/dwarf2/enum1.C: New test.
......
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
/* Two-field record; the loop in foo reduces each field separately.  */
struct mystr
{
  int f1;
  int f2;
};

/* Input arrays and the two reduction results written by foo.  */
struct mystr a[16];
struct mystr b[16];
int res1, res2;

/* Sum a[i].f1 + b[i].f1 into res1 and a[i].f2 + b[i].f2 into res2 over
   all 16 elements.  The two accumulators form a group of reductions
   updated in the same loop body.  */
void
foo (void)
{
  int i;
  /* The accumulators must start from a defined value: they are read by
     the first `+=', and reading an uninitialized automatic variable is
     undefined behaviour.  */
  int sum1 = 0;
  int sum2 = 0;

  for (i = 0; i < 16; i++)
    {
      sum1 += a[i].f1 + b[i].f1;
      sum2 += a[i].f2 + b[i].f2;
    }

  res1 = sum1;
  res2 = sum2;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
/* Number of elements in each input array. */
#define N 16
/* ub[k] is 3*k and uc[k] is k, so every difference ub[k] - uc[k] is 2*k. */
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP.

   Runs N iterations (the parameter `n'), accumulating four independent
   sums of ub[] - uc[] differences.  Iteration i consumes elements
   4*i .. 4*i+3, one per accumulator.  Calls abort () if any final sum
   differs from the matching expected value res0..res3; otherwise
   returns 0.  */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
/* Each accumulator starts from a different non-zero value. */
unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30;
/* The statements are listed in the order 3, 2, 1, 0, i.e. the reverse of
   the order of the memory accesses. */
for (i = 0; i < n; i++) {
udiff3 += (ub[4*i + 3] - uc[4*i + 3]);
udiff2 += (ub[4*i + 2] - uc[4*i + 2]);
udiff1 += (ub[4*i + 1] - uc[4*i + 1]);
udiff0 += (ub[4*i] - uc[4*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1
|| udiff2 != res2
|| udiff3 != res3)
abort ();
return 0;
}
/* Test driver: runs main1 for two iteration counts, passing the sums
   expected for each count.  check_vect is declared in tree-vect.h, which
   is not shown here -- presumably it verifies vector support on the
   target; confirm against that header.  */
int main (void)
{
check_vect ();
/* 4 iterations: all N elements of ub[] and uc[] are consumed. */
main1 (N/4, 53, 66, 84, 102);
/* 3 iterations: only the first 12 elements are consumed. */
main1 (N/4 - 1, 29, 40, 56, 72);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
/* Number of elements in each input array. */
#define N 16
/* ub[k] is 3*k and uc[k] is k, so every difference ub[k] - uc[k] is 2*k. */
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP (with unrolling).

   Accumulates two independent sums of ub[] - uc[] differences over `n'
   iterations; iteration i consumes elements 2*i and 2*i+1.  Calls
   abort () if a final sum differs from res0 or res1; otherwise returns 0.
   The parameters res2 and res3 are accepted but never read.  */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10;
/* The odd-element statement comes first, i.e. the reverse of the order
   of the memory accesses. */
for (i = 0; i < n; i++) {
udiff1 += (ub[2*i + 1] - uc[2*i + 1]);
udiff0 += (ub[2*i] - uc[2*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1)
abort ();
return 0;
}
/* Test driver: runs main1 over all N elements (N/2 iterations).  Only the
   first two expected values (117 and 138) are checked by main1; the last
   two arguments are ignored there.  check_vect comes from tree-vect.h,
   which is not shown here.  */
int main (void)
{
check_vect ();
main1 (N/2, 117, 138, 84, 102);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include "tree-vect.h"
/* Number of elements in X and Y. */
#define N 64
/* Expected sums of the even-index and odd-index products, including the
   initial accumulator values 10 and 20 used in foo1. */
#define DOT1 21834
#define DOT2 21876
unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
/* short->short->int dot product.
Not detected as a dot-product pattern.
Requires support for non-widening multiplication and widening-summation.
Vectorized with loop-aware SLP.

Computes two interleaved sums over `len' iterations: res1 accumulates
X[2*i] * Y[2*i] and res2 accumulates X[2*i+1] * Y[2*i+1].  The sums are
stored through result1 and result2; the function always returns 0.  */
__attribute__ ((noinline)) unsigned int
foo1(int len, int *result1, int *result2)
{
int i;
unsigned int res1 = 10, res2 = 20;
/* Each product is stored in an unsigned short before being added to the
   unsigned int accumulators. */
unsigned short prod;
for (i=0; i<len; i++) {
prod = X[2*i] * Y[2*i];
res1 += prod;
prod = X[2*i+1] * Y[2*i+1];
res2 += prod;
}
*result1 = res1;
*result2 = res2;
return 0;
}
/* Test driver.  Fills X[i] = i and Y[i] = 64 - i, runs foo1 over N/2
   element pairs, and aborts unless the two sums equal DOT1 and DOT2.  */
int main (void)
{
  /* foo1 stores its results through `int *' parameters, so the result
     objects are declared `int'.  Passing `unsigned int *' there is an
     incompatible-pointer-type violation that newer compilers reject.
     Both expected values fit comfortably in an int.  */
  int dot1, dot2;
  unsigned short i;

  check_vect ();

  for (i=0; i<N; i++) {
    X[i] = i;
    Y[i] = 64-i;
  }

  foo1 (N/2, &dot1, &dot2);

  if (dot1 != DOT1 || dot2 != DOT2)
    abort ();

  return 0;
}
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
/* Number of elements in uc. */
#define N 128
/* Input array; filled by main before main1 runs. */
unsigned int uc[N];
/* Vectorization of reduction using loop-aware SLP.

   Keeps eight independent running maxima over `n' iterations; iteration i
   compares maxK against uc[8*i+K] for K = 0..7.  Calls abort () if any
   final maximum differs from the matching expected value res0..res7;
   otherwise returns 0.  */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7)
{
int i;
unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313;
/* The statements are listed in the order 2, 3, 1, 7, 6, 0, 4, 5 rather
   than in the order of the memory accesses. */
for (i = 0; i < n; i++) {
max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2;
max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3;
max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1;
max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7;
max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6;
max0 = max0 < uc[8*i] ? uc[8*i] : max0;
max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4;
max5 = max5 < uc[8*i+5] ? uc[8*i+5] : max5;
}
/* Check results: */
if (max0 != res0
|| max1 != res1
|| max2 != res2
|| max3 != res3
|| max4 != res4
|| max5 != res5
|| max6 != res6
|| max7 != res7)
abort ();
return 0;
}
/* Test driver: fills uc[i] = i + 3 and runs main1 over all N elements
   (N/8 iterations).  The last expected value is 313, the value max7 is
   initialized to in main1, because no array element exceeds it (the
   largest is uc[127] = 130).  check_vect comes from tree-vect.h, which is
   not shown here.  */
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
uc[i] = i+3;
main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
/* Number of elements in c. */
#define N 128
/* Input array; filled by main before main1 runs. */
int c[N];
/* Vectorization of reduction using loop-aware SLP.

   Keeps two independent signed running maxima over `n' iterations: max0
   over the even-index elements of c and max1 over the odd-index ones.
   Calls abort () if a final maximum differs from res0 or res1; otherwise
   returns 0.  */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int max0 = -100, max1 = -313;
/* The odd-element statement comes first, i.e. the reverse of the order
   of the memory accesses. */
for (i = 0; i < n; i++) {
max1 = max1 < c[2*i+1] ? c[2*i+1] : max1;
max0 = max0 < c[2*i] ? c[2*i] : max0;
}
/* Check results: */
if (max0 != res0
|| max1 != res1)
abort ();
return 0;
}
/* Test driver: fills c[i] = -(i + 3), overrides the first two elements
   with -100, and runs main1 over all N elements (N/2 iterations).  The
   expected maxima -5 and -6 are c[2] and c[3].  check_vect comes from
   tree-vect.h, which is not shown here.  */
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
c[i] = (i+3) * -1;
/* Make the first even and odd elements smaller than those that follow. */
c[0] = c[1] = -100;
main1 (N/2, -5, -6);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
/* Number of elements in each input array. */
#define N 128
/* Input arrays; filled by main before main1 runs. */
int a[N], b[N];
/* Vectorization of reduction. Loop-aware SLP is not possible, because of
different arrays.

Sums the even-index elements of a into sum1 and of b into sum0 over `n'
iterations.  Calls abort () if sum0 differs from res0 or sum1 from res1;
otherwise returns 0.  */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int sum0 = 0, sum1 = 0;
/* The two reductions read from two different arrays. */
for (i = 0; i < n; i++) {
sum1 += a[2*i];
sum0 += b[2*i];
}
/* Check results: */
if (sum0 != res0
|| sum1 != res1)
abort ();
return 0;
}
/* Test driver: fills a[i] = b[i] = i and runs main1 for N/2 iterations.
   Both expected sums are 4032, the sum of the even numbers 0..126.
   check_vect comes from tree-vect.h, which is not shown here.  */
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
a[i] = b[i] = i;
main1 (N/2, 4032, 4032);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } {
return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
# Only powerpc*-*-* targets are reported as supporting it.
#
# This won't change for different subtargets so cache the result.

proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
    # The cache lives in the *_saved variable, so that is the name that
    # must be declared global.  Otherwise it is local to each call and
    # the "using cached result" branch can never be taken.
    global et_vect_widen_sum_hi_to_si_pattern_saved

    if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] {
        verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2
    } else {
        set et_vect_widen_sum_hi_to_si_pattern_saved 0
        if { [istarget powerpc*-*-*] } {
            set et_vect_widen_sum_hi_to_si_pattern_saved 1
        }
    }
    verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2
    return $et_vect_widen_sum_hi_to_si_pattern_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
......
......@@ -670,6 +670,8 @@ vect_pattern_recog_1 (
tree pattern_vectype;
tree type_in, type_out;
enum tree_code code;
int i;
gimple next;
pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
if (!pattern_stmt)
......@@ -735,7 +737,13 @@ vect_pattern_recog_1 (
STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt;
return;
/* Patterns cannot be vectorized using SLP, because they change the order of
computation. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
if (next == stmt)
VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i);
}
......
......@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
slp_tree slp_node)
{
if (slp_node)
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
else
{
tree vec_oprnd;
......@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
{
/* Handle uses. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
else
{
VEC_free (tree, heap, vec_oprnds0);
......@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
if (j == 0)
{
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
......@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (slp)
{
/* Get vectorized arguments for SLP_NODE. */
vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);
vec_oprnd = VEC_index (tree, vec_oprnds, 0);
}
......@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL, NULL, 0));
else
{
......@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case reduc_vec_info_type:
gcc_assert (!slp_node);
done = vectorizable_reduction (stmt, gsi, &vec_stmt);
done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
......
......@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
/* The unrolling factor needed to SLP the loop. In case of that pure SLP is
applied to the loop, i.e., no unrolling is needed, this is 1. */
unsigned slp_unrolling_factor;
/* Reduction cycles detected in the loop. Used in loop-aware SLP. */
VEC (gimple, heap) *reductions;
} *loop_vec_info;
/* Access Functions. */
......@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
......@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
......@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
extern void vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
VEC (tree,heap) **);
VEC (tree,heap) **, int);
extern LOC find_bb_location (basic_block);
extern bb_vec_info vect_slp_analyze_bb (basic_block);
extern void vect_slp_transform_bb (basic_block);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment