Commit b5aeb3bb authored and committed by Ira Rosen

re PR tree-optimization/37027 (SLP loop vectorization missing support for reductions)


	PR tree-optimization/37027
	* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions 
	and macro to access it.
	(vectorizable_reduction): Add argument.
	(vect_get_slp_defs): Likewise.
	* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
	statements for possible use in SLP.
	(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
	(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
	(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
	add new argument.
	(vectorizable_reduction): Likewise.
	* tree-vect-stmts.c (vect_get_vec_defs): Update call to 
	vect_get_slp_defs.
	(vectorizable_type_demotion, vectorizable_type_promotion,
	vectorizable_store): Likewise.
	(vect_analyze_stmt): Update call to vectorizable_reduction.
	(vect_transform_stmt): Likewise.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
	(vect_build_slp_tree): Fix indentation. Check that there are no loads
	from different interleaving chains in same node.
	(vect_slp_rearrange_stmts): New function.
	(vect_supported_load_permutation_p): Allow load permutations for 
	reductions. Call vect_slp_rearrange_stmts() to rearrange statements
	inside SLP nodes if necessary.
	(vect_analyze_slp_instance): Handle reductions.
	(vect_analyze_slp): Try to build SLP instances originating from groups
	of reductions.
	(vect_detect_hybrid_slp_stmts): Skip reduction statements.
	(vect_get_constant_vectors): Create initial vectors for reductions
	according to reduction code. Add new argument.
	(vect_get_slp_defs): Add new argument, pass it to 
	vect_get_constant_vectors.
	(vect_schedule_slp_instance): Remove SLP tree root statements.

From-SVN: r158506
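
To illustrate what the patch enables, here is an editorial sketch (not part
of the commit, and not literal vectorizer output): a pair of scalar
reductions in one loop, such as

    for (i = 0; i < n; i++)
      {
        sum0 += a[2*i];
        sum1 += a[2*i + 1];
      }

can now be SLP-vectorized into a single vector accumulator whose lanes carry
the individual sums, with an epilog that extracts the lanes and folds extra
copies modulo the group size.  A rough sketch of the resulting shape, written
with GNU C vector extensions (the type v4si, the array, and the function name
are illustrative assumptions):

    typedef int v4si __attribute__ ((vector_size (16)));

    int a[64] __attribute__ ((aligned (16)));

    void
    slp_reduc_sketch (int n, int *res0, int *res1)
    {
      /* Lanes 0/2 accumulate the even-index sum, lanes 1/3 the odd-index
         sum (group size 2, four int elements per vector).  The scalar
         tail for 2*n not divisible by 4 is omitted.  */
      v4si vsum = { 0, 0, 0, 0 };
      int i;

      for (i = 0; i + 4 <= 2 * n; i += 4)
        vsum += *(v4si *) &a[i];   /* one vector add per two scalar iterations */

      /* Epilog: extract the lanes and combine them modulo the group size.  */
      *res0 = vsum[0] + vsum[2];
      *res1 = vsum[1] + vsum[3];
    }

The gcc.dg/vect/slp-reduc-* tests below exercise exactly this kind of loop.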
parent 5a2fa9e8
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
and macro to access it.
(vectorizable_reduction): Add argument.
(vect_get_slp_defs): Likewise.
* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
statements for possible use in SLP.
(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
add new argument.
(vectorizable_reduction): Likewise.
* tree-vect-stmts.c (vect_get_vec_defs): Update call to
vect_get_slp_defs.
(vectorizable_type_demotion, vectorizable_type_promotion,
vectorizable_store): Likewise.
(vect_analyze_stmt): Update call to vectorizable_reduction.
(vect_transform_stmt): Likewise.
* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
(vect_build_slp_tree): Fix indentation. Check that there are no loads
from different interleaving chains in same node.
(vect_slp_rearrange_stmts): New function.
(vect_supported_load_permutation_p): Allow load permutations for
reductions. Call vect_slp_rearrange_stmts() to rearrange statements
inside SLP nodes if necessary.
(vect_analyze_slp_instance): Handle reductions.
(vect_analyze_slp): Try to build SLP instances originating from groups
of reductions.
(vect_detect_hybrid_slp_stmts): Skip reduction statements.
(vect_get_constant_vectors): Create initial vectors for reductions
according to reduction code. Add new argument.
(vect_get_slp_defs): Add new argument, pass it to
vect_get_constant_vectors.
(vect_schedule_slp_instance): Remove SLP tree root statements.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* tree.h (ENUM_IS_SCOPED): Define.
......
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* lib/target-supports.exp
(check_effective_target_vect_widen_sum_hi_to_si_pattern): New.
* gcc.dg/vect/pr37027.c: New test.
* gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c,
gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c,
gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c,
gcc.dg/vect/vect-complex-6.c: Likewise.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* g++.dg/debug/dwarf2/enum1.C: New test.
......
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
struct mystr
{
int f1;
int f2;
};
struct mystr a[16];
struct mystr b[16];
int res1, res2;
void
foo (void)
{
int i;
int sum1;
int sum2;
for (i = 0; i < 16; i++)
{
sum1 += a[i].f1 + b[i].f1;
sum2 += a[i].f2 + b[i].f2;
}
res1 = sum1;
res2 = sum2;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30;
for (i = 0; i < n; i++) {
udiff3 += (ub[4*i + 3] - uc[4*i + 3]);
udiff2 += (ub[4*i + 2] - uc[4*i + 2]);
udiff1 += (ub[4*i + 1] - uc[4*i + 1]);
udiff0 += (ub[4*i] - uc[4*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1
|| udiff2 != res2
|| udiff3 != res3)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/4, 53, 66, 84, 102);
main1 (N/4 - 1, 29, 40, 56, 72);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP (with unrolling). */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10;
for (i = 0; i < n; i++) {
udiff1 += (ub[2*i + 1] - uc[2*i + 1]);
udiff0 += (ub[2*i] - uc[2*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/2, 117, 138, 84, 102);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include "tree-vect.h"
#define N 64
#define DOT1 21834
#define DOT2 21876
unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
/* short->short->int dot product.
Not detected as a dot-product pattern.
Requires support for non-widening multiplication and widening-summation.
Vectorized with loop-aware SLP. */
__attribute__ ((noinline)) unsigned int
foo1(int len, int *result1, int *result2)
{
int i;
unsigned int res1 = 10, res2 = 20;
unsigned short prod;
for (i=0; i<len; i++) {
prod = X[2*i] * Y[2*i];
res1 += prod;
prod = X[2*i+1] * Y[2*i+1];
res2 += prod;
}
*result1 = res1;
*result2 = res2;
return 0;
}
int main (void)
{
unsigned int dot1, dot2;
unsigned short i;
check_vect ();
for (i=0; i<N; i++) {
X[i] = i;
Y[i] = 64-i;
}
foo1 (N/2, &dot1, &dot2);
if (dot1 != DOT1 || dot2 != DOT2)
abort ();
return 0;
}
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
unsigned int uc[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7)
{
int i;
unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313;
for (i = 0; i < n; i++) {
max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2;
max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3;
max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1;
max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7;
max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6;
max0 = max0 < uc[8*i] ? uc[8*i] : max0;
max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4;
max5 = max5 < uc[8*i+5] ? uc[8*i+5] : max5;
}
/* Check results: */
if (max0 != res0
|| max1 != res1
|| max2 != res2
|| max3 != res3
|| max4 != res4
|| max5 != res5
|| max6 != res6
|| max7 != res7)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
uc[i] = i+3;
main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int c[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int max0 = -100, max1 = -313;
for (i = 0; i < n; i++) {
max1 = max1 < c[2*i+1] ? c[2*i+1] : max1;
max0 = max0 < c[2*i] ? c[2*i] : max0;
}
/* Check results: */
if (max0 != res0
|| max1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
c[i] = (i+3) * -1;
c[0] = c[1] = -100;
main1 (N/2, -5, -6);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int a[N], b[N];
/* Vectorization of reduction. Loop-aware SLP is not possible, because of
different arrays. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int sum0 = 0, sum1 = 0;
for (i = 0; i < n; i++) {
sum1 += a[2*i];
sum0 += b[2*i];
}
/* Check results: */
if (sum0 != res0
|| sum1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
a[i] = b[i] = i;
main1 (N/2, 4032, 4032);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } {
return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
#
# This won't change for different subtargets so cache the result.
proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
global et_vect_widen_sum_hi_to_si_pattern
if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] {
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2
} else {
set et_vect_widen_sum_hi_to_si_pattern_saved 0
if { [istarget powerpc*-*-*] } {
set et_vect_widen_sum_hi_to_si_pattern_saved 1
}
}
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2
return $et_vect_widen_sum_hi_to_si_pattern_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
......
......@@ -545,6 +545,11 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
vect_reduction_def;
/* Store the reduction cycles for possible vectorization in
loop-aware SLP. */
VEC_safe_push (gimple, heap,
LOOP_VINFO_REDUCTIONS (loop_vinfo),
reduc_stmt);
}
}
}
......@@ -745,6 +750,7 @@ new_loop_vec_info (struct loop *loop)
VEC_alloc (ddr_p, heap,
PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
......@@ -835,6 +841,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
free (loop_vinfo);
loop->aux = NULL;
......@@ -1223,7 +1230,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
if ((STMT_VINFO_RELEVANT_P (stmt_info)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
&& !PURE_SLP_STMT (stmt_info))
/* STMT needs both SLP and loop-based vectorization. */
only_slp_in_loop = false;
}
......@@ -2860,28 +2866,33 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
computation.
VECT_DEF is a vector of partial results.
REDUC_CODE is the tree-code for the epilog reduction.
computation.
VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
reduction statements.
STMT is the scalar reduction stmt that is being vectorized.
NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
number of elements that we can fit in a vectype (nunits). In this case
we have to generate more than one vector stmt - i.e - we need to "unroll"
the vector stmt by a factor VF/nunits. For more details see documentation
in vectorizable_operation.
STMT is the scalar reduction stmt that is being vectorized.
REDUCTION_PHI is the phi-node that carries the reduction computation.
REDUC_INDEX is the index of the operand in the right hand side of the
REDUC_CODE is the tree-code for the epilog reduction.
REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
computation.
REDUC_INDEX is the index of the operand in the right hand side of the
statement that is defined by REDUCTION_PHI.
DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
SLP_NODE is an SLP node containing a group of reduction statements. The
first one in this group is STMT.
This function:
1. Creates the reduction def-use cycle: sets the arguments for
REDUCTION_PHI:
1. Creates the reduction def-use cycles: sets the arguments for
REDUCTION_PHIS:
The loop-entry argument is the vectorized initial-value of the reduction.
The loop-latch argument is VECT_DEF - the vector of partial sums.
2. "Reduces" the vector of partial results VECT_DEF into a single result,
by applying the operation specified by REDUC_CODE if available, or by
The loop-latch argument is taken from VECT_DEFS - the vector of partial
sums.
2. "Reduces" each vector of partial results VECT_DEFS into a single result,
by applying the operation specified by REDUC_CODE if available, or by
other means (whole-vector shifts or a scalar loop).
The function also creates a new phi node at the loop exit to preserve
loop-closed form, as illustrated below.
......@@ -2914,12 +2925,11 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
*/
static void
vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
int ncopies,
enum tree_code reduc_code,
gimple reduction_phi,
int reduc_index,
bool double_reduc)
vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
int ncopies, enum tree_code reduc_code,
VEC (gimple, heap) *reduction_phis,
int reduc_index, bool double_reduc,
slp_tree slp_node)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
stmt_vec_info prev_phi_info;
......@@ -2933,32 +2943,37 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gimple new_phi = NULL, phi;
gimple_stmt_iterator exit_gsi;
tree vec_dest;
tree new_temp = NULL_TREE;
tree new_name;
tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
gimple epilog_stmt = NULL;
tree new_scalar_dest, new_dest;
enum tree_code code = gimple_assign_rhs_code (stmt);
gimple exit_phi;
tree bitsize, bitpos;
enum tree_code code = gimple_assign_rhs_code (stmt);
tree adjustment_def;
tree vec_initial_def, def;
tree orig_name;
tree adjustment_def = NULL;
tree vec_initial_def = NULL;
tree reduction_op, expr, def;
tree orig_name, scalar_result;
imm_use_iterator imm_iter;
use_operand_p use_p;
bool extract_scalar_result = false;
tree reduction_op, expr;
gimple orig_stmt;
gimple use_stmt;
gimple use_stmt, orig_stmt, reduction_phi = NULL;
bool nested_in_vect_loop = false;
VEC(gimple,heap) *phis = NULL;
VEC (gimple, heap) *new_phis = NULL;
enum vect_def_type dt = vect_unknown_def_type;
int j, i;
VEC (tree, heap) *scalar_results = NULL;
int group_size = 1, k, ratio;
VEC (tree, heap) *vec_initial_defs = NULL;
VEC (gimple, heap) *phis;
if (slp_node)
group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
if (nested_in_vect_loop_p (loop, stmt))
{
outer_loop = loop;
loop = loop->inner;
nested_in_vect_loop = true;
gcc_assert (!slp_node);
}
switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
......@@ -2983,47 +2998,80 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gcc_assert (vectype);
mode = TYPE_MODE (vectype);
/*** 1. Create the reduction def-use cycle ***/
/* 1. Create the reduction def-use cycle:
Set the arguments of REDUCTION_PHIS, i.e., transform
loop:
vec_def = phi <null, null> # REDUCTION_PHI
VECT_DEF = vector_stmt # vectorized form of STMT
...
/* For the case of reduction, vect_get_vec_def_for_operand returns
the scalar def before the loop, that defines the initial value
of the reduction variable. */
vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
&adjustment_def);
into:
phi = reduction_phi;
def = vect_def;
for (j = 0; j < ncopies; j++)
loop:
vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
VECT_DEF = vector_stmt # vectorized form of STMT
...
(in case of SLP, do it for all the phis). */
/* Get the loop-entry arguments. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
else
{
/* 1.1 set the loop-entry arg of the reduction-phi: */
add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
vec_initial_defs = VEC_alloc (tree, heap, 1);
/* For the case of reduction, vect_get_vec_def_for_operand returns
the scalar def before the loop, that defines the initial value
of the reduction variable. */
vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
&adjustment_def);
VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
}
/* 1.2 set the loop-latch arg for the reduction-phi: */
if (j > 0)
def = vect_get_vec_def_for_stmt_copy (dt, def);
add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
/* Set phi nodes arguments. */
for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
{
tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
tree def = VEC_index (tree, vect_defs, i);
for (j = 0; j < ncopies; j++)
{
/* Set the loop-entry arg of the reduction-phi. */
add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "transform reduction: created def-use cycle: ");
print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
fprintf (vect_dump, "\n");
print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
}
/* Set the loop-latch arg for the reduction-phi. */
if (j > 0)
def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "transform reduction: created def-use"
" cycle: ");
print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
fprintf (vect_dump, "\n");
print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
TDF_SLIM);
}
phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
}
}
/*** 2. Create epilog code
The reduction epilog code operates across the elements of the vector
of partial results computed by the vectorized loop.
The reduction epilog code consists of:
step 1: compute the scalar result in a vector (v_out2)
step 2: extract the scalar result (s_out3) from the vector (v_out2)
step 3: adjust the scalar result (s_out3) if needed.
VEC_free (tree, heap, vec_initial_defs);
/* 2. Create epilog code.
The reduction epilog code operates across the elements of the vector
of partial results computed by the vectorized loop.
The reduction epilog code consists of:
step 1: compute the scalar result in a vector (v_out2)
step 2: extract the scalar result (s_out3) from the vector (v_out2)
step 3: adjust the scalar result (s_out3) if needed.
Step 1 can be accomplished using one of the following three schemes:
Step 1 can be accomplished using one of the following three schemes:
(scheme 1) using reduc_code, if available.
(scheme 2) using whole-vector shifts, if available.
(scheme 3) using a scalar loop. In this case steps 1+2 above are
......@@ -3038,29 +3086,33 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
s_out4 = adjust_result <s_out3> # step 3
(step 3 is optional, and steps 1 and 2 may be combined).
Lastly, the uses of s_out0 are replaced by s_out4.
Lastly, the uses of s_out0 are replaced by s_out4. */
***/
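/* Editorial worked example (not part of the patch): scheme 2 reduces the
   vector of partial sums by repeated halving.  For a four-element vector
   {a, b, c, d}:
     shift by two elements, then add  ->  {a+c, b+d, _, _}
     shift by one element, then add   ->  {a+b+c+d, _, _, _}
   After log2(nunits) steps a single lane holds the complete sum; which
   lane that is depends on the target's shift direction.  */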
/* 2.1 Create new loop-exit-phi to preserve loop-closed form:
v_out1 = phi <v_loop> */
/* 2.1 Create new loop-exit-phis to preserve loop-closed form:
v_out1 = phi <VECT_DEF>
Store them in NEW_PHIS. */
exit_bb = single_exit (loop)->dest;
def = vect_def;
prev_phi_info = NULL;
for (j = 0; j < ncopies; j++)
new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
{
phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
if (j == 0)
new_phi = phi;
else
{
def = vect_get_vec_def_for_stmt_copy (dt, def);
STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
prev_phi_info = vinfo_for_stmt (phi);
for (j = 0; j < ncopies; j++)
{
phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
if (j == 0)
VEC_quick_push (gimple, new_phis, phi);
else
{
def = vect_get_vec_def_for_stmt_copy (dt, def);
STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
prev_phi_info = vinfo_for_stmt (phi);
}
}
exit_gsi = gsi_after_labels (exit_bb);
......@@ -3089,16 +3141,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
}
code = gimple_assign_rhs_code (orig_stmt);
/* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
partial results are added and not subtracted. */
if (code == MINUS_EXPR)
code = PLUS_EXPR;
scalar_dest = gimple_assign_lhs (orig_stmt);
scalar_type = TREE_TYPE (scalar_dest);
scalar_results = VEC_alloc (tree, heap, group_size);
new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
bitsize = TYPE_SIZE (scalar_type);
/* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
partial results are added and not subtracted. */
if (code == MINUS_EXPR)
code = PLUS_EXPR;
/* In case this is a reduction in an inner-loop while vectorizing an outer
loop - we don't need to extract a single scalar result at the end of the
inner-loop (unless it is double reduction, i.e., the use of reduction is
......@@ -3108,28 +3161,21 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
if (nested_in_vect_loop && !double_reduc)
goto vect_finalize_reduction;
/* The epilogue is created for the outer-loop, i.e., for the loop being
vectorized. */
if (double_reduc)
loop = outer_loop;
/* FORNOW */
gcc_assert (ncopies == 1);
/* 2.3 Create the reduction code, using one of the three schemes described
above. */
if (reduc_code != ERROR_MARK)
above. In SLP we simply need to extract all the elements from the
vector (without reducing them), so we use scalar shifts. */
if (reduc_code != ERROR_MARK && !slp_node)
{
tree tmp;
/*** Case 1: Create:
v_out2 = reduc_expr <v_out1> */
v_out2 = reduc_expr <v_out1> */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using direct vector reduction.");
fprintf (vect_dump, "Reduce using direct vector reduction.");
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_phi = VEC_index (gimple, new_phis, 0);
tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
epilog_stmt = gimple_build_assign (vec_dest, tmp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
......@@ -3148,142 +3194,182 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
tree vec_temp;
if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
shift_code = VEC_RSHIFT_EXPR;
shift_code = VEC_RSHIFT_EXPR;
else
have_whole_vector_shift = false;
have_whole_vector_shift = false;
/* Regardless of whether we have a whole vector shift, if we're
emulating the operation via tree-vect-generic, we don't want
to use it. Only the first round of the reduction is likely
to still be profitable via emulation. */
emulating the operation via tree-vect-generic, we don't want
to use it. Only the first round of the reduction is likely
to still be profitable via emulation. */
/* ??? It might be better to emit a reduction tree code here, so that
tree-vect-generic can expand the first round via bit tricks. */
tree-vect-generic can expand the first round via bit tricks. */
if (!VECTOR_MODE_P (mode))
have_whole_vector_shift = false;
have_whole_vector_shift = false;
else
{
optab optab = optab_for_tree_code (code, vectype, optab_default);
if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
have_whole_vector_shift = false;
}
if (have_whole_vector_shift)
{
/*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
Create: va = vop <va, va'>
} */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using vector shifts");
optab optab = optab_for_tree_code (code, vectype, optab_default);
if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
have_whole_vector_shift = false;
}
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_temp = PHI_RESULT (new_phi);
if (have_whole_vector_shift && !slp_node)
{
/*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
Create: va = vop <va, va'>
} */
for (bit_offset = vec_size_in_bits/2;
bit_offset >= element_bitsize;
bit_offset /= 2)
{
tree bitpos = size_int (bit_offset);
epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
new_temp, bitpos);
new_name = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
new_name, new_temp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using vector shifts");
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_phi = VEC_index (gimple, new_phis, 0);
new_temp = PHI_RESULT (new_phi);
for (bit_offset = vec_size_in_bits/2;
bit_offset >= element_bitsize;
bit_offset /= 2)
{
tree bitpos = size_int (bit_offset);
epilog_stmt = gimple_build_assign_with_ops (shift_code,
vec_dest, new_temp, bitpos);
new_name = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
new_name, new_temp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
extract_scalar_result = true;
}
extract_scalar_result = true;
}
else
{
tree rhs;
/*** Case 3: Create:
s = extract_field <v_out2, 0>
for (offset = element_size;
offset < vector_size;
offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'>
} */
tree rhs;
/*** Case 3: Create:
s = extract_field <v_out2, 0>
for (offset = element_size;
offset < vector_size;
offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'> // For non SLP cases
} */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using scalar code. ");
vec_temp = PHI_RESULT (new_phi);
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
bit_offset += element_bitsize)
{
tree bitpos = bitsize_int (bit_offset);
tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest,
new_name, new_temp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using scalar code. ");
extract_scalar_result = false;
}
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
{
vec_temp = PHI_RESULT (new_phi);
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
/* In SLP we don't need to apply reduction operation, so we just
collect s' values in SCALAR_RESULTS. */
if (slp_node)
VEC_safe_push (tree, heap, scalar_results, new_temp);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
bit_offset += element_bitsize)
{
tree bitpos = bitsize_int (bit_offset);
tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
bitsize, bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
if (slp_node)
{
/* In SLP we don't need to apply reduction operation, so
we just collect s' values in SCALAR_RESULTS. */
new_temp = new_name;
VEC_safe_push (tree, heap, scalar_results, new_name);
}
else
{
epilog_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest, new_name, new_temp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
}
}
/* The only case where we need to reduce scalar results in SLP, is
unrolling. If the size of SCALAR_RESULTS is greater than
GROUP_SIZE, we reduce them combining elements modulo
GROUP_SIZE. */
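/* Editorial example (not part of the patch): with GROUP_SIZE == 2 and four
   extracted scalars s0..s3 (SLP unrolling by two), the loop below folds s2
   into slot 0 and s3 into slot 1, leaving one combined result per
   reduction in the group.  */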
if (slp_node)
{
tree res, first_res, new_res;
gimple new_stmt;
/* Reduce multiple scalar results in case of SLP unrolling. */
for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
j++)
{
first_res = VEC_index (tree, scalar_results, j % group_size);
new_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest, first_res, res);
new_res = make_ssa_name (new_scalar_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_res);
gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
VEC_replace (tree, scalar_results, j % group_size, new_res);
}
}
else
/* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
VEC_safe_push (tree, heap, scalar_results, new_temp);
extract_scalar_result = false;
}
}
/* 2.4 Extract the final scalar result. Create:
s_out3 = extract_field <v_out2, bitpos> */
s_out3 = extract_field <v_out2, bitpos> */
if (extract_scalar_result)
{
tree rhs;
gcc_assert (!nested_in_vect_loop || double_reduc);
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "extract scalar result");
fprintf (vect_dump, "extract scalar result");
if (BYTES_BIG_ENDIAN)
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
TYPE_SIZE (scalar_type));
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
TYPE_SIZE (scalar_type));
else
bitpos = bitsize_zero_node;
bitpos = bitsize_zero_node;
rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
VEC_safe_push (tree, heap, scalar_results, new_temp);
}
vect_finalize_reduction:
if (double_reduc)
loop = loop->inner;
/* 2.5 Adjust the final result by the initial value of the reduction
variable. (When such adjustment is not needed, then
'adjustment_def' is zero). For example, if code is PLUS we create:
......@@ -3291,14 +3377,17 @@ vect_finalize_reduction:
if (adjustment_def)
{
gcc_assert (!slp_node);
if (nested_in_vect_loop)
{
new_phi = VEC_index (gimple, new_phis, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, vectype);
}
else
{
new_temp = VEC_index (tree, scalar_results, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
expr = build2 (code, scalar_type, new_temp, adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, scalar_type);
......@@ -3309,142 +3398,206 @@ vect_finalize_reduction:
gimple_assign_set_lhs (epilog_stmt, new_temp);
SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
if (nested_in_vect_loop)
{
set_vinfo_for_stmt (epilog_stmt,
new_stmt_vec_info (epilog_stmt, loop_vinfo,
NULL));
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
if (!double_reduc)
VEC_quick_push (tree, scalar_results, new_temp);
else
VEC_replace (tree, scalar_results, 0, new_temp);
}
else
VEC_replace (tree, scalar_results, 0, new_temp);
VEC_replace (gimple, new_phis, 0, epilog_stmt);
}
/* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
phis with new adjusted scalar results, i.e., replace use <s_out0>
with use <s_out4>.
/* 2.6 Handle the loop-exit phi */
Transform:
loop_exit:
s_out0 = phi <s_loop> # (scalar) EXIT_PHI
v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
s_out4 = adjust_result <s_out3>
use <s_out0>
use <s_out0>
into:
/* Replace uses of s_out0 with uses of s_out3:
Find the loop-closed-use at the loop exit of the original scalar result.
(The reduction result is expected to have two immediate uses - one at the
latch block, and one at the loop exit). */
phis = VEC_alloc (gimple, heap, 10);
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
loop_exit:
s_out0 = phi <s_loop> # (scalar) EXIT_PHI
v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
s_out4 = adjust_result <s_out3>
use <s_out4> */
/* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
case that GROUP_SIZE is greater than vectorization factor). Therefore, we
need to match SCALAR_RESULTS with corresponding statements. The first
(GROUP_SIZE / number of new vector stmts) scalar results correspond to
the first vector stmt, etc.
(RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
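/* Editorial example (not part of the patch): with GROUP_SIZE == 8 and two
   new vector statements, RATIO == 4, so scalar results 0-3 are matched
   with the first exit phi and scalar results 4-7 with the second.  */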
ratio = group_size / VEC_length (gimple, new_phis);
gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
for (k = 0; k < group_size; k++)
{
if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
{
exit_phi = USE_STMT (use_p);
VEC_quick_push (gimple, phis, exit_phi);
}
}
if (k % ratio == 0)
{
epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
}
/* We expect to have found an exit_phi because of loop-closed-ssa form. */
gcc_assert (!VEC_empty (gimple, phis));
if (slp_node)
{
gimple current_stmt = VEC_index (gimple,
SLP_TREE_SCALAR_STMTS (slp_node), k);
for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
{
if (nested_in_vect_loop)
{
stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
gimple vect_phi;
/* FORNOW. Currently not supporting the case that an inner-loop
reduction is not used in the outer-loop (but only outside the
outer-loop), unless it is double reduction. */
gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
&& !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
set_vinfo_for_stmt (epilog_stmt,
new_stmt_vec_info (epilog_stmt, loop_vinfo,
NULL));
if (adjustment_def)
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
if (!double_reduc
|| STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
continue;
/* Handle double reduction:
stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop)
stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
At that point the regular reduction (stmt2 and stmt3) is already
vectorized, as well as the exit phi node, stmt4.
Here we vectorize the phi node of double reduction, stmt1, and
update all relevant statements. */
/* Go through all the uses of s2 to find double reduction phi node,
i.e., stmt1 above. */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
/* SLP statements can't participate in patterns. */
gcc_assert (!orig_stmt);
scalar_dest = gimple_assign_lhs (current_stmt);
}
phis = VEC_alloc (gimple, heap, 3);
/* Find the loop-closed-use at the loop exit of the original scalar
result. (The reduction result is expected to have two immediate uses -
one at the latch block, and one at the loop exit). */
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
/* We expect to have found an exit_phi because of loop-closed-ssa
form. */
gcc_assert (!VEC_empty (gimple, phis));
for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
{
if (outer_loop)
{
stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
stmt_vec_info new_phi_vinfo;
tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
basic_block bb = gimple_bb (use_stmt);
gimple use;
/* Check that USE_STMT is really double reduction phi node. */
if (gimple_code (use_stmt) != GIMPLE_PHI
|| gimple_phi_num_args (use_stmt) != 2
|| !use_stmt_vinfo
|| STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
!= vect_double_reduction_def
|| bb->loop_father != outer_loop)
stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
gimple vect_phi;
/* FORNOW. Currently not supporting the case that an inner-loop
reduction is not used in the outer-loop (but only outside the
outer-loop), unless it is double reduction. */
gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
&& !STMT_VINFO_LIVE_P (exit_phi_vinfo))
|| double_reduc);
STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
if (!double_reduc
|| STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
!= vect_double_reduction_def)
continue;
/* Create vector phi node for double reduction:
vs1 = phi <vs0, vs2>
vs1 was created previously in this function by a call to
vect_get_vec_def_for_operand and is stored in vec_initial_def;
vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
vs0 is created here. */
/* Handle double reduction:
/* Create vector phi node. */
vect_phi = create_phi_node (vec_initial_def, bb);
new_phi_vinfo = new_stmt_vec_info (vect_phi,
loop_vec_info_for_loop (outer_loop), NULL);
set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
/* Create vs0 - initial def of the double reduction phi. */
preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
loop_preheader_edge (outer_loop));
init_def = get_initial_def_for_reduction (stmt, preheader_arg,
NULL);
vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
NULL);
/* Update phi node arguments with vs0 and vs2. */
add_phi_arg (vect_phi, vect_phi_init,
loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "created double reduction phi node: ");
print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
}
vect_phi_res = PHI_RESULT (vect_phi);
At that point the regular reduction (stmt2 and stmt3) is
already vectorized, as well as the exit phi node, stmt4.
Here we vectorize the phi node of double reduction, stmt1, and
update all relevant statements. */
/* Replace the use, i.e., set the correct vs1 in the regular
reduction phi node. FORNOW, NCOPIES is always 1, so the loop
is redundant. */
use = reduction_phi;
for (j = 0; j < ncopies; j++)
/* Go through all the uses of s2 to find double reduction phi
node, i.e., stmt1 above. */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
{
edge pr_edge = loop_preheader_edge (loop);
SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
stmt_vec_info new_phi_vinfo;
tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
basic_block bb = gimple_bb (use_stmt);
gimple use;
/* Check that USE_STMT is really double reduction phi
node. */
if (gimple_code (use_stmt) != GIMPLE_PHI
|| gimple_phi_num_args (use_stmt) != 2
|| !use_stmt_vinfo
|| STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
!= vect_double_reduction_def
|| bb->loop_father != outer_loop)
continue;
/* Create vector phi node for double reduction:
vs1 = phi <vs0, vs2>
vs1 was created previously in this function by a call to
vect_get_vec_def_for_operand and is stored in
vec_initial_def;
vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
vs0 is created here. */
/* Create vector phi node. */
vect_phi = create_phi_node (vec_initial_def, bb);
new_phi_vinfo = new_stmt_vec_info (vect_phi,
loop_vec_info_for_loop (outer_loop), NULL);
set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
/* Create vs0 - initial def of the double reduction phi. */
preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
loop_preheader_edge (outer_loop));
init_def = get_initial_def_for_reduction (stmt,
preheader_arg, NULL);
vect_phi_init = vect_init_vector (use_stmt, init_def,
vectype, NULL);
/* Update phi node arguments with vs0 and vs2. */
add_phi_arg (vect_phi, vect_phi_init,
loop_preheader_edge (outer_loop),
UNKNOWN_LOCATION);
add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "created double reduction phi "
"node: ");
print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
}
vect_phi_res = PHI_RESULT (vect_phi);
/* Replace the use, i.e., set the correct vs1 in the regular
reduction phi node. FORNOW, NCOPIES is always 1, so the
loop is redundant. */
use = reduction_phi;
for (j = 0; j < ncopies; j++)
{
edge pr_edge = loop_preheader_edge (loop);
SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
}
}
}
}
/* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
SET_USE (use_p, new_temp);
/* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
scalar_result = VEC_index (tree, scalar_results, k);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
SET_USE (use_p, scalar_result);
}
VEC_free (gimple, heap, phis);
}
VEC_free (gimple, heap, phis);
}
VEC_free (tree, heap, scalar_results);
VEC_free (gimple, heap, new_phis);
}
/* Function vectorizable_reduction.
......@@ -3489,7 +3642,7 @@ vect_finalize_reduction:
bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
gimple *vec_stmt)
gimple *vec_stmt, slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
......@@ -3517,7 +3670,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
int ncopies;
int epilog_copies;
stmt_vec_info prev_stmt_info, prev_phi_info;
gimple first_phi = NULL;
bool single_defuse_cycle = false;
tree reduc_def = NULL_TREE;
gimple new_stmt = NULL;
......@@ -3532,6 +3684,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
struct loop * def_stmt_loop, *outer_loop = NULL;
tree def_arg;
gimple def_arg_stmt;
VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
VEC (gimple, heap) *phis = NULL;
int vec_num;
tree def0, def1;
if (nested_in_vect_loop_p (loop, stmt))
{
......@@ -3540,10 +3696,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
nested_cycle = true;
}
/* FORNOW: SLP not supported. */
if (STMT_SLP_TYPE (stmt_info))
return false;
/* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
......@@ -3676,9 +3828,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
return false;
if (slp_node)
ncopies = 1;
else
ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
/ TYPE_VECTOR_SUBPARTS (vectype_in));
ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
/ TYPE_VECTOR_SUBPARTS (vectype_in));
gcc_assert (ncopies >= 1);
vec_mode = TYPE_MODE (vectype_in);
......@@ -3897,23 +4052,48 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
prev_stmt_info = NULL;
prev_phi_info = NULL;
if (slp_node)
{
vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
== TYPE_VECTOR_SUBPARTS (vectype_in));
}
else
{
vec_num = 1;
vec_oprnds0 = VEC_alloc (tree, heap, 1);
if (op_type == ternary_op)
vec_oprnds1 = VEC_alloc (tree, heap, 1);
}
phis = VEC_alloc (gimple, heap, vec_num);
vect_defs = VEC_alloc (tree, heap, vec_num);
if (!slp_node)
VEC_quick_push (tree, vect_defs, NULL_TREE);
for (j = 0; j < ncopies; j++)
{
if (j == 0 || !single_defuse_cycle)
{
/* Create the reduction-phi that defines the reduction-operand. */
new_phi = create_phi_node (vec_dest, loop->header);
set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
NULL));
/* Get the vector def for the reduction variable from the phi
node. */
reduc_def = PHI_RESULT (new_phi);
}
for (i = 0; i < vec_num; i++)
{
/* Create the reduction-phi that defines the reduction
operand. */
new_phi = create_phi_node (vec_dest, loop->header);
set_vinfo_for_stmt (new_phi,
new_stmt_vec_info (new_phi, loop_vinfo,
NULL));
if (j == 0 || slp_node)
VEC_quick_push (gimple, phis, new_phi);
}
}
if (code == COND_EXPR)
{
first_phi = new_phi;
vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
gcc_assert (!slp_node);
vectorizable_condition (stmt, gsi, vec_stmt,
PHI_RESULT (VEC_index (gimple, phis, 0)),
reduc_index);
/* Multiple types are not supported for condition. */
break;
}
......@@ -3921,65 +4101,94 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Handle uses. */
if (j == 0)
{
loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
stmt, NULL);
if (op_type == ternary_op)
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
if (reduc_index == 0)
loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
NULL);
else
loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
NULL);
loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
stmt, NULL);
VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
if (op_type == ternary_op)
{
if (reduc_index == 0)
loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
NULL);
else
loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
NULL);
VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
}
}
/* Get the vector def for the reduction variable from the phi
node. */
first_phi = new_phi;
}
else
{
enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
if (op_type == ternary_op)
loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
if (!slp_node)
{
enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
if (op_type == ternary_op)
{
loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
loop_vec_def1);
VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
}
}
if (single_defuse_cycle)
reduc_def = gimple_assign_lhs (new_stmt);
else
reduc_def = PHI_RESULT (new_phi);
if (single_defuse_cycle)
reduc_def = gimple_assign_lhs (new_stmt);
STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
}
/* Arguments are ready. Create the new vector stmt. */
if (op_type == binary_op)
for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
{
if (reduc_index == 0)
expr = build2 (code, vectype_out, reduc_def, loop_vec_def0);
if (slp_node)
reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
else
expr = build2 (code, vectype_out, loop_vec_def0, reduc_def);
}
else
{
if (reduc_index == 0)
expr = build3 (code, vectype_out, reduc_def, loop_vec_def0,
loop_vec_def1);
{
if (!single_defuse_cycle || j == 0)
reduc_def = PHI_RESULT (new_phi);
}
def1 = ((op_type == ternary_op)
? VEC_index (tree, vec_oprnds1, i) : NULL);
if (op_type == binary_op)
{
if (reduc_index == 0)
expr = build2 (code, vectype_out, reduc_def, def0);
else
expr = build2 (code, vectype_out, def0, reduc_def);
}
else
{
if (reduc_index == 1)
expr = build3 (code, vectype_out, loop_vec_def0, reduc_def,
loop_vec_def1);
if (reduc_index == 0)
expr = build3 (code, vectype_out, reduc_def, def0, def1);
else
expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1,
reduc_def);
{
if (reduc_index == 1)
expr = build3 (code, vectype_out, def0, reduc_def, def1);
else
expr = build3 (code, vectype_out, def0, def1, reduc_def);
}
}
new_stmt = gimple_build_assign (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_temp);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
if (slp_node)
{
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
VEC_quick_push (tree, vect_defs, new_temp);
}
else
VEC_replace (tree, vect_defs, 0, new_temp);
}
new_stmt = gimple_build_assign (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_temp);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
if (slp_node)
continue;
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
......@@ -3992,12 +4201,21 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Finalize the reduction-phi (set its arguments) and create the
epilog reduction code. */
if (!single_defuse_cycle || code == COND_EXPR)
new_temp = gimple_assign_lhs (*vec_stmt);
if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
{
new_temp = gimple_assign_lhs (*vec_stmt);
VEC_replace (tree, vect_defs, 0, new_temp);
}
vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
epilog_reduc_code, phis, reduc_index,
double_reduc, slp_node);
VEC_free (gimple, heap, phis);
VEC_free (tree, heap, vec_oprnds0);
if (vec_oprnds1)
VEC_free (tree, heap, vec_oprnds1);
vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
epilog_reduc_code, first_phi, reduc_index,
double_reduc);
return true;
}
......
......@@ -670,6 +670,8 @@ vect_pattern_recog_1 (
tree pattern_vectype;
tree type_in, type_out;
enum tree_code code;
int i;
gimple next;
pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
if (!pattern_stmt)
......@@ -735,7 +737,13 @@ vect_pattern_recog_1 (
STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt;
return;
/* Patterns cannot be vectorized using SLP, because they change the order of
computation. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
if (next == stmt)
VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i);
}
......
......@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
break;
case vect_internal_def:
case vect_reduction_def:
if (i == 0)
VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
else
......@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
HOST_WIDE_INT dummy;
bool permutation = false;
unsigned int load_place;
gimple first_load;
gimple first_load, prev_first_load = NULL;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
......@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
&pattern0, &pattern1))
return false;
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
> ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
return false;
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
return false;
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
if (prev_first_load)
{
/* Check that there are no loads from different interleaving
chains in the same node. The only exception is complex
numbers. */
if (prev_first_load != first_load
&& rhs_code != REALPART_EXPR
&& rhs_code != IMAGPART_EXPR)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: different "
"interleaving chains in one node ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
}
else
prev_first_load = first_load;
if (first_load == stmt)
{
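The prev_first_load check added above requires every load in one SLP node to come from the same interleaving chain, with REALPART_EXPR/IMAGPART_EXPR as the only exception. A minimal standalone sketch of that invariant, assuming a made-up load descriptor whose chain field stands in for DR_GROUP_FIRST_DR:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
/* Hypothetical load descriptor: CHAIN identifies the interleaving chain the
   load belongs to (the first statement of its DR group in GCC terms).  */
struct load { int chain; };
static bool
same_chain_p (const struct load *loads, size_t n)
{
  for (size_t i = 1; i < n; i++)
    if (loads[i].chain != loads[0].chain)
      return false;   /* different interleaving chains in one node */
  return true;
}
int
main (void)
{
  struct load ok[2] = { { 1 }, { 1 } };
  struct load bad[2] = { { 1 }, { 2 } };
  assert (same_chain_p (ok, 2) && !same_chain_p (bad, 2));
  return 0;
}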
......@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
}
/* Rearrange the statements of NODE according to PERMUTATION. */
static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
VEC (int, heap) *permutation)
{
gimple stmt;
VEC (gimple, heap) *tmp_stmts;
unsigned int index, i;
if (!node)
return;
vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
tmp_stmts = VEC_alloc (gimple, heap, group_size);
for (i = 0; i < group_size; i++)
VEC_safe_push (gimple, heap, tmp_stmts, NULL);
for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
{
index = VEC_index (int, permutation, i);
VEC_replace (gimple, tmp_stmts, index, stmt);
}
VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
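vect_slp_rearrange_stmts stores statement I of each node at position PERMUTATION[I], recursing into the left and right children first. A minimal standalone sketch of the same out-of-place permutation applied to a plain int array (names and sizes are made up for the example):
#include <assert.h>
#include <string.h>
#define GROUP_SIZE 4
/* Element I of the input ends up at position PERMUTATION[I] of the output,
   mirroring the VEC_replace call in vect_slp_rearrange_stmts.  */
static void
rearrange (int *stmts, const int *permutation)
{
  int tmp[GROUP_SIZE];
  for (int i = 0; i < GROUP_SIZE; i++)
    tmp[permutation[i]] = stmts[i];
  memcpy (stmts, tmp, sizeof tmp);
}
int
main (void)
{
  int stmts[GROUP_SIZE] = { 10, 11, 12, 13 };
  const int perm[GROUP_SIZE] = { 2, 0, 3, 1 };
  rearrange (stmts, perm);
  /* 10 went to slot 2, 11 to slot 0, 12 to slot 3, 13 to slot 1.  */
  assert (stmts[0] == 11 && stmts[1] == 13
          && stmts[2] == 10 && stmts[3] == 12);
  return 0;
}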
/* Check if the required load permutation is supported.
LOAD_PERMUTATION contains a list of indices of the loads.
In SLP this permutation is relative to the order of strided stores that are
......@@ -796,9 +850,11 @@ static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
VEC (int, heap) *load_permutation)
{
int i = 0, j, prev = -1, next, k, number_of_groups;
bool supported, bad_permutation = false;
sbitmap load_index;
slp_tree node;
gimple stmt;
/* FORNOW: permutations are only supported in SLP. */
if (!slp_instn)
......@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
fprintf (vect_dump, "%d ", next);
}
/* In case of reduction every load permutation is allowed, since the order
of the reduction statements is not important (as opposed to the case of
strided stores). The only condition we need to check is that all the
load nodes are of the same size and have the same permutation (and then
rearrange all the nodes of the SLP instance according to this
permutation). */
/* Check that all the load nodes are of the same size. */
for (i = 0;
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
i++)
if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
!= (unsigned) group_size)
return false;
node = SLP_INSTANCE_TREE (slp_instn);
stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
/* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
instance, not all the loads belong to the same node or interleaving
group. Hence, we need to divide them into groups according to
GROUP_SIZE. */
number_of_groups = VEC_length (int, load_permutation) / group_size;
/* Reduction (there are no data-refs in the root). */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
{
int first_group_load_index;
/* Compare all the permutation sequences to the first one. */
for (i = 1; i < number_of_groups; i++)
{
k = 0;
for (j = i * group_size; j < i * group_size + group_size; j++)
{
next = VEC_index (int, load_permutation, j);
first_group_load_index = VEC_index (int, load_permutation, k);
if (next != first_group_load_index)
{
bad_permutation = true;
break;
}
k++;
}
if (bad_permutation)
break;
}
if (!bad_permutation)
{
/* This permutation is valid for reduction. Since the order of the
statements in the nodes is not important unless they are memory
accesses, we can rearrange the statements in all the nodes
according to the order of the loads. */
vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
load_permutation);
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
return true;
}
}
/* FORNOW: the only supported permutation is 0..01..1.. of length equal to
GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
well (unless it's reduction). */
if (VEC_length (int, load_permutation)
!= (unsigned int) (group_size * group_size))
return false;
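For reductions, the code above only accepts a load permutation in which every GROUP_SIZE-long slice repeats the first slice; the SLP nodes are then rearranged to that common order. A standalone sketch of that uniformity check (not GCC code; the helper name is invented):
#include <stdbool.h>
/* Return true if every group of GROUP_SIZE indices in LOAD_PERMUTATION
   repeats the sequence of the first group.  */
static bool
uniform_groups_p (const int *load_permutation, int n_loads, int group_size)
{
  int number_of_groups = n_loads / group_size;
  for (int i = 1; i < number_of_groups; i++)
    for (int j = 0; j < group_size; j++)
      if (load_permutation[i * group_size + j] != load_permutation[j])
        return false;
  return true;
}
/* Example: with group_size 2, { 1, 0, 1, 0 } is accepted,
   while { 1, 0, 0, 1 } is rejected.  */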
......@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
slp_tree node = XNEW (struct _slp_tree);
unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
unsigned int unrolling_factor = 1, nunits;
tree vectype, scalar_type = NULL_TREE;
gimple next;
unsigned int vectorization_factor = 0;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
unsigned int max_nunits = 0;
VEC (int, heap) *load_permutation;
VEC (slp_tree, heap) *loads;
struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
if (dr)
{
scalar_type = TREE_TYPE (DR_REF (dr));
vectype = get_vectype_for_scalar_type (scalar_type);
group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
}
else
{
gcc_assert (loop_vinfo);
vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
}
if (!vectype)
{
if (vect_print_dump_info (REPORT_SLP))
......@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
}
return false;
}
......@@ -938,11 +1069,29 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
/* Create a node (a root of the SLP tree) for the packed strided stores. */
SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
next = stmt;
if (dr)
{
/* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
while (next)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
}
}
else
{
/* Collect reduction statements. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "pushing reduction into node: ");
print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
}
}
}
SLP_TREE_VEC_STMTS (node) = NULL;
......@@ -1035,7 +1184,7 @@ bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
unsigned int i;
VEC (gimple, heap) *strided_stores, *reductions = NULL;
gimple store;
bool ok = false;
......@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "=== vect_analyze_slp ===");
if (loop_vinfo)
{
strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
}
else
strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
/* Find SLP sequences starting from groups of strided stores. */
for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
ok = true;
......@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
return false;
}
/* Find SLP sequences starting from groups of reductions. */
if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
&& vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
VEC_index (gimple, reductions, 0)))
ok = true;
return true;
}
......@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
&& !STMT_SLP_TYPE (stmt_vinfo)
&& (STMT_VINFO_RELEVANT (stmt_vinfo)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
&& !(gimple_code (use_stmt) == GIMPLE_PHI
&& STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
== vect_reduction_def))
vect_mark_slp_stmts (node, hybrid, i);
vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
......@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
/* For constant and loop invariant defs of SLP_NODE this function returns
(vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
REDUC_INDEX is the index of the reduction operand in the statements, unless
it is -1. */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
unsigned int op_num, unsigned int number_of_vectors,
int reduc_index)
{
VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple stmt = VEC_index (gimple, stmts, 0);
......@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
int number_of_copies = 1;
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
bool constant_p, is_store;
tree neutral_op = NULL;
if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
{
enum tree_code code = gimple_assign_rhs_code (stmt);
if (reduc_index == -1)
{
VEC_free (tree, heap, *vec_oprnds);
return;
}
op_num = reduc_index - 1;
op = gimple_op (stmt, op_num + 1);
/* For additional copies (see the explanation of NUMBER_OF_COPIES below)
we need either neutral operands or the original operands. See
get_initial_def_for_reduction() for details. */
switch (code)
{
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst0);
else
neutral_op = build_int_cst (TREE_TYPE (op), 0);
break;
case MULT_EXPR:
case BIT_AND_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst1);
else
neutral_op = build_int_cst (TREE_TYPE (op), 1);
break;
default:
neutral_op = NULL;
}
}
if (STMT_VINFO_DATA_REF (stmt_vinfo))
{
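The switch above picks a neutral (identity) value per reduction code, later used to fill vector slots that must not perturb the result: 0 for additive and ior/xor reductions, 1 for multiplicative ones. A small standalone sketch of the idea for a few integer cases (made-up enum, not GCC's tree codes):
#include <assert.h>
enum reduc_op { REDUC_PLUS, REDUC_MULT, REDUC_IOR, REDUC_XOR };
/* The value that leaves the reduction unchanged: 0 for sums and
   bitwise ior/xor, 1 for products.  */
static int
neutral_element (enum reduc_op op)
{
  switch (op)
    {
    case REDUC_MULT:
      return 1;
    default:
      return 0;
    }
}
int
main (void)
{
  assert (neutral_element (REDUC_PLUS) == 0);
  assert (neutral_element (REDUC_MULT) == 1);
  return 0;
}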
......@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
else
op = gimple_op (stmt, op_num + 1);
if (reduc_index != -1)
{
struct loop *loop = (gimple_bb (stmt))->loop_father;
gimple def_stmt = SSA_NAME_DEF_STMT (op);
gcc_assert (loop);
/* Get the def before the loop. */
op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
loop_preheader_edge (loop));
if (j != (number_of_copies - 1) && neutral_op)
op = neutral_op;
}
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
......@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
to replicate the vectors. */
while (number_of_vectors > VEC_length (tree, *vec_oprnds))
{
tree neutral_vec = NULL;
if (neutral_op)
{
if (!neutral_vec)
{
t = NULL;
for (i = 0; i < (unsigned) nunits; i++)
t = tree_cons (NULL_TREE, neutral_op, t);
neutral_vec = build_vector (vector_type, t);
}
VEC_quick_push (tree, *vec_oprnds, neutral_vec);
}
else
{
for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
VEC_quick_push (tree, *vec_oprnds, vop);
}
}
}
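When more initial vectors are needed than were built from the scalar defs, the loop above pads with vectors of the neutral element, which cannot change the reduction's outcome. A tiny standalone demonstration for a sum reduction (plain C, values made up): padding the lanes with 0 leaves the total unchanged.
#include <assert.h>
int
main (void)
{
  int a[6] = { 1, 2, 3, 4, 5, 6 };
  int scalar_sum = 0;
  for (int i = 0; i < 6; i++)
    scalar_sum += a[i];

  /* Two vectors of 4 lanes; the last two lanes hold the neutral element 0.  */
  int v0[4] = { 1, 2, 3, 4 };
  int v1[4] = { 5, 6, 0, 0 };
  int lanes[4];
  for (int i = 0; i < 4; i++)
    lanes[i] = v0[i] + v1[i];
  int vector_sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  assert (vector_sum == scalar_sum);   /* both are 21 */
  return 0;
}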
......@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
VEC (tree,heap) **vec_oprnds1, int reduc_index)
{
gimple first_stmt;
enum tree_code code;
......@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
*vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
/* SLP_NODE corresponds either to a group of stores or to a group of
unary/binary operations. We don't call this function for loads.
For reduction defs we call vect_get_constant_vectors(), since we are
looking for initial loop invariant values. */
if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
reduc_index);
if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
/* Since we don't call this function with loads, this is a group of
stores. */
return;
/* For reductions, we only need initial values. */
if (reduc_index != -1)
return;
code = gimple_assign_rhs_code (first_stmt);
if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
return;
......@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
}
......@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
si = gsi_for_stmt (stmt);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
if (is_store)
{
if (DR_GROUP_FIRST_DR (stmt_info))
/* If IS_STORE is TRUE, the vectorization of the
interleaving chain was completed - free all the stores in
the chain. */
vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
else
/* FORNOW: SLP originates only from strided stores. */
gcc_unreachable ();
return true;
}
/* FORNOW: SLP originates only from strided stores. */
return false;
return is_store;
}
......@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "vectorizing stmts using SLP.");
}
for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
{
slp_tree root = SLP_INSTANCE_TREE (instance);
gimple store;
unsigned int j;
gimple_stmt_iterator gsi;
for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
&& j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
{
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
break;
/* Free the attached stmt_vec_info and remove the stmt. */
gsi = gsi_for_stmt (store);
gsi_remove (&gsi, true);
free_stmt_vec_info (store);
}
}
return is_store;
}
......
......@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
slp_tree slp_node)
{
if (slp_node)
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
else
{
tree vec_oprnd;
......@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
{
/* Handle uses. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
else
{
VEC_free (tree, heap, vec_oprnds0);
......@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
if (j == 0)
{
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
......@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (slp)
{
/* Get vectorized arguments for SLP_NODE. */
vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);
vec_oprnd = VEC_index (tree, vec_oprnds, 0);
}
......@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL, NULL, 0));
else
{
......@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case reduc_vec_info_type:
gcc_assert (!slp_node);
done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
......
......@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
/* The unrolling factor needed to SLP the loop. In case pure SLP is applied
   to the loop, i.e., no unrolling is needed, this is 1. */
unsigned slp_unrolling_factor;
/* Reduction cycles detected in the loop. Used in loop-aware SLP. */
VEC (gimple, heap) *reductions;
} *loop_vec_info;
/* Access Functions. */
......@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
......@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
......@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
extern void vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
VEC (tree,heap) **, int);
extern LOC find_bb_location (basic_block);
extern bb_vec_info vect_slp_analyze_bb (basic_block);
extern void vect_slp_transform_bb (basic_block);
......