Commit b5aeb3bb authored and committed by Ira Rosen

re PR tree-optimization/37027 (SLP loop vectorization missing support for reductions)


	PR tree-optimization/37027
	* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions 
	and macro to access it.
	(vectorizable_reduction): Add argument.
	(vect_get_slp_defs): Likewise.
	* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
	statements for possible use in SLP.
	(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
	(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
	(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
	add new argument.
	(vectorizable_reduction): Likewise.
	* tree-vect-stmts.c (vect_get_vec_defs): Update call to 
	vect_get_slp_defs.
	(vectorizable_type_demotion, vectorizable_type_promotion,
	vectorizable_store): Likewise.
	(vect_analyze_stmt): Update call to vectorizable_reduction.
	(vect_transform_stmt): Likewise.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
	(vect_build_slp_tree): Fix indentation. Check that there are no loads
	from different interleaving chains in same node.
	(vect_slp_rearrange_stmts): New function.
	(vect_supported_load_permutation_p): Allow load permutations for 
	reductions. Call vect_slp_rearrange_stmts() to rearrange statements
	inside SLP nodes if necessary.
	(vect_analyze_slp_instance): Handle reductions.
	(vect_analyze_slp): Try to build SLP instances originating from groups
	of reductions.
	(vect_detect_hybrid_slp_stmts): Skip reduction statements.
	(vect_get_constant_vectors): Create initial vectors for reductions
	according to reduction code. Add new argument.
	(vect_get_slp_defs): Add new argument, pass it to 
	vect_get_constant_vectors.
	(vect_schedule_slp_instance): Remove SLP tree root statements.

From-SVN: r158506
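
To illustrate what the patch enables, here is an editorial sketch (not part
of the commit, and not literal vectorizer output): a pair of scalar
reductions in one loop, such as

    for (i = 0; i < n; i++)
      {
        sum0 += a[2*i];
        sum1 += a[2*i + 1];
      }

can now be SLP-vectorized into a single vector accumulator whose lanes carry
the individual sums, with an epilog that extracts the lanes and folds extra
copies modulo the group size.  A rough sketch of the resulting shape, written
with GNU C vector extensions (the type v4si, the array, and the function name
are illustrative assumptions):

    typedef int v4si __attribute__ ((vector_size (16)));

    int a[64] __attribute__ ((aligned (16)));

    void
    slp_reduc_sketch (int n, int *res0, int *res1)
    {
      /* Lanes 0/2 accumulate the even-index sum, lanes 1/3 the odd-index
         sum (group size 2, four int elements per vector).  The scalar
         tail for 2*n not divisible by 4 is omitted.  */
      v4si vsum = { 0, 0, 0, 0 };
      int i;

      for (i = 0; i + 4 <= 2 * n; i += 4)
        vsum += *(v4si *) &a[i];   /* one vector add per two scalar iterations */

      /* Epilog: extract the lanes and combine them modulo the group size.  */
      *res0 = vsum[0] + vsum[2];
      *res1 = vsum[1] + vsum[3];
    }

The gcc.dg/vect/slp-reduc-* tests below exercise exactly this kind of loop.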
parent 5a2fa9e8
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
and macro to access it.
(vectorizable_reduction): Add argument.
(vect_get_slp_defs): Likewise.
* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
statements for possible use in SLP.
(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
add new argument.
(vectorizable_reduction): Likewise.
* tree-vect-stmts.c (vect_get_vec_defs): Update call to
vect_get_slp_defs.
(vectorizable_type_demotion, vectorizable_type_promotion,
vectorizable_store): Likewise.
(vect_analyze_stmt): Update call to vectorizable_reduction.
(vect_transform_stmt): Likewise.
* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
(vect_build_slp_tree): Fix indentation. Check that there are no loads
from different interleaving chains in same node.
(vect_slp_rearrange_stmts): New function.
(vect_supported_load_permutation_p): Allow load permutations for
reductions. Call vect_slp_rearrange_stmts() to rearrange statements
inside SLP nodes if necessary.
(vect_analyze_slp_instance): Handle reductions.
(vect_analyze_slp): Try to build SLP instances originating from groups
of reductions.
(vect_detect_hybrid_slp_stmts): Skip reduction statements.
(vect_get_constant_vectors): Create initial vectors for reductions
according to reduction code. Add new argument.
(vect_get_slp_defs): Add new argument, pass it to
vect_get_constant_vectors.
(vect_schedule_slp_instance): Remove SLP tree root statements.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* tree.h (ENUM_IS_SCOPED): Define.
......
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* lib/target-supports.exp
(check_effective_target_vect_widen_sum_hi_to_si_pattern): New.
* gcc.dg/vect/pr37027.c: New test.
* gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c,
gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c,
gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c,
gcc.dg/vect/vect-complex-6.c: Likewise.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* g++.dg/debug/dwarf2/enum1.C: New test.
......
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
struct mystr
{
int f1;
int f2;
};
struct mystr a[16];
struct mystr b[16];
int res1, res2;
void
foo (void)
{
int i;
int sum1;
int sum2;
for (i = 0; i < 16; i++)
{
sum1 += a[i].f1 + b[i].f1;
sum2 += a[i].f2 + b[i].f2;
}
res1 = sum1;
res2 = sum2;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30;
for (i = 0; i < n; i++) {
udiff3 += (ub[4*i + 3] - uc[4*i + 3]);
udiff2 += (ub[4*i + 2] - uc[4*i + 2]);
udiff1 += (ub[4*i + 1] - uc[4*i + 1]);
udiff0 += (ub[4*i] - uc[4*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1
|| udiff2 != res2
|| udiff3 != res3)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/4, 53, 66, 84, 102);
main1 (N/4 - 1, 29, 40, 56, 72);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP (with unrolling). */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10;
for (i = 0; i < n; i++) {
udiff1 += (ub[2*i + 1] - uc[2*i + 1]);
udiff0 += (ub[2*i] - uc[2*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/2, 117, 138, 84, 102);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include "tree-vect.h"
#define N 64
#define DOT1 21834
#define DOT2 21876
unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
/* short->short->int dot product.
Not detected as a dot-product pattern.
Requires support for non-widening multiplication and widening-summation.
Vectorized with loop-aware SLP. */
__attribute__ ((noinline)) unsigned int
foo1(int len, int *result1, int *result2)
{
int i;
unsigned int res1 = 10, res2 = 20;
unsigned short prod;
for (i=0; i<len; i++) {
prod = X[2*i] * Y[2*i];
res1 += prod;
prod = X[2*i+1] * Y[2*i+1];
res2 += prod;
}
*result1 = res1;
*result2 = res2;
return 0;
}
int main (void)
{
unsigned int dot1, dot2;
unsigned short i;
check_vect ();
for (i=0; i<N; i++) {
X[i] = i;
Y[i] = 64-i;
}
foo1 (N/2, &dot1, &dot2);
if (dot1 != DOT1 || dot2 != DOT2)
abort ();
return 0;
}
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
unsigned int uc[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7)
{
int i;
unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313;
for (i = 0; i < n; i++) {
max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2;
max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3;
max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1;
max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7;
max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6;
max0 = max0 < uc[8*i] ? uc[8*i] : max0;
max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4;
max5 = max5 < uc[8*i+5] ? uc[8*i+5] : max5;
}
/* Check results: */
if (max0 != res0
|| max1 != res1
|| max2 != res2
|| max3 != res3
|| max4 != res4
|| max5 != res5
|| max6 != res6
|| max7 != res7)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
uc[i] = i+3;
main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int c[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int max0 = -100, max1 = -313;
for (i = 0; i < n; i++) {
max1 = max1 < c[2*i+1] ? c[2*i+1] : max1;
max0 = max0 < c[2*i] ? c[2*i] : max0;
}
/* Check results: */
if (max0 != res0
|| max1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
c[i] = (i+3) * -1;
c[0] = c[1] = -100;
main1 (N/2, -5, -6);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int a[N], b[N];
/* Vectorization of reduction. Loop-aware SLP is not possible, because of
different arrays. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int sum0 = 0, sum1 = 0;
for (i = 0; i < n; i++) {
sum1 += a[2*i];
sum0 += b[2*i];
}
/* Check results: */
if (sum0 != res0
|| sum1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
a[i] = b[i] = i;
main1 (N/2, 4032, 4032);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } {
return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
#
# This won't change for different subtargets so cache the result.
proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
global et_vect_widen_sum_hi_to_si_pattern
if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] {
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2
} else {
set et_vect_widen_sum_hi_to_si_pattern_saved 0
if { [istarget powerpc*-*-*] } {
set et_vect_widen_sum_hi_to_si_pattern_saved 1
}
}
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2
return $et_vect_widen_sum_hi_to_si_pattern_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
......
......@@ -545,6 +545,11 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
vect_reduction_def;
/* Store the reduction cycles for possible vectorization in
loop-aware SLP. */
VEC_safe_push (gimple, heap,
LOOP_VINFO_REDUCTIONS (loop_vinfo),
reduc_stmt);
}
}
}
......@@ -745,6 +750,7 @@ new_loop_vec_info (struct loop *loop)
VEC_alloc (ddr_p, heap,
PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
......@@ -835,6 +841,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
free (loop_vinfo);
loop->aux = NULL;
......@@ -1223,7 +1230,6 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
if ((STMT_VINFO_RELEVANT_P (stmt_info)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
&& !PURE_SLP_STMT (stmt_info))
/* STMT needs both SLP and loop-based vectorization. */
only_slp_in_loop = false;
}
......@@ -2860,28 +2866,33 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
computation.
VECT_DEF is a vector of partial results.
REDUC_CODE is the tree-code for the epilog reduction.
computation.
VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
reduction statements.
STMT is the scalar reduction stmt that is being vectorized.
NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
number of elements that we can fit in a vectype (nunits). In this case
we have to generate more than one vector stmt - i.e - we need to "unroll"
the vector stmt by a factor VF/nunits. For more details see documentation
in vectorizable_operation.
STMT is the scalar reduction stmt that is being vectorized.
REDUCTION_PHI is the phi-node that carries the reduction computation.
REDUC_INDEX is the index of the operand in the right hand side of the
REDUC_CODE is the tree-code for the epilog reduction.
REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
computation.
REDUC_INDEX is the index of the operand in the right hand side of the
statement that is defined by REDUCTION_PHI.
DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
SLP_NODE is an SLP node containing a group of reduction statements. The
first one in this group is STMT.
This function:
1. Creates the reduction def-use cycle: sets the arguments for
REDUCTION_PHI:
1. Creates the reduction def-use cycles: sets the arguments for
REDUCTION_PHIS:
The loop-entry argument is the vectorized initial-value of the reduction.
The loop-latch argument is VECT_DEF - the vector of partial sums.
2. "Reduces" the vector of partial results VECT_DEF into a single result,
by applying the operation specified by REDUC_CODE if available, or by
The loop-latch argument is taken from VECT_DEFS - the vector of partial
sums.
2. "Reduces" each vector of partial results VECT_DEFS into a single result,
by applying the operation specified by REDUC_CODE if available, or by
other means (whole-vector shifts or a scalar loop).
The function also creates a new phi node at the loop exit to preserve
loop-closed form, as illustrated below.
......@@ -2914,12 +2925,11 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
*/
static void
vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
int ncopies,
enum tree_code reduc_code,
gimple reduction_phi,
int reduc_index,
bool double_reduc)
vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
int ncopies, enum tree_code reduc_code,
VEC (gimple, heap) *reduction_phis,
int reduc_index, bool double_reduc,
slp_tree slp_node)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
stmt_vec_info prev_phi_info;
......@@ -2933,32 +2943,37 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gimple new_phi = NULL, phi;
gimple_stmt_iterator exit_gsi;
tree vec_dest;
tree new_temp = NULL_TREE;
tree new_name;
tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
gimple epilog_stmt = NULL;
tree new_scalar_dest, new_dest;
enum tree_code code = gimple_assign_rhs_code (stmt);
gimple exit_phi;
tree bitsize, bitpos;
enum tree_code code = gimple_assign_rhs_code (stmt);
tree adjustment_def;
tree vec_initial_def, def;
tree orig_name;
tree adjustment_def = NULL;
tree vec_initial_def = NULL;
tree reduction_op, expr, def;
tree orig_name, scalar_result;
imm_use_iterator imm_iter;
use_operand_p use_p;
bool extract_scalar_result = false;
tree reduction_op, expr;
gimple orig_stmt;
gimple use_stmt;
gimple use_stmt, orig_stmt, reduction_phi = NULL;
bool nested_in_vect_loop = false;
VEC(gimple,heap) *phis = NULL;
VEC (gimple, heap) *new_phis = NULL;
enum vect_def_type dt = vect_unknown_def_type;
int j, i;
VEC (tree, heap) *scalar_results = NULL;
int group_size = 1, k, ratio;
VEC (tree, heap) *vec_initial_defs = NULL;
VEC (gimple, heap) *phis;
if (slp_node)
group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
if (nested_in_vect_loop_p (loop, stmt))
{
outer_loop = loop;
loop = loop->inner;
nested_in_vect_loop = true;
gcc_assert (!slp_node);
}
switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
......@@ -2983,47 +2998,80 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
gcc_assert (vectype);
mode = TYPE_MODE (vectype);
/*** 1. Create the reduction def-use cycle ***/
/* 1. Create the reduction def-use cycle:
Set the arguments of REDUCTION_PHIS, i.e., transform
loop:
vec_def = phi <null, null> # REDUCTION_PHI
VECT_DEF = vector_stmt # vectorized form of STMT
...
/* For the case of reduction, vect_get_vec_def_for_operand returns
the scalar def before the loop, that defines the initial value
of the reduction variable. */
vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
&adjustment_def);
into:
phi = reduction_phi;
def = vect_def;
for (j = 0; j < ncopies; j++)
loop:
vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
VECT_DEF = vector_stmt # vectorized form of STMT
...
(in case of SLP, do it for all the phis). */
/* Get the loop-entry arguments. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
else
{
/* 1.1 set the loop-entry arg of the reduction-phi: */
add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
vec_initial_defs = VEC_alloc (tree, heap, 1);
/* For the case of reduction, vect_get_vec_def_for_operand returns
the scalar def before the loop, that defines the initial value
of the reduction variable. */
vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
&adjustment_def);
VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
}
/* 1.2 set the loop-latch arg for the reduction-phi: */
if (j > 0)
def = vect_get_vec_def_for_stmt_copy (dt, def);
add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
/* Set phi nodes arguments. */
for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
{
tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
tree def = VEC_index (tree, vect_defs, i);
for (j = 0; j < ncopies; j++)
{
/* Set the loop-entry arg of the reduction-phi. */
add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "transform reduction: created def-use cycle: ");
print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
fprintf (vect_dump, "\n");
print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
}
/* Set the loop-latch arg for the reduction-phi. */
if (j > 0)
def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "transform reduction: created def-use"
" cycle: ");
print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
fprintf (vect_dump, "\n");
print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
TDF_SLIM);
}
phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
}
}
/*** 2. Create epilog code
The reduction epilog code operates across the elements of the vector
of partial results computed by the vectorized loop.
The reduction epilog code consists of:
step 1: compute the scalar result in a vector (v_out2)
step 2: extract the scalar result (s_out3) from the vector (v_out2)
step 3: adjust the scalar result (s_out3) if needed.
VEC_free (tree, heap, vec_initial_defs);
/* 2. Create epilog code.
The reduction epilog code operates across the elements of the vector
of partial results computed by the vectorized loop.
The reduction epilog code consists of:
step 1: compute the scalar result in a vector (v_out2)
step 2: extract the scalar result (s_out3) from the vector (v_out2)
step 3: adjust the scalar result (s_out3) if needed.
Step 1 can be accomplished using one of the following three schemes:
Step 1 can be accomplished using one of the following three schemes:
(scheme 1) using reduc_code, if available.
(scheme 2) using whole-vector shifts, if available.
(scheme 3) using a scalar loop. In this case steps 1+2 above are
......@@ -3038,29 +3086,33 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
s_out4 = adjust_result <s_out3> # step 3
(step 3 is optional, and steps 1 and 2 may be combined).
Lastly, the uses of s_out0 are replaced by s_out4.
Lastly, the uses of s_out0 are replaced by s_out4. */
***/
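/* Editorial worked example (not part of the patch): scheme 2 reduces the
   vector of partial sums by repeated halving.  For a four-element vector
   {a, b, c, d}:
     shift by two elements, then add  ->  {a+c, b+d, _, _}
     shift by one element, then add   ->  {a+b+c+d, _, _, _}
   After log2(nunits) steps a single lane holds the complete sum; which
   lane that is depends on the target's shift direction.  */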
/* 2.1 Create new loop-exit-phi to preserve loop-closed form:
v_out1 = phi <v_loop> */
/* 2.1 Create new loop-exit-phis to preserve loop-closed form:
v_out1 = phi <VECT_DEF>
Store them in NEW_PHIS. */
exit_bb = single_exit (loop)->dest;
def = vect_def;
prev_phi_info = NULL;
for (j = 0; j < ncopies; j++)
new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
{
phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
if (j == 0)
new_phi = phi;
else
{
def = vect_get_vec_def_for_stmt_copy (dt, def);
STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
prev_phi_info = vinfo_for_stmt (phi);
for (j = 0; j < ncopies; j++)
{
phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
if (j == 0)
VEC_quick_push (gimple, new_phis, phi);
else
{
def = vect_get_vec_def_for_stmt_copy (dt, def);
STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
}
SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
prev_phi_info = vinfo_for_stmt (phi);
}
}
exit_gsi = gsi_after_labels (exit_bb);
......@@ -3089,16 +3141,17 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
}
code = gimple_assign_rhs_code (orig_stmt);
/* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
partial results are added and not subtracted. */
if (code == MINUS_EXPR)
code = PLUS_EXPR;
scalar_dest = gimple_assign_lhs (orig_stmt);
scalar_type = TREE_TYPE (scalar_dest);
scalar_results = VEC_alloc (tree, heap, group_size);
new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
bitsize = TYPE_SIZE (scalar_type);
/* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
partial results are added and not subtracted. */
if (code == MINUS_EXPR)
code = PLUS_EXPR;
/* In case this is a reduction in an inner-loop while vectorizing an outer
loop - we don't need to extract a single scalar result at the end of the
inner-loop (unless it is double reduction, i.e., the use of reduction is
......@@ -3108,28 +3161,21 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
if (nested_in_vect_loop && !double_reduc)
goto vect_finalize_reduction;
/* The epilogue is created for the outer-loop, i.e., for the loop being
vectorized. */
if (double_reduc)
loop = outer_loop;
/* FORNOW */
gcc_assert (ncopies == 1);
/* 2.3 Create the reduction code, using one of the three schemes described
above. */
if (reduc_code != ERROR_MARK)
above. In SLP we simply need to extract all the elements from the
vector (without reducing them), so we use scalar shifts. */
if (reduc_code != ERROR_MARK && !slp_node)
{
tree tmp;
/*** Case 1: Create:
v_out2 = reduc_expr <v_out1> */
v_out2 = reduc_expr <v_out1> */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using direct vector reduction.");
fprintf (vect_dump, "Reduce using direct vector reduction.");
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_phi = VEC_index (gimple, new_phis, 0);
tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
epilog_stmt = gimple_build_assign (vec_dest, tmp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
......@@ -3148,142 +3194,182 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
tree vec_temp;
if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
shift_code = VEC_RSHIFT_EXPR;
shift_code = VEC_RSHIFT_EXPR;
else
have_whole_vector_shift = false;
have_whole_vector_shift = false;
/* Regardless of whether we have a whole vector shift, if we're
emulating the operation via tree-vect-generic, we don't want
to use it. Only the first round of the reduction is likely
to still be profitable via emulation. */
emulating the operation via tree-vect-generic, we don't want
to use it. Only the first round of the reduction is likely
to still be profitable via emulation. */
/* ??? It might be better to emit a reduction tree code here, so that
tree-vect-generic can expand the first round via bit tricks. */
tree-vect-generic can expand the first round via bit tricks. */
if (!VECTOR_MODE_P (mode))
have_whole_vector_shift = false;
have_whole_vector_shift = false;
else
{
optab optab = optab_for_tree_code (code, vectype, optab_default);
if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
have_whole_vector_shift = false;
}
if (have_whole_vector_shift)
{
/*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
Create: va = vop <va, va'>
} */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using vector shifts");
optab optab = optab_for_tree_code (code, vectype, optab_default);
if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
have_whole_vector_shift = false;
}
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_temp = PHI_RESULT (new_phi);
if (have_whole_vector_shift && !slp_node)
{
/*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
Create: va = vop <va, va'>
} */
for (bit_offset = vec_size_in_bits/2;
bit_offset >= element_bitsize;
bit_offset /= 2)
{
tree bitpos = size_int (bit_offset);
epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
new_temp, bitpos);
new_name = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
new_name, new_temp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using vector shifts");
vec_dest = vect_create_destination_var (scalar_dest, vectype);
new_phi = VEC_index (gimple, new_phis, 0);
new_temp = PHI_RESULT (new_phi);
for (bit_offset = vec_size_in_bits/2;
bit_offset >= element_bitsize;
bit_offset /= 2)
{
tree bitpos = size_int (bit_offset);
epilog_stmt = gimple_build_assign_with_ops (shift_code,
vec_dest, new_temp, bitpos);
new_name = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
new_name, new_temp);
new_temp = make_ssa_name (vec_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
extract_scalar_result = true;
}
extract_scalar_result = true;
}
else
{
tree rhs;
/*** Case 3: Create:
s = extract_field <v_out2, 0>
for (offset = element_size;
offset < vector_size;
offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'>
} */
tree rhs;
/*** Case 3: Create:
s = extract_field <v_out2, 0>
for (offset = element_size;
offset < vector_size;
offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'> // For non SLP cases
} */
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using scalar code. ");
vec_temp = PHI_RESULT (new_phi);
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
bit_offset += element_bitsize)
{
tree bitpos = bitsize_int (bit_offset);
tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
epilog_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest,
new_name, new_temp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Reduce using scalar code. ");
extract_scalar_result = false;
}
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
{
vec_temp = PHI_RESULT (new_phi);
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
/* In SLP we don't need to apply reduction operation, so we just
collect s' values in SCALAR_RESULTS. */
if (slp_node)
VEC_safe_push (tree, heap, scalar_results, new_temp);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
bit_offset += element_bitsize)
{
tree bitpos = bitsize_int (bit_offset);
tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
bitsize, bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_name);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
if (slp_node)
{
/* In SLP we don't need to apply reduction operation, so
we just collect s' values in SCALAR_RESULTS. */
new_temp = new_name;
VEC_safe_push (tree, heap, scalar_results, new_name);
}
else
{
epilog_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest, new_name, new_temp);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
}
}
}
/* The only case where we need to reduce scalar results in SLP, is
unrolling. If the size of SCALAR_RESULTS is greater than
GROUP_SIZE, we reduce them combining elements modulo
GROUP_SIZE. */
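/* Editorial example (not part of the patch): with GROUP_SIZE == 2 and four
   extracted scalars s0..s3 (SLP unrolling by two), the loop below folds s2
   into slot 0 and s3 into slot 1, leaving one combined result per
   reduction in the group.  */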
if (slp_node)
{
tree res, first_res, new_res;
gimple new_stmt;
/* Reduce multiple scalar results in case of SLP unrolling. */
for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
j++)
{
first_res = VEC_index (tree, scalar_results, j % group_size);
new_stmt = gimple_build_assign_with_ops (code,
new_scalar_dest, first_res, res);
new_res = make_ssa_name (new_scalar_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_res);
gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
VEC_replace (tree, scalar_results, j % group_size, new_res);
}
}
else
/* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
VEC_safe_push (tree, heap, scalar_results, new_temp);
extract_scalar_result = false;
}
}
/* 2.4 Extract the final scalar result. Create:
s_out3 = extract_field <v_out2, bitpos> */
s_out3 = extract_field <v_out2, bitpos> */
if (extract_scalar_result)
{
tree rhs;
gcc_assert (!nested_in_vect_loop || double_reduc);
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "extract scalar result");
fprintf (vect_dump, "extract scalar result");
if (BYTES_BIG_ENDIAN)
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
TYPE_SIZE (scalar_type));
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
TYPE_SIZE (scalar_type));
else
bitpos = bitsize_zero_node;
bitpos = bitsize_zero_node;
rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
VEC_safe_push (tree, heap, scalar_results, new_temp);
}
vect_finalize_reduction:
if (double_reduc)
loop = loop->inner;
/* 2.5 Adjust the final result by the initial value of the reduction
variable. (When such adjustment is not needed, then
'adjustment_def' is zero). For example, if code is PLUS we create:
......@@ -3291,14 +3377,17 @@ vect_finalize_reduction:
if (adjustment_def)
{
gcc_assert (!slp_node);
if (nested_in_vect_loop)
{
new_phi = VEC_index (gimple, new_phis, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, vectype);
}
else
{
new_temp = VEC_index (tree, scalar_results, 0);
gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
expr = build2 (code, scalar_type, new_temp, adjustment_def);
new_dest = vect_create_destination_var (scalar_dest, scalar_type);
......@@ -3309,142 +3398,206 @@ vect_finalize_reduction:
gimple_assign_set_lhs (epilog_stmt, new_temp);
SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
if (nested_in_vect_loop)
{
set_vinfo_for_stmt (epilog_stmt,
new_stmt_vec_info (epilog_stmt, loop_vinfo,
NULL));
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
if (!double_reduc)
VEC_quick_push (tree, scalar_results, new_temp);
else
VEC_replace (tree, scalar_results, 0, new_temp);
}
else
VEC_replace (tree, scalar_results, 0, new_temp);
VEC_replace (gimple, new_phis, 0, epilog_stmt);
}
/* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
phis with new adjusted scalar results, i.e., replace use <s_out0>
with use <s_out4>.
/* 2.6 Handle the loop-exit phi */
Transform:
loop_exit:
s_out0 = phi <s_loop> # (scalar) EXIT_PHI
v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
s_out4 = adjust_result <s_out3>
use <s_out0>
use <s_out0>
into:
/* Replace uses of s_out0 with uses of s_out3:
Find the loop-closed-use at the loop exit of the original scalar result.
(The reduction result is expected to have two immediate uses - one at the
latch block, and one at the loop exit). */
phis = VEC_alloc (gimple, heap, 10);
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
loop_exit:
s_out0 = phi <s_loop> # (scalar) EXIT_PHI
v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
s_out4 = adjust_result <s_out3>
use <s_out4> */
/* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
case that GROUP_SIZE is greater than vectorization factor). Therefore, we
need to match SCALAR_RESULTS with corresponding statements. The first
(GROUP_SIZE / number of new vector stmts) scalar results correspond to
the first vector stmt, etc.
(RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
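/* Editorial example (not part of the patch): with GROUP_SIZE == 8 and two
   new vector statements, RATIO == 4, so scalar results 0-3 are matched
   with the first exit phi and scalar results 4-7 with the second.  */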
ratio = group_size / VEC_length (gimple, new_phis);
gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
for (k = 0; k < group_size; k++)
{
if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
{
exit_phi = USE_STMT (use_p);
VEC_quick_push (gimple, phis, exit_phi);
}
}
if (k % ratio == 0)
{
epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
}
/* We expect to have found an exit_phi because of loop-closed-ssa form. */
gcc_assert (!VEC_empty (gimple, phis));
if (slp_node)
{
gimple current_stmt = VEC_index (gimple,
SLP_TREE_SCALAR_STMTS (slp_node), k);
for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
{
if (nested_in_vect_loop)
{
stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
gimple vect_phi;
/* FORNOW. Currently not supporting the case that an inner-loop
reduction is not used in the outer-loop (but only outside the
outer-loop), unless it is double reduction. */
gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
&& !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
set_vinfo_for_stmt (epilog_stmt,
new_stmt_vec_info (epilog_stmt, loop_vinfo,
NULL));
if (adjustment_def)
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
if (!double_reduc
|| STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
continue;
/* Handle double reduction:
stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop)
stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
At that point the regular reduction (stmt2 and stmt3) is already
vectorized, as well as the exit phi node, stmt4.
Here we vectorize the phi node of double reduction, stmt1, and
update all relevant statements. */
/* Go through all the uses of s2 to find double reduction phi node,
i.e., stmt1 above. */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
/* SLP statements can't participate in patterns. */
gcc_assert (!orig_stmt);
scalar_dest = gimple_assign_lhs (current_stmt);
}
phis = VEC_alloc (gimple, heap, 3);
/* Find the loop-closed-use at the loop exit of the original scalar
result. (The reduction result is expected to have two immediate uses -
one at the latch block, and one at the loop exit). */
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
/* We expect to have found an exit_phi because of loop-closed-ssa
form. */
gcc_assert (!VEC_empty (gimple, phis));
for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
{
if (outer_loop)
{
stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
stmt_vec_info new_phi_vinfo;
tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
basic_block bb = gimple_bb (use_stmt);
gimple use;
/* Check that USE_STMT is really double reduction phi node. */
if (gimple_code (use_stmt) != GIMPLE_PHI
|| gimple_phi_num_args (use_stmt) != 2
|| !use_stmt_vinfo
|| STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
!= vect_double_reduction_def
|| bb->loop_father != outer_loop)
stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
gimple vect_phi;
/* FORNOW. Currently not supporting the case that an inner-loop
reduction is not used in the outer-loop (but only outside the
outer-loop), unless it is double reduction. */
gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
&& !STMT_VINFO_LIVE_P (exit_phi_vinfo))
|| double_reduc);
STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
if (!double_reduc
|| STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
!= vect_double_reduction_def)
continue;
/* Create vector phi node for double reduction:
vs1 = phi <vs0, vs2>
vs1 was created previously in this function by a call to
vect_get_vec_def_for_operand and is stored in vec_initial_def;
vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
vs0 is created here. */
/* Handle double reduction:
/* Create vector phi node. */
vect_phi = create_phi_node (vec_initial_def, bb);
new_phi_vinfo = new_stmt_vec_info (vect_phi,
loop_vec_info_for_loop (outer_loop), NULL);
set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
/* Create vs0 - initial def of the double reduction phi. */
preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
loop_preheader_edge (outer_loop));
init_def = get_initial_def_for_reduction (stmt, preheader_arg,
NULL);
vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
NULL);
/* Update phi node arguments with vs0 and vs2. */
add_phi_arg (vect_phi, vect_phi_init,
loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "created double reduction phi node: ");
print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
}
vect_phi_res = PHI_RESULT (vect_phi);
At that point the regular reduction (stmt2 and stmt3) is
already vectorized, as well as the exit phi node, stmt4.
Here we vectorize the phi node of double reduction, stmt1, and
update all relevant statements. */
/* Replace the use, i.e., set the correct vs1 in the regular
reduction phi node. FORNOW, NCOPIES is always 1, so the loop
is redundant. */
use = reduction_phi;
for (j = 0; j < ncopies; j++)
/* Go through all the uses of s2 to find double reduction phi
node, i.e., stmt1 above. */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
{
edge pr_edge = loop_preheader_edge (loop);
SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
stmt_vec_info new_phi_vinfo;
tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
basic_block bb = gimple_bb (use_stmt);
gimple use;
/* Check that USE_STMT is really double reduction phi
node. */
if (gimple_code (use_stmt) != GIMPLE_PHI
|| gimple_phi_num_args (use_stmt) != 2
|| !use_stmt_vinfo
|| STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
!= vect_double_reduction_def
|| bb->loop_father != outer_loop)
continue;
/* Create vector phi node for double reduction:
vs1 = phi <vs0, vs2>
vs1 was created previously in this function by a call to
vect_get_vec_def_for_operand and is stored in
vec_initial_def;
vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
vs0 is created here. */
/* Create vector phi node. */
vect_phi = create_phi_node (vec_initial_def, bb);
new_phi_vinfo = new_stmt_vec_info (vect_phi,
loop_vec_info_for_loop (outer_loop), NULL);
set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
/* Create vs0 - initial def of the double reduction phi. */
preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
loop_preheader_edge (outer_loop));
init_def = get_initial_def_for_reduction (stmt,
preheader_arg, NULL);
vect_phi_init = vect_init_vector (use_stmt, init_def,
vectype, NULL);
/* Update phi node arguments with vs0 and vs2. */
add_phi_arg (vect_phi, vect_phi_init,
loop_preheader_edge (outer_loop),
UNKNOWN_LOCATION);
add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "created double reduction phi "
"node: ");
print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
}
vect_phi_res = PHI_RESULT (vect_phi);
/* Replace the use, i.e., set the correct vs1 in the regular
reduction phi node. FORNOW, NCOPIES is always 1, so the
loop is redundant. */
use = reduction_phi;
for (j = 0; j < ncopies; j++)
{
edge pr_edge = loop_preheader_edge (loop);
SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
}
}
}
}
/* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
SET_USE (use_p, new_temp);
/* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
scalar_result = VEC_index (tree, scalar_results, k);
FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
SET_USE (use_p, scalar_result);
}
VEC_free (gimple, heap, phis);
}
VEC_free (gimple, heap, phis);
}
VEC_free (tree, heap, scalar_results);
VEC_free (gimple, heap, new_phis);
}
/* Function vectorizable_reduction.
......@@ -3489,7 +3642,7 @@ vect_finalize_reduction:
bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
gimple *vec_stmt)
gimple *vec_stmt, slp_tree slp_node)
{
tree vec_dest;
tree scalar_dest;
......@@ -3517,7 +3670,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
int ncopies;
int epilog_copies;
stmt_vec_info prev_stmt_info, prev_phi_info;
gimple first_phi = NULL;
bool single_defuse_cycle = false;
tree reduc_def = NULL_TREE;
gimple new_stmt = NULL;
......@@ -3532,6 +3684,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
struct loop * def_stmt_loop, *outer_loop = NULL;
tree def_arg;
gimple def_arg_stmt;
VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
VEC (gimple, heap) *phis = NULL;
int vec_num;
tree def0, def1;
if (nested_in_vect_loop_p (loop, stmt))
{
......@@ -3540,10 +3696,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
nested_cycle = true;
}
/* FORNOW: SLP not supported. */
if (STMT_SLP_TYPE (stmt_info))
return false;
/* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
......@@ -3676,9 +3828,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
return false;
if (slp_node)
ncopies = 1;
else
ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
/ TYPE_VECTOR_SUBPARTS (vectype_in));
ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
/ TYPE_VECTOR_SUBPARTS (vectype_in));
gcc_assert (ncopies >= 1);
vec_mode = TYPE_MODE (vectype_in);
......@@ -3897,23 +4052,48 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
prev_stmt_info = NULL;
prev_phi_info = NULL;
if (slp_node)
{
vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
== TYPE_VECTOR_SUBPARTS (vectype_in));
}
else
{
vec_num = 1;
vec_oprnds0 = VEC_alloc (tree, heap, 1);
if (op_type == ternary_op)
vec_oprnds1 = VEC_alloc (tree, heap, 1);
}
phis = VEC_alloc (gimple, heap, vec_num);
vect_defs = VEC_alloc (tree, heap, vec_num);
if (!slp_node)
VEC_quick_push (tree, vect_defs, NULL_TREE);
for (j = 0; j < ncopies; j++)
{
if (j == 0 || !single_defuse_cycle)
{
/* Create the reduction-phi that defines the reduction-operand. */
new_phi = create_phi_node (vec_dest, loop->header);
set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
NULL));
/* Get the vector def for the reduction variable from the phi
node. */
reduc_def = PHI_RESULT (new_phi);
}
for (i = 0; i < vec_num; i++)
{
/* Create the reduction-phi that defines the reduction
operand. */
new_phi = create_phi_node (vec_dest, loop->header);
set_vinfo_for_stmt (new_phi,
new_stmt_vec_info (new_phi, loop_vinfo,
NULL));
if (j == 0 || slp_node)
VEC_quick_push (gimple, phis, new_phi);
}
}
if (code == COND_EXPR)
{
first_phi = new_phi;
vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
gcc_assert (!slp_node);
vectorizable_condition (stmt, gsi, vec_stmt,
PHI_RESULT (VEC_index (gimple, phis, 0)),
reduc_index);
/* Multiple types are not supported for condition. */
break;
}
......@@ -3921,65 +4101,94 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Handle uses. */
if (j == 0)
{
loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
stmt, NULL);
if (op_type == ternary_op)
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
if (reduc_index == 0)
loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
NULL);
else
loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
NULL);
loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
stmt, NULL);
VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
if (op_type == ternary_op)
{
if (reduc_index == 0)
loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
NULL);
else
loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
NULL);
VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
}
}
/* Get the vector def for the reduction variable from the phi
node. */
first_phi = new_phi;
}
else
{
enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
if (op_type == ternary_op)
loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
if (!slp_node)
{
enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
if (op_type == ternary_op)
{
loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
loop_vec_def1);
VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
}
}
if (single_defuse_cycle)
reduc_def = gimple_assign_lhs (new_stmt);
else
reduc_def = PHI_RESULT (new_phi);
if (single_defuse_cycle)
reduc_def = gimple_assign_lhs (new_stmt);
STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
}
/* Arguments are ready. Create the new vector stmt. */
if (op_type == binary_op)
for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
{
if (reduc_index == 0)
expr = build2 (code, vectype_out, reduc_def, loop_vec_def0);
if (slp_node)
reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
else
expr = build2 (code, vectype_out, loop_vec_def0, reduc_def);
}
else
{
if (reduc_index == 0)
expr = build3 (code, vectype_out, reduc_def, loop_vec_def0,
loop_vec_def1);
{
if (!single_defuse_cycle || j == 0)
reduc_def = PHI_RESULT (new_phi);
}
def1 = ((op_type == ternary_op)
? VEC_index (tree, vec_oprnds1, i) : NULL);
if (op_type == binary_op)
{
if (reduc_index == 0)
expr = build2 (code, vectype_out, reduc_def, def0);
else
expr = build2 (code, vectype_out, def0, reduc_def);
}
else
{
if (reduc_index == 1)
expr = build3 (code, vectype_out, loop_vec_def0, reduc_def,
loop_vec_def1);
if (reduc_index == 0)
expr = build3 (code, vectype_out, reduc_def, def0, def1);
else
expr = build3 (code, vectype_out, loop_vec_def0, loop_vec_def1,
reduc_def);
{
if (reduc_index == 1)
expr = build3 (code, vectype_out, def0, reduc_def, def1);
else
expr = build3 (code, vectype_out, def0, def1, reduc_def);
}
}
new_stmt = gimple_build_assign (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_temp);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
if (slp_node)
{
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
VEC_quick_push (tree, vect_defs, new_temp);
}
else
VEC_replace (tree, vect_defs, 0, new_temp);
}
new_stmt = gimple_build_assign (vec_dest, expr);
new_temp = make_ssa_name (vec_dest, new_stmt);
gimple_assign_set_lhs (new_stmt, new_temp);
vect_finish_stmt_generation (stmt, new_stmt, gsi);
if (slp_node)
continue;
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
......@@ -3992,12 +4201,21 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
/* Finalize the reduction-phi (set its arguments) and create the
epilog reduction code. */
if (!single_defuse_cycle || code == COND_EXPR)
new_temp = gimple_assign_lhs (*vec_stmt);
if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
{
new_temp = gimple_assign_lhs (*vec_stmt);
VEC_replace (tree, vect_defs, 0, new_temp);
}
vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
epilog_reduc_code, phis, reduc_index,
double_reduc, slp_node);
VEC_free (gimple, heap, phis);
VEC_free (tree, heap, vec_oprnds0);
if (vec_oprnds1)
VEC_free (tree, heap, vec_oprnds1);
vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
epilog_reduc_code, first_phi, reduc_index,
double_reduc);
return true;
}
......
......@@ -670,6 +670,8 @@ vect_pattern_recog_1 (
tree pattern_vectype;
tree type_in, type_out;
enum tree_code code;
int i;
gimple next;
pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
if (!pattern_stmt)
......@@ -735,7 +737,13 @@ vect_pattern_recog_1 (
STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt;
return;
/* Patterns cannot be vectorized using SLP, because they change the order of
computation. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
if (next == stmt)
VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i);
}
......
......@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
break;
case vect_internal_def:
case vect_reduction_def:
if (i == 0)
VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
else
......@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
HOST_WIDE_INT dummy;
bool permutation = false;
unsigned int load_place;
gimple first_load;
gimple first_load, prev_first_load = NULL;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
......@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
&pattern0, &pattern1))
return false;
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
> ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
return false;
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
return false;
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
if (prev_first_load)
{
/* Check that there are no loads from different interleaving
chains in the same node. The only exception is complex
numbers. */
if (prev_first_load != first_load
&& rhs_code != REALPART_EXPR
&& rhs_code != IMAGPART_EXPR)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: different "
"interleaving chains in one node ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
}
else
prev_first_load = first_load;
if (first_load == stmt)
{
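The prev_first_load check added above requires every load in one SLP node to come from the same interleaving chain, with REALPART_EXPR/IMAGPART_EXPR as the only exception. A minimal standalone sketch of that invariant, assuming a made-up load descriptor whose chain field stands in for DR_GROUP_FIRST_DR:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
/* Hypothetical load descriptor: CHAIN identifies the interleaving chain the
   load belongs to (the first statement of its DR group in GCC terms).  */
struct load { int chain; };
static bool
same_chain_p (const struct load *loads, size_t n)
{
  for (size_t i = 1; i < n; i++)
    if (loads[i].chain != loads[0].chain)
      return false;   /* different interleaving chains in one node */
  return true;
}
int
main (void)
{
  struct load ok[2] = { { 1 }, { 1 } };
  struct load bad[2] = { { 1 }, { 2 } };
  assert (same_chain_p (ok, 2) && !same_chain_p (bad, 2));
  return 0;
}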
......@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
}
/* Rearrange the statements of NODE according to PERMUTATION. */
static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
VEC (int, heap) *permutation)
{
gimple stmt;
VEC (gimple, heap) *tmp_stmts;
unsigned int index, i;
if (!node)
return;
vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
tmp_stmts = VEC_alloc (gimple, heap, group_size);
for (i = 0; i < group_size; i++)
VEC_safe_push (gimple, heap, tmp_stmts, NULL);
for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
{
index = VEC_index (int, permutation, i);
VEC_replace (gimple, tmp_stmts, index, stmt);
}
VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
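vect_slp_rearrange_stmts stores statement I of each node at position PERMUTATION[I], recursing into the left and right children first. A minimal standalone sketch of the same out-of-place permutation applied to a plain int array (names and sizes are made up for the example):
#include <assert.h>
#include <string.h>
#define GROUP_SIZE 4
/* Element I of the input ends up at position PERMUTATION[I] of the output,
   mirroring the VEC_replace call in vect_slp_rearrange_stmts.  */
static void
rearrange (int *stmts, const int *permutation)
{
  int tmp[GROUP_SIZE];
  for (int i = 0; i < GROUP_SIZE; i++)
    tmp[permutation[i]] = stmts[i];
  memcpy (stmts, tmp, sizeof tmp);
}
int
main (void)
{
  int stmts[GROUP_SIZE] = { 10, 11, 12, 13 };
  const int perm[GROUP_SIZE] = { 2, 0, 3, 1 };
  rearrange (stmts, perm);
  /* 10 went to slot 2, 11 to slot 0, 12 to slot 3, 13 to slot 1.  */
  assert (stmts[0] == 11 && stmts[1] == 13
          && stmts[2] == 10 && stmts[3] == 12);
  return 0;
}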
/* Check if the required load permutation is supported.
LOAD_PERMUTATION contains a list of indices of the loads.
In SLP this permutation is relative to the order of strided stores that are
......@@ -796,9 +850,11 @@ static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
VEC (int, heap) *load_permutation)
{
int i = 0, j, prev = -1, next, k, number_of_groups;
bool supported, bad_permutation = false;
sbitmap load_index;
slp_tree node;
gimple stmt;
/* FORNOW: permutations are only supported in SLP. */
if (!slp_instn)
......@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
fprintf (vect_dump, "%d ", next);
}
/* In case of reduction every load permutation is allowed, since the order
of the reduction statements is not important (as opposed to the case of
strided stores). The only condition we need to check is that all the
load nodes are of the same size and have the same permutation (and then
rearrange all the nodes of the SLP instance according to this
permutation). */
/* Check that all the load nodes are of the same size. */
for (i = 0;
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
i++)
if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
!= (unsigned) group_size)
return false;
node = SLP_INSTANCE_TREE (slp_instn);
stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
/* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
instance, not all the loads belong to the same node or interleaving
group. Hence, we need to divide them into groups according to
GROUP_SIZE. */
number_of_groups = VEC_length (int, load_permutation) / group_size;
/* Reduction (there are no data-refs in the root). */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
{
int first_group_load_index;
/* Compare all the permutation sequences to the first one. */
for (i = 1; i < number_of_groups; i++)
{
k = 0;
for (j = i * group_size; j < i * group_size + group_size; j++)
{
next = VEC_index (int, load_permutation, j);
first_group_load_index = VEC_index (int, load_permutation, k);
if (next != first_group_load_index)
{
bad_permutation = true;
break;
}
k++;
}
if (bad_permutation)
break;
}
if (!bad_permutation)
{
/* This permutation is valid for reduction. Since the order of the
statements in the nodes is not important unless they are memory
accesses, we can rearrange the statements in all the nodes
according to the order of the loads. */
vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
load_permutation);
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
return true;
}
}
/* FORNOW: the only supported permutation is 0..01..1.. of length equal to
GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
well (unless it's reduction). */
if (VEC_length (int, load_permutation)
!= (unsigned int) (group_size * group_size))
return false;
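For reductions, the code above only accepts a load permutation in which every GROUP_SIZE-long slice repeats the first slice; the SLP nodes are then rearranged to that common order. A standalone sketch of that uniformity check (not GCC code; the helper name is invented):
#include <stdbool.h>
/* Return true if every group of GROUP_SIZE indices in LOAD_PERMUTATION
   repeats the sequence of the first group.  */
static bool
uniform_groups_p (const int *load_permutation, int n_loads, int group_size)
{
  int number_of_groups = n_loads / group_size;
  for (int i = 1; i < number_of_groups; i++)
    for (int j = 0; j < group_size; j++)
      if (load_permutation[i * group_size + j] != load_permutation[j])
        return false;
  return true;
}
/* Example: with group_size 2, { 1, 0, 1, 0 } is accepted,
   while { 1, 0, 0, 1 } is rejected.  */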
......@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
slp_tree node = XNEW (struct _slp_tree);
unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
unsigned int unrolling_factor = 1, nunits;
tree vectype, scalar_type = NULL_TREE;
gimple next;
unsigned int vectorization_factor = 0;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
unsigned int max_nunits = 0;
VEC (int, heap) *load_permutation;
VEC (slp_tree, heap) *loads;
struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
if (dr)
{
scalar_type = TREE_TYPE (DR_REF (dr));
vectype = get_vectype_for_scalar_type (scalar_type);
group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
}
else
{
gcc_assert (loop_vinfo);
vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
}
if (!vectype)
{
if (vect_print_dump_info (REPORT_SLP))
......@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
}
return false;
}
......@@ -938,11 +1069,29 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
/* Create a node (a root of the SLP tree) for the packed strided stores. */
SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
next = stmt;
if (dr)
{
/* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
while (next)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
}
}
else
{
/* Collect reduction statements. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "pushing reduction into node: ");
print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
}
}
}
SLP_TREE_VEC_STMTS (node) = NULL;
......@@ -1035,7 +1184,7 @@ bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
unsigned int i;
VEC (gimple, heap) *strided_stores, *reductions = NULL;
gimple store;
bool ok = false;
......@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "=== vect_analyze_slp ===");
if (loop_vinfo)
{
strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
}
else
strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
/* Find SLP sequences starting from groups of strided stores. */
for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
ok = true;
......@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
return false;
}
/* Find SLP sequences starting from groups of reductions. */
if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
&& vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
VEC_index (gimple, reductions, 0)))
ok = true;
return true;
}
......@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
&& !STMT_SLP_TYPE (stmt_vinfo)
&& (STMT_VINFO_RELEVANT (stmt_vinfo)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
&& !(gimple_code (use_stmt) == GIMPLE_PHI
&& STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
== vect_reduction_def))
vect_mark_slp_stmts (node, hybrid, i);
vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
......@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
/* For constant and loop invariant defs of SLP_NODE this function returns
(vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
REDUC_INDEX is the index of the reduction operand in the statements, unless
it is -1. */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
unsigned int op_num, unsigned int number_of_vectors,
int reduc_index)
{
VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple stmt = VEC_index (gimple, stmts, 0);
......@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
int number_of_copies = 1;
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
bool constant_p, is_store;
tree neutral_op = NULL;
if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
{
enum tree_code code = gimple_assign_rhs_code (stmt);
if (reduc_index == -1)
{
VEC_free (tree, heap, *vec_oprnds);
return;
}
op_num = reduc_index - 1;
op = gimple_op (stmt, op_num + 1);
/* For additional copies (see the explanation of NUMBER_OF_COPIES below)
we need either neutral operands or the original operands. See
get_initial_def_for_reduction() for details. */
switch (code)
{
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst0);
else
neutral_op = build_int_cst (TREE_TYPE (op), 0);
break;
case MULT_EXPR:
case BIT_AND_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst1);
else
neutral_op = build_int_cst (TREE_TYPE (op), 1);
break;
default:
neutral_op = NULL;
}
}
if (STMT_VINFO_DATA_REF (stmt_vinfo))
{
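The switch above picks a neutral (identity) value per reduction code, later used to fill vector slots that must not perturb the result: 0 for additive and ior/xor reductions, 1 for multiplicative ones. A small standalone sketch of the idea for a few integer cases (made-up enum, not GCC's tree codes):
#include <assert.h>
enum reduc_op { REDUC_PLUS, REDUC_MULT, REDUC_IOR, REDUC_XOR };
/* The value that leaves the reduction unchanged: 0 for sums and
   bitwise ior/xor, 1 for products.  */
static int
neutral_element (enum reduc_op op)
{
  switch (op)
    {
    case REDUC_MULT:
      return 1;
    default:
      return 0;
    }
}
int
main (void)
{
  assert (neutral_element (REDUC_PLUS) == 0);
  assert (neutral_element (REDUC_MULT) == 1);
  return 0;
}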
......@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
else
op = gimple_op (stmt, op_num + 1);
if (reduc_index != -1)
{
struct loop *loop = (gimple_bb (stmt))->loop_father;
gimple def_stmt = SSA_NAME_DEF_STMT (op);
gcc_assert (loop);
/* Get the def before the loop. */
op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
loop_preheader_edge (loop));
if (j != (number_of_copies - 1) && neutral_op)
op = neutral_op;
}
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
......@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
to replicate the vectors. */
while (number_of_vectors > VEC_length (tree, *vec_oprnds))
{
tree neutral_vec = NULL;
if (neutral_op)
{
if (!neutral_vec)
{
t = NULL;
for (i = 0; i < (unsigned) nunits; i++)
t = tree_cons (NULL_TREE, neutral_op, t);
neutral_vec = build_vector (vector_type, t);
}
VEC_quick_push (tree, *vec_oprnds, neutral_vec);
}
else
{
for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
VEC_quick_push (tree, *vec_oprnds, vop);
}
}
}
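When more initial vectors are needed than were built from the scalar defs, the loop above pads with vectors of the neutral element, which cannot change the reduction's outcome. A tiny standalone demonstration for a sum reduction (plain C, values made up): padding the lanes with 0 leaves the total unchanged.
#include <assert.h>
int
main (void)
{
  int a[6] = { 1, 2, 3, 4, 5, 6 };
  int scalar_sum = 0;
  for (int i = 0; i < 6; i++)
    scalar_sum += a[i];

  /* Two vectors of 4 lanes; the last two lanes hold the neutral element 0.  */
  int v0[4] = { 1, 2, 3, 4 };
  int v1[4] = { 5, 6, 0, 0 };
  int lanes[4];
  for (int i = 0; i < 4; i++)
    lanes[i] = v0[i] + v1[i];
  int vector_sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

  assert (vector_sum == scalar_sum);   /* both are 21 */
  return 0;
}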
......@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
VEC (tree,heap) **vec_oprnds1, int reduc_index)
{
gimple first_stmt;
enum tree_code code;
......@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
*vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
/* SLP_NODE corresponds either to a group of stores or to a group of
unary/binary operations. We don't call this function for loads.
For reduction defs we call vect_get_constant_vectors(), since we are
looking for initial loop invariant values. */
if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
reduc_index);
if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
/* Since we don't call this function with loads, this is a group of
stores. */
return;
/* For reductions, we only need initial values. */
if (reduc_index != -1)
return;
code = gimple_assign_rhs_code (first_stmt);
if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
return;
......@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
}
......@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
si = gsi_for_stmt (stmt);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
if (is_store)
{
if (DR_GROUP_FIRST_DR (stmt_info))
/* If IS_STORE is TRUE, the vectorization of the
interleaving chain was completed - free all the stores in
the chain. */
vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
else
/* FORNOW: SLP originates only from strided stores. */
gcc_unreachable ();
return true;
}
/* FORNOW: SLP originates only from strided stores. */
return false;
return is_store;
}
......@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "vectorizing stmts using SLP.");
}
for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
{
slp_tree root = SLP_INSTANCE_TREE (instance);
gimple store;
unsigned int j;
gimple_stmt_iterator gsi;
for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
&& j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
{
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
break;
/* Free the attached stmt_vec_info and remove the stmt. */
gsi = gsi_for_stmt (store);
gsi_remove (&gsi, true);
free_stmt_vec_info (store);
}
}
return is_store;
}
......
......@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
slp_tree slp_node)
{
if (slp_node)
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
else
{
tree vec_oprnd;
......@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
{
/* Handle uses. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
else
{
VEC_free (tree, heap, vec_oprnds0);
......@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
if (j == 0)
{
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
......@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (slp)
{
/* Get vectorized arguments for SLP_NODE. */
vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);
vec_oprnd = VEC_index (tree, vec_oprnds, 0);
}
......@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL, NULL, 0));
else
{
......@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case reduc_vec_info_type:
gcc_assert (!slp_node);
done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
gcc_assert (done);
break;
......
......@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
/* The unrolling factor needed to SLP the loop. In case pure SLP is applied
   to the loop, i.e., no unrolling is needed, this is 1. */
unsigned slp_unrolling_factor;
/* Reduction cycles detected in the loop. Used in loop-aware SLP. */
VEC (gimple, heap) *reductions;
} *loop_vec_info;
/* Access Functions. */
......@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
......@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
......@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
extern void vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
VEC (tree,heap) **, int);
extern LOC find_bb_location (basic_block);
extern bb_vec_info vect_slp_analyze_bb (basic_block);
extern void vect_slp_transform_bb (basic_block);
......