Commit 0fca40f5 by Ira Rosen Committed by Ira Rosen

target.h (struct vectorize): Add new target builtin.

	* target.h (struct vectorize): Add new target builtin.
	* tree-vectorizer.c (destroy_loop_vec_info): Call
	vect_free_slp_instance instead of vect_free_slp_tree.
	* tree-vectorizer.h (enum slp_load_perm_type): New.
	(struct _slp_instance): Add new fields.
	(SLP_INSTANCE_LOAD_PERMUTATION): New.
	(SLP_INSTANCE_LOADS): New.
	(vect_free_slp_tree): Remove.
	(vect_free_slp_instance): Declare.
	(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
	(vectorizable_load): Add argument.
	(vect_transform_slp_perm_load): New.
	* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
	vectorizable_load.
	(vect_get_place_in_interleaving_chain): New function.
	(vect_free_slp_tree): Make static.
	(vect_free_slp_instance): New function.
	(vect_build_slp_tree): Add new arguments. Allow load permutations and
	collect the load location in the interleaving chain.
	(vect_supported_slp_permutation_p): New function.
	(vect_supported_load_permutation_p): Likewise.
	(vect_analyze_slp_instance): In case of loads permutation, call
	vect_supported_load_permutation_p to check that the permutation is
	supported.
	* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
	* tree-vect-transform.c (vect_transform_stmt): Add new argument.
	(vect_create_mask_and_perm): New function.
	(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
	(vectorizable_load): Add an argument. Don't keep the created vectors
	statements in the node if permutation is required. Call
	vect_transform_slp_perm_load to generate the permutation.
	(vect_transform_stmt): Add new argument. Call vectorizable_load with
	additional argument.
	(vect_schedule_slp_instance): In case of loads permutation, allocate
	vectorized statements structure for all the related SLP nodes. Call
	vect_transform_stmt with additional argument.
	(vect_transform_loop): Call vect_transform_stmt with correct arguments.
	* config/spu/spu.c (spu_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
	* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
	* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.

From-SVN: r139706
parent b8c41c8e
2008-08-28 Ira Rosen <irar@il.ibm.com>
* target.h (struct vectorize): Add new target builtin.
* tree-vectorizer.c (destroy_loop_vec_info): Call
vect_free_slp_instance instead of vect_free_slp_tree.
* tree-vectorizer.h (enum slp_load_perm_type): New.
(struct _slp_instance): Add new fields.
(SLP_INSTANCE_LOAD_PERMUTATION): New.
(SLP_INSTANCE_LOADS): New.
(vect_free_slp_tree): Remove.
(vect_free_slp_instance): Declare.
(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
(vectorizable_load): Add argument.
(vect_transform_slp_perm_load): New.
* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
vectorizable_load.
(vect_get_place_in_interleaving_chain): New function.
(vect_free_slp_tree): Make static.
(vect_free_slp_instance): New function.
(vect_build_slp_tree): Add new arguments. Allow load permutations and
collect the load location in the interleaving chain.
(vect_supported_slp_permutation_p): New function.
(vect_supported_load_permutation_p): Likewise.
(vect_analyze_slp_instance): In case of loads permutation, call
vect_supported_load_permutation_p to check that the permutation is
supported.
* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
* tree-vect-transform.c (vect_transform_stmt): Add new argument.
(vect_create_mask_and_perm): New function.
(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
(vectorizable_load): Add an argument. Don't keep the created vectors
statements in the node if permutation is required. Call
vect_transform_slp_perm_load to generate the permutation.
(vect_transform_stmt): Add new argument. Call vectorizable_load with
additional argument.
(vect_schedule_slp_instance): In case of loads permutation, allocate
vectorized statements structure for all the related SLP nodes. Call
vect_transform_stmt with additional argument.
(vect_transform_loop): Call vect_transform_stmt with correct arguments.
* config/spu/spu.c (spu_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
2008-08-28 Chris Fairles <chris.fairles@gmail.com> 2008-08-28 Chris Fairles <chris.fairles@gmail.com>
* gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach, * gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach,
......
...@@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void); ...@@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void);
static tree rs6000_builtin_mul_widen_even (tree); static tree rs6000_builtin_mul_widen_even (tree);
static tree rs6000_builtin_mul_widen_odd (tree); static tree rs6000_builtin_mul_widen_odd (tree);
static tree rs6000_builtin_conversion (enum tree_code, tree); static tree rs6000_builtin_conversion (enum tree_code, tree);
static tree rs6000_builtin_vec_perm (tree, tree *);
static void def_builtin (int, const char *, tree, int); static void def_builtin (int, const char *, tree, int);
static bool rs6000_vector_alignment_reachable (const_tree, bool); static bool rs6000_vector_alignment_reachable (const_tree, bool);
...@@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] = ...@@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] =
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION #undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion #define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE #undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable #define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
...@@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac ...@@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac
} }
} }
/* Implement targetm.vectorize.builtin_vec_perm.

   TYPE is the vector type to be permuted.  *MASK_ELEMENT_TYPE is always
   set to unsigned_char_type_node, even when no builtin is available.
   Returns the AltiVec vperm builtin decl matching the machine mode of
   TYPE, or NULL_TREE if the mode is not one of V16QI, V8HI, V4SI, V4SF.  */
tree
rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
{
  int vperm_code;
  tree vperm_decl;

  *mask_element_type = unsigned_char_type_node;

  /* First pick the builtin code for this mode; the decl table is
     consulted once, below.  */
  switch (TYPE_MODE (type))
    {
    case V16QImode:
      vperm_code = ALTIVEC_BUILTIN_VPERM_16QI;
      break;

    case V8HImode:
      vperm_code = ALTIVEC_BUILTIN_VPERM_8HI;
      break;

    case V4SImode:
      vperm_code = ALTIVEC_BUILTIN_VPERM_4SI;
      break;

    case V4SFmode:
      vperm_code = ALTIVEC_BUILTIN_VPERM_4SF;
      break;

    default:
      /* No permute builtin for this mode.  */
      return NULL_TREE;
    }

  vperm_decl = rs6000_builtin_decls[vperm_code];

  /* A supported mode must have a registered decl.  */
  gcc_assert (vperm_decl);
  return vperm_decl;
}
/* Handle generic options of the form -mfoo=yes/no. /* Handle generic options of the form -mfoo=yes/no.
NAME is the option name. NAME is the option name.
VALUE is the option value. VALUE is the option value.
......
...@@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree); ...@@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree);
static tree spu_builtin_mask_for_load (void); static tree spu_builtin_mask_for_load (void);
static int spu_builtin_vectorization_cost (bool); static int spu_builtin_vectorization_cost (bool);
static bool spu_vector_alignment_reachable (const_tree, bool); static bool spu_vector_alignment_reachable (const_tree, bool);
static tree spu_builtin_vec_perm (tree, tree *);
static int spu_sms_res_mii (struct ddg *g); static int spu_sms_res_mii (struct ddg *g);
extern const char *reg_names[]; extern const char *reg_names[];
...@@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[]; ...@@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[];
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE #undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable #define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
#undef TARGET_LIBGCC_CMP_RETURN_MODE #undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode #define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
...@@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed ...@@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed
return true; return true;
} }
/* Implement targetm.vectorize.builtin_vec_perm.

   TYPE is the vector type to be permuted.  *MASK_ELEMENT_TYPE is always
   set to unsigned_char_type_node, even when no builtin is available.
   Returns the fndecl of the SPU_SHUFFLE_<n> builtin selected by the
   machine mode of TYPE (and, for the integer modes, by its signedness),
   or NULL_TREE for any other mode.  */
tree
spu_builtin_vec_perm (tree type, tree *mask_element_type)
{
  int shuffle_code;
  struct spu_builtin_description *shuffle;

  *mask_element_type = unsigned_char_type_node;

  /* Integer modes have an unsigned (even index) and a signed (odd index)
     variant; the two floating-point modes have one variant each.  */
  switch (TYPE_MODE (type))
    {
    case V16QImode:
      shuffle_code = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_0 : SPU_SHUFFLE_1;
      break;

    case V8HImode:
      shuffle_code = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_2 : SPU_SHUFFLE_3;
      break;

    case V4SImode:
      shuffle_code = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_4 : SPU_SHUFFLE_5;
      break;

    case V2DImode:
      shuffle_code = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_6 : SPU_SHUFFLE_7;
      break;

    case V4SFmode:
      shuffle_code = SPU_SHUFFLE_8;
      break;

    case V2DFmode:
      shuffle_code = SPU_SHUFFLE_9;
      break;

    default:
      /* No shuffle builtin for this mode.  */
      return NULL_TREE;
    }

  shuffle = &spu_builtins[shuffle_code];
  gcc_assert (shuffle);
  return shuffle->fndecl;
}
/* Count the total number of instructions in each pipe and return the /* Count the total number of instructions in each pipe and return the
maximum, which is used as the Minimum Iteration Interval (MII) maximum, which is used as the Minimum Iteration Interval (MII)
in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1. in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1.
......
...@@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \ ...@@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
#undef TARG_VEC_STORE_COST #undef TARG_VEC_STORE_COST
#define TARG_VEC_STORE_COST 1 #define TARG_VEC_STORE_COST 1
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* Misc */ /* Misc */
......
...@@ -364,6 +364,7 @@ ...@@ -364,6 +364,7 @@
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0
#define TARGET_VECTOR_ALIGNMENT_REACHABLE \ #define TARGET_VECTOR_ALIGNMENT_REACHABLE \
default_builtin_vector_alignment_reachable default_builtin_vector_alignment_reachable
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0
#define TARGET_VECTORIZE \ #define TARGET_VECTORIZE \
{ \ { \
...@@ -373,7 +374,8 @@ ...@@ -373,7 +374,8 @@
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \ TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \ TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \
TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \ TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \
TARGET_VECTOR_ALIGNMENT_REACHABLE \ TARGET_VECTOR_ALIGNMENT_REACHABLE, \
TARGET_VECTORIZE_BUILTIN_VEC_PERM \
} }
#define TARGET_DEFAULT_TARGET_FLAGS 0 #define TARGET_DEFAULT_TARGET_FLAGS 0
......
...@@ -438,7 +438,10 @@ struct gcc_target ...@@ -438,7 +438,10 @@ struct gcc_target
/* Return true if vector alignment is reachable (by peeling N /* Return true if vector alignment is reachable (by peeling N
iterations) for the given type. */ iterations) for the given type. */
bool (* vector_alignment_reachable) (const_tree, bool); bool (* vector_alignment_reachable) (const_tree, bool);
} vectorize;
/* Target builtin that implements vector permute. */
tree (* builtin_vec_perm) (tree, tree*);
} vectorize;
/* The initial value of target_flags. */ /* The initial value of target_flags. */
int default_target_flags; int default_target_flags;
......
2008-08-28 Ira Rosen <irar@il.ibm.com>
* lib/target-supports.exp (check_effective_target_vect_perm): New.
* gcc.dg/vect/slp-perm-1.c: New testcase.
* gcc.dg/vect/slp-perm-2.c: New testcase.
* gcc.dg/vect/slp-perm-3.c: New testcase.
* gcc.dg/vect/slp-perm-4.c: New testcase.
* gcc.dg/vect/slp-perm-5.c: New testcase.
* gcc.dg/vect/slp-perm-6.c: New testcase.
* gcc.dg/vect/slp-perm-7.c: New testcase.
* gcc.dg/vect/slp-perm-8.c: New testcase.
* gcc.dg/vect/slp-perm-9.c: New testcase.
2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org> 2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org>
PR 37217 PR 37217
......
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define N 16
/* Process N / 3 groups of three consecutive elements of pInput, writing
   three elements of pOutput per group.  Each output element is a linear
   combination of all three inputs of its group; M<out><in> is the
   coefficient applied to input <in> when computing output <out>.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Load one group of three inputs.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Every store uses all three loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
}
}
/* Test driver: fill input with 0 .. N-1, run foo, and abort if any element
   of output differs from check_results.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
/* Expected output for input[i] = i.  The last element is 0 because foo
   writes only 3 * (N / 3) elements and output is zeroed beforehand.  */
unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
/* Never true for i < N.  NOTE(review): presumably here to keep this
   initialization loop from being vectorized, which would change the
   dump counts checked by dg-final -- confirm.  */
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
/* Compare every element, including the untouched tail.  */
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M01 1322
#define M11 13
#define M02 74
#define M12 191
#define N 16
/* Process N / 2 pairs of consecutive elements of pInput, writing two
   elements of pOutput per pair.  Each output element is a linear
   combination of both inputs of its pair; M<out><in> is the coefficient
   applied to input <in> when computing output <out>.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b;
for (i = 0; i < N / 2; i++)
{
/* Load one pair of inputs.  */
a = *pInput++;
b = *pInput++;
/* Both stores use both loaded values.  */
*pOutput++ = M00 * a + M01 * b;
*pOutput++ = M10 * a + M11 * b;
}
}
/* Test driver: fill input with 0 .. N-1, run foo, and abort if any element
   of output differs from check_results.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
/* Expected output for input[i] = i; foo writes all N elements here.  */
unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219};
for (i = 0; i < N; i++)
{
input[i] = i%256;
/* Never true for i < N.  NOTE(review): presumably here to keep this
   initialization loop from being vectorized, which would change the
   dump counts checked by dg-final -- confirm.  */
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define N 16
/* Process N / 4 groups of four consecutive elements of pInput, writing
   four elements of pOutput per group.  Each output element is a linear
   combination of all four inputs of its group; M<out><in> is the
   coefficient applied to input <in> when computing output <out>.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d;
for (i = 0; i < N / 4; i++)
{
/* Load one group of four inputs.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
/* Every store uses all four loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;
}
}
/* Test driver: fill input with 0 .. N-1, run foo, and abort if any element
   of output differs from check_results.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  /* Expected output for input[i] = i; foo writes all N elements here.  */
  unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      /* Never true for i < N.  NOTE(review): presumably here to keep this
         initialization loop from being vectorized, which would change the
         dump counts checked by dg-final -- confirm.  */
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Check every element.  The bound used to be "N - N", i.e. zero, so the
     loop body never ran and wrong results went undetected.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M40 437
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M41 284
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M42 1114
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define M43 71
#define M04 334
#define M14 147
#define M24 115
#define M34 7716
#define M44 16
#define N 16
/* Process N / 5 groups of five consecutive elements of pInput, writing
   five elements of pOutput per group.  Each output element is a linear
   combination of all five inputs of its group; M<out><in> is the
   coefficient applied to input <in> when computing output <out>.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d, e;
for (i = 0; i < N / 5; i++)
{
/* Load one group of five inputs.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
e = *pInput++;
/* Every store uses all five loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
*pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
}
}
/* Test driver: fill input with 0 .. N-1, run foo, and abort if any element
   of output differs from check_results.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  /* Expected output for input[i] = i.  The last element is 0 because foo
     writes only 5 * (N / 5) elements and output is zeroed beforehand.  */
  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      /* Never true for i < N.  NOTE(review): presumably here to keep this
         initialization loop from being vectorized, which would change the
         dump counts checked by dg-final -- confirm.  */
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Check every element, including the untouched tail.  The bound used to
     be "N - N", i.e. zero, so the loop body never ran and wrong results
     went undetected.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Run two independent transforms in one loop of N / 3 iterations:
   - pInput -> pOutput: groups of three elements, each output a linear
     combination of all three inputs with coefficients M<out><in>;
   - pInput2 -> pOutput2: groups of two elements, each output a linear
     combination of both inputs with coefficients K<out><in>.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
/* Three loads from the first stream, two from the second.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
e = *pInput2++;
/* First stream: every store uses a, b and c.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Second stream: both stores use d and e.  */
*pOutput2++ = K00 * d + K01 * e;
*pOutput2++ = K10 * d + K11 * e;
}
}
/* Test driver: fill both inputs with 0 .. N-1, run foo, and abort if any
   element of output or output2 differs from its expected value.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* Expected first-stream output; foo writes 3 * (N / 3) elements, the rest
   stay 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* Expected second-stream output; foo writes 2 * (N / 3) elements, the rest
   stay 0.  */
int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* Never true for i < N.  NOTE(review): presumably here to keep this
   initialization loop from being vectorized, which would change the
   dump counts checked by dg-final -- confirm.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Run two independent transforms in one loop of N / 3 iterations:
   - pInput -> pOutput: groups of three elements, each output a linear
     combination of all three inputs with coefficients M<out><in>;
   - pInput2 -> pOutput2: groups of two elements, each output depending
     only on the input at the same position (scaled by K00 / K10).  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
/* Three loads from the first stream, two from the second.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
e = *pInput2++;
/* First stream: every store uses a, b and c.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Regular SLP - no permutation required. */
*pOutput2++ = K00 * d;
*pOutput2++ = K10 * e;
}
}
/* Test driver: fill both inputs with 0 .. N-1, run foo, and abort if any
   element of output or output2 differs from its expected value.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* Expected first-stream output; foo writes 3 * (N / 3) elements, the rest
   stay 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* Expected second-stream output; foo writes 2 * (N / 3) elements, the rest
   stay 0.  */
int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* Never true for i < N.  NOTE(review): presumably here to keep this
   initialization loop from being vectorized, which would change the
   dump counts checked by dg-final -- confirm.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* SLP with load permutation and loop-based vectorization.

   Runs two independent transforms in one loop of N / 3 iterations:
   - pInput -> pOutput: groups of three elements, each output a linear
     combination of all three inputs with coefficients M<out><in>;
   - pInput2 -> pOutput2: one element per iteration, scaled by K00.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d;
for (i = 0; i < N / 3; i++)
{
/* Three loads from the first stream, one from the second.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
/* First stream: every store uses a, b and c.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Loop-based vectorization. */
*pOutput2++ = K00 * d;
}
}
/* Test driver: fill both inputs with 0 .. N-1, run foo, and abort if any
   element of output or output2 differs from its expected value.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* Expected first-stream output; foo writes 3 * (N / 3) elements, the rest
   stay 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* Expected second-stream output; foo writes N / 3 elements, the rest
   stay 0.  */
int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* Never true for i < N.  NOTE(review): presumably here to keep this
   initialization loop from being vectorized, which would change the
   dump counts checked by dg-final -- confirm.  */
if (input[i] > 200)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* Process N / 3 groups of three consecutive bytes of pInput, writing three
   bytes of pOutput per group.  Each output is the sum of the three inputs
   of its group plus a per-position constant (3, 12, 1), truncated to
   unsigned char on store.  */
void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput)
{
unsigned char i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Load one group of three inputs.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Every store uses all three loaded values.  */
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Test driver: fill input with 0 .. N-1, compute the expected results, run
   foo, and abort if any element written by foo differs from the expected
   value.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned char input[N], output[N], i;
unsigned char check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* Never true: input[i] is i, which is below N.  NOTE(review): the dg-final
   line of this testcase expects two "vectorized 1 loops" messages, so a
   loop in main appears to be expected to vectorize -- confirm which.  */
if (input[i] > 256)
abort ();
}
/* Group i holds inputs 3i, 3i+1, 3i+2, whose sum is 9i + 3; foo adds 3, 12
   and 1 respectively.  Values are truncated to unsigned char exactly as
   foo's stores are.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the 3 * (N / 3) elements that foo writes (and that check_results
   initializes) are compared.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* Process N / 3 groups of three consecutive elements of pInput, writing
   three elements of pOutput per group.  Each output is the sum of the
   three inputs of its group plus a per-position constant (3, 12, 1),
   truncated to unsigned short on store.  */
void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
{
unsigned short i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Load one group of three inputs.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Every store uses all three loaded values.  */
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Test driver: fill input with 0 .. N-1, compute the expected results, run
   foo, and abort if any element written by foo differs from the expected
   value.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned short input[N], output[N], i;
unsigned short check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* Never true: input[i] is i, which is below N.  NOTE(review): presumably
   here to keep this initialization loop from being vectorized -- confirm
   against the dump counts checked by dg-final.  */
if (input[i] > 256)
abort ();
}
/* Group i holds inputs 3i, 3i+1, 3i+2, whose sum is 9i + 3; foo adds 3, 12
   and 1 respectively.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the 3 * (N / 3) elements that foo writes (and that check_results
   initializes) are compared.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
...@@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } { ...@@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved return $et_vect_no_bitwise_saved
} }
# Return 1 if the target plus current options supports vector permutation,
# 0 otherwise.
#
# This won't change for different subtargets so cache the result in the
# global variable et_vect_perm_saved.

proc check_effective_target_vect_perm { } {
    # The global declaration must name the variable that is tested and set
    # below; otherwise that variable is local to each call and the cached
    # value is never found.
    global et_vect_perm_saved

    if [info exists et_vect_perm_saved] {
        verbose "check_effective_target_vect_perm: using cached result" 2
    } else {
        set et_vect_perm_saved 0
        if { [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_saved 1
        }
    }
    verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
    return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector # Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise. # widening summation of *short* args into *int* result, 0 otherwise.
# A target can also support this widening summation if it can support # A target can also support this widening summation if it can support
......
...@@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) ...@@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); vect_free_slp_instance (instance);
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
......
...@@ -105,6 +105,8 @@ typedef struct _slp_tree { ...@@ -105,6 +105,8 @@ typedef struct _slp_tree {
} cost; } cost;
} *slp_tree; } *slp_tree;
DEF_VEC_P(slp_tree);
DEF_VEC_ALLOC_P(slp_tree, heap);
/* SLP instance is a sequence of stmts in a loop that can be packed into /* SLP instance is a sequence of stmts in a loop that can be packed into
SIMD stmts. */ SIMD stmts. */
...@@ -124,6 +126,13 @@ typedef struct _slp_instance { ...@@ -124,6 +126,13 @@ typedef struct _slp_instance {
int outside_of_loop; /* Statements generated outside loop. */ int outside_of_loop; /* Statements generated outside loop. */
int inside_of_loop; /* Statements generated inside loop. */ int inside_of_loop; /* Statements generated inside loop. */
} cost; } cost;
/* Loads permutation relatively to the stores, NULL if there is no
permutation. */
VEC (int, heap) *load_permutation;
/* The group of nodes that contain loads of this SLP instance. */
VEC (slp_tree, heap) *loads;
} *slp_instance; } *slp_instance;
DEF_VEC_P(slp_instance); DEF_VEC_P(slp_instance);
...@@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap); ...@@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop #define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop #define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation
#define SLP_INSTANCE_LOADS(S) (S)->loads
#define SLP_TREE_LEFT(S) (S)->left #define SLP_TREE_LEFT(S) (S)->left
#define SLP_TREE_RIGHT(S) (S)->right #define SLP_TREE_RIGHT(S) (S)->right
...@@ -522,6 +533,11 @@ typedef struct _stmt_vec_info { ...@@ -522,6 +533,11 @@ typedef struct _stmt_vec_info {
#define TARG_VEC_STORE_COST 1 #define TARG_VEC_STORE_COST 1
#endif #endif
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* The maximum number of intermediate steps required in multi-step type /* The maximum number of intermediate steps required in multi-step type
conversion. */ conversion. */
#define MAX_INTERM_CVT_STEPS 3 #define MAX_INTERM_CVT_STEPS 3
...@@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt); ...@@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt);
/** In tree-vect-analyze.c **/ /** In tree-vect-analyze.c **/
/* Driver for analysis stage. */ /* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *); extern loop_vec_info vect_analyze_loop (struct loop *);
extern void vect_free_slp_tree (slp_tree); extern void vect_free_slp_instance (slp_instance);
extern loop_vec_info vect_analyze_loop_form (struct loop *); extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *, extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
HOST_WIDE_INT *); HOST_WIDE_INT *);
...@@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info); ...@@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info);
/** In tree-vect-transform.c **/ /** In tree-vect-transform.c **/
extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *, extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *,
slp_tree); slp_tree, slp_instance);
extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *, extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *,
slp_tree); slp_tree);
extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *, extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *,
...@@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *, ...@@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type, extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
slp_tree); slp_tree);
extern void vect_model_load_cost (stmt_vec_info, int, slp_tree); extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *,
gimple_stmt_iterator *, int, slp_instance, bool);
/* Driver for transformation stage. */ /* Driver for transformation stage. */
extern void vect_transform_loop (loop_vec_info); extern void vect_transform_loop (loop_vec_info);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment