Commit 0fca40f5 by Ira Rosen Committed by Ira Rosen

target.h (struct vectorize): Add new target builtin.

	* target.h (struct vectorize): Add new target builtin.
	* tree-vectorizer.c (destroy_loop_vec_info): Call
	vect_free_slp_instance instead of vect_free_slp_tree.
	* tree-vectorizer.h (enum slp_load_perm_type): New.
	(struct _slp_instance): Add new fields.
	(SLP_INSTANCE_LOAD_PERMUTATION): New.
	(SLP_INSTANCE_LOADS): New.
	(vect_free_slp_tree): Remove.
	(vect_free_slp_instance): Declare.
	(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
	(vectorizable_load): Add argument.
	(vect_transform_slp_perm_load): New.
	* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
	vectorizable_load.
	(vect_get_place_in_interleaving_chain): New function.
	(vect_free_slp_tree): Make static.
	(vect_free_slp_instance): New function.
	(vect_build_slp_tree): Add new arguments. Allow load permutations and
	collect the load location in the interleaving chain.
	(vect_supported_slp_permutation_p): New function.
	(vect_supported_load_permutation_p): Likewise.
	(vect_analyze_slp_instance): In case of loads permutation, call
	vect_supported_load_permutation_p to check that the permutation is
	supported.
	* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
	* tree-vect-transform.c (vect_transform_stmt): Add new argument.
	(vect_create_mask_and_perm): New function.
	(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
	(vectorizable_load): Add an argument. Don't keep the created vectors
	statements in the node if permutation is required. Call
	vect_transform_slp_perm_load to generate the permutation.
	(vect_transform_stmt): Add new argument. Call vectorizable_load with
	additional argument.
	(vect_schedule_slp_instance): In case of loads permutation, allocate
	vectorized statements structure for all the related SLP nodes. Call
	vect_transform_stmt with additional argument.
	(vect_transform_loop): Call vect_transform_stmt with correct arguments.
	* config/spu/spu.c (spu_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
	* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
	* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.

From-SVN: r139706
parent b8c41c8e
2008-08-28 Ira Rosen <irar@il.ibm.com>
* target.h (struct vectorize): Add new target builtin.
* tree-vectorizer.c (destroy_loop_vec_info): Call
vect_free_slp_instance instead of vect_free_slp_tree.
* tree-vectorizer.h (enum slp_load_perm_type): New.
(struct _slp_instance): Add new fields.
(SLP_INSTANCE_LOAD_PERMUTATION): New.
(SLP_INSTANCE_LOADS): New.
(vect_free_slp_tree): Remove.
(vect_free_slp_instance): Declare.
(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
(vectorizable_load): Add argument.
(vect_transform_slp_perm_load): New.
* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
vectorizable_load.
(vect_get_place_in_interleaving_chain): New function.
(vect_free_slp_tree): Make static.
(vect_free_slp_instance): New function.
(vect_build_slp_tree): Add new arguments. Allow load permutations and
collect the load location in the interleaving chain.
(vect_supported_slp_permutation_p): New function.
(vect_supported_load_permutation_p): Likewise.
(vect_analyze_slp_instance): In case of loads permutation, call
vect_supported_load_permutation_p to check that the permutation is
supported.
* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
* tree-vect-transform.c (vect_transform_stmt): Add new argument.
(vect_create_mask_and_perm): New function.
(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
(vectorizable_load): Add an argument. Don't keep the created vectors
statements in the node if permutation is required. Call
vect_transform_slp_perm_load to generate the permutation.
(vect_transform_stmt): Add new argument. Call vectorizable_load with
additional argument.
(vect_schedule_slp_instance): In case of loads permutation, allocate
vectorized statements structure for all the related SLP nodes. Call
vect_transform_stmt with additional argument.
(vect_transform_loop): Call vect_transform_stmt with correct arguments.
* config/spu/spu.c (spu_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
2008-08-28 Chris Fairles <chris.fairles@gmail.com>
* gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach,
......
......@@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void);
static tree rs6000_builtin_mul_widen_even (tree);
static tree rs6000_builtin_mul_widen_odd (tree);
static tree rs6000_builtin_conversion (enum tree_code, tree);
static tree rs6000_builtin_vec_perm (tree, tree *);
static void def_builtin (int, const char *, tree, int);
static bool rs6000_vector_alignment_reachable (const_tree, bool);
......@@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] =
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
......@@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac
}
}
/* Implement targetm.vectorize.builtin_vec_perm.

   Store the element type of the permutation mask (always unsigned char)
   in *MASK_ELEMENT_TYPE and return the decl of the AltiVec vperm builtin
   that matches the machine mode of vector type TYPE.  Return NULL_TREE
   for any mode other than V16QI, V8HI, V4SI and V4SF.  */
tree
rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
{
  int builtin_code;
  tree decl;

  *mask_element_type = unsigned_char_type_node;

  /* Pick the builtin code first; the decl table is consulted once below.  */
  switch (TYPE_MODE (type))
    {
    case V16QImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_16QI;
      break;

    case V8HImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_8HI;
      break;

    case V4SImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_4SI;
      break;

    case V4SFmode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_4SF;
      break;

    default:
      return NULL_TREE;
    }

  decl = rs6000_builtin_decls[builtin_code];
  gcc_assert (decl);
  return decl;
}
/* Handle generic options of the form -mfoo=yes/no.
NAME is the option name.
VALUE is the option value.
......
......@@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree);
static tree spu_builtin_mask_for_load (void);
static int spu_builtin_vectorization_cost (bool);
static bool spu_vector_alignment_reachable (const_tree, bool);
static tree spu_builtin_vec_perm (tree, tree *);
static int spu_sms_res_mii (struct ddg *g);
extern const char *reg_names[];
......@@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[];
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
......@@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed
return true;
}
/* Implement targetm.vectorize.builtin_vec_perm.

   Store the element type of the permutation mask (always unsigned char)
   in *MASK_ELEMENT_TYPE and return the fndecl of the SPU shuffle builtin
   that matches vector type TYPE.  Integer vector modes have separate
   entries for unsigned and signed element types; the two floating-point
   modes have one entry each.  Return NULL_TREE for any other mode.  */
tree
spu_builtin_vec_perm (tree type, tree *mask_element_type)
{
  int shuffle_index;
  struct spu_builtin_description *desc;

  *mask_element_type = unsigned_char_type_node;

  /* Select the index into spu_builtins; the lookup itself happens once
     after the switch.  */
  switch (TYPE_MODE (type))
    {
    case V16QImode:
      shuffle_index = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_0 : SPU_SHUFFLE_1;
      break;

    case V8HImode:
      shuffle_index = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_2 : SPU_SHUFFLE_3;
      break;

    case V4SImode:
      shuffle_index = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_4 : SPU_SHUFFLE_5;
      break;

    case V2DImode:
      shuffle_index = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_6 : SPU_SHUFFLE_7;
      break;

    case V4SFmode:
      shuffle_index = SPU_SHUFFLE_8;
      break;

    case V2DFmode:
      shuffle_index = SPU_SHUFFLE_9;
      break;

    default:
      return NULL_TREE;
    }

  desc = &spu_builtins[shuffle_index];
  gcc_assert (desc);
  return desc->fndecl;
}
/* Count the total number of instructions in each pipe and return the
maximum, which is used as the Minimum Iteration Interval (MII)
in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1.
......
......@@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
#undef TARG_VEC_STORE_COST
#define TARG_VEC_STORE_COST 1
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* Misc */
......
......@@ -364,6 +364,7 @@
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0
#define TARGET_VECTOR_ALIGNMENT_REACHABLE \
default_builtin_vector_alignment_reachable
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0
#define TARGET_VECTORIZE \
{ \
......@@ -373,7 +374,8 @@
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \
TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \
TARGET_VECTOR_ALIGNMENT_REACHABLE \
TARGET_VECTOR_ALIGNMENT_REACHABLE, \
TARGET_VECTORIZE_BUILTIN_VEC_PERM \
}
#define TARGET_DEFAULT_TARGET_FLAGS 0
......
......@@ -438,7 +438,10 @@ struct gcc_target
/* Return true if vector alignment is reachable (by peeling N
iterations) for the given type. */
bool (* vector_alignment_reachable) (const_tree, bool);
} vectorize;
/* Target builtin that implements vector permute. */
tree (* builtin_vec_perm) (tree, tree*);
} vectorize;
/* The initial value of target_flags. */
int default_target_flags;
......
2008-08-28 Ira Rosen <irar@il.ibm.com>
* lib/target-supports.exp (check_effective_target_vect_perm): New.
* gcc.dg/vect/slp-perm-1.c: New testcase.
* gcc.dg/vect/slp-perm-2.c: New testcase.
* gcc.dg/vect/slp-perm-3.c: New testcase.
* gcc.dg/vect/slp-perm-4.c: New testcase.
* gcc.dg/vect/slp-perm-5.c: New testcase.
* gcc.dg/vect/slp-perm-6.c: New testcase.
* gcc.dg/vect/slp-perm-7.c: New testcase.
* gcc.dg/vect/slp-perm-8.c: New testcase.
* gcc.dg/vect/slp-perm-9.c: New testcase.
2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org>
PR 37217
......
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define N 16
/* Kernel under test.  Each of the N / 3 iterations reads three consecutive
   values a, b, c from pInput and stores three consecutive values to pOutput;
   output k is Mk0 * a + Mk1 * b + Mk2 * c.  Every store uses all three loads,
   so the loads are consumed in a different order than they are laid out
   relative to the stores.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Three adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Three adjacent stores forming one group.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
}
}
/* Test driver: fill input with i % 256, clear output, run foo and abort on
   the first element that differs from the precomputed check_results table.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
/* foo stores 3 * (N / 3) elements; the remaining entry stays 0, which is
   why the table ends with 0.  */
unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
/* Compare every element, including the ones foo never wrote.  */
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M01 1322
#define M11 13
#define M02 74
#define M12 191
#define N 16
/* Kernel under test.  Each of the N / 2 iterations reads two consecutive
   values a, b from pInput and stores two consecutive values to pOutput;
   output k is Mk0 * a + Mk1 * b, so both stores use both loads.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b;
for (i = 0; i < N / 2; i++)
{
/* Two adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
/* Two adjacent stores forming one group.  */
*pOutput++ = M00 * a + M01 * b;
*pOutput++ = M10 * a + M11 * b;
}
}
/* Test driver: fill input with i % 256, clear output, run foo and abort on
   the first element that differs from the precomputed check_results table.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
/* foo stores 2 * (N / 2) elements, so every entry is a computed value.  */
unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219};
for (i = 0; i < N; i++)
{
input[i] = i%256;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define N 16
/* Kernel under test.  Each of the N / 4 iterations reads four consecutive
   values a, b, c, d from pInput and stores four consecutive values to
   pOutput; output k is Mk0 * a + Mk1 * b + Mk2 * c + Mk3 * d, so every store
   uses all four loads.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d;
for (i = 0; i < N / 4; i++)
{
/* Four adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
/* Four adjacent stores forming one group.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;
}
}
/* Test driver: fill input with i % 256, clear output, run foo and abort on
   the first element that differs from the precomputed check_results table.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  /* foo stores 4 * (N / 4) elements, so every entry is a computed value.  */
  unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      /* NOTE(review): this abort cannot fire for the values stored above;
         presumably it exists to keep this initialization loop from being
         vectorized -- confirm.  */
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Verify all N results.  The bound used to be N - N, i.e. zero, which
     skipped the comparison entirely and let wrong code pass.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M40 437
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M41 284
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M42 1114
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define M43 71
#define M04 334
#define M14 147
#define M24 115
#define M34 7716
#define M44 16
#define N 16
/* Kernel under test.  Each of the N / 5 iterations reads five consecutive
   values a..e from pInput and stores five consecutive values to pOutput;
   output k is Mk0 * a + Mk1 * b + Mk2 * c + Mk3 * d + Mk4 * e, so every
   store uses all five loads.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d, e;
for (i = 0; i < N / 5; i++)
{
/* Five adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
e = *pInput++;
/* Five adjacent stores forming one group.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
*pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
}
}
/* Test driver: fill input with i % 256, clear output, run foo and abort on
   the first element that differs from the precomputed check_results table.
   Returns 0 on success.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  /* foo stores 5 * (N / 5) elements; the remaining entry stays 0, which is
     why the table ends with 0.  */
  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      /* NOTE(review): this abort cannot fire for the values stored above;
         presumably it exists to keep this initialization loop from being
         vectorized -- confirm.  */
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Verify all N results.  The bound used to be N - N, i.e. zero, which
     skipped the comparison entirely and let wrong code pass.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Kernel under test with two independent load/store groups in one loop.
   Each of the N / 3 iterations reads three values from pInput and two from
   pInput2.  pOutput receives three values Mk0 * a + Mk1 * b + Mk2 * c;
   pOutput2 receives two values Kk0 * d + Kk1 * e.  In both groups every
   store uses every load of its group.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
/* First group: three adjacent loads.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Second group: two adjacent loads.  */
d = *pInput2++;
e = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
*pOutput2++ = K00 * d + K01 * e;
*pOutput2++ = K10 * d + K11 * e;
}
}
/* Test driver: fill both inputs with i % 256, clear both outputs, run foo
   and abort on the first element of either output that differs from its
   precomputed table.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* foo stores 3 * (N / 3) elements to output; the remaining entry stays 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo stores 2 * (N / 3) elements to output2; the remaining entries stay 0.  */
int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Kernel under test with two independent load/store groups in one loop.
   Each of the N / 3 iterations reads three values from pInput and two from
   pInput2.  pOutput receives three values Mk0 * a + Mk1 * b + Mk2 * c, each
   using all three loads.  pOutput2 receives K00 * d and K10 * e, i.e. each
   store uses only the load at the same position in its group.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
/* First group: three adjacent loads.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Second group: two adjacent loads.  */
d = *pInput2++;
e = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Regular SLP - no permutation required. */
*pOutput2++ = K00 * d;
*pOutput2++ = K10 * e;
}
}
/* Test driver: fill both inputs with i % 256, clear both outputs, run foo
   and abort on the first element of either output that differs from its
   precomputed table.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* foo stores 3 * (N / 3) elements to output; the remaining entry stays 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo stores 2 * (N / 3) elements to output2; the remaining entries stay 0.  */
int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* SLP with load permutation and loop-based vectorization.
   Each of the N / 3 iterations reads three values from pInput and one from
   pInput2.  pOutput receives three values Mk0 * a + Mk1 * b + Mk2 * c, each
   using all three loads; pOutput2 receives the single value K00 * d.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d;
for (i = 0; i < N / 3; i++)
{
/* Group of three adjacent loads.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Single load, advancing by one element per iteration.  */
d = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Loop-based vectorization. */
*pOutput2++ = K00 * d;
}
}
/* Test driver: fill both inputs with i % 256, clear both outputs, run foo
   and abort on the first element of either output that differs from its
   precomputed table.  Returns 0 on success.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* foo stores 3 * (N / 3) elements to output; the remaining entry stays 0.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo stores N / 3 elements to output2; the remaining entries stay 0.  */
int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 200)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* Kernel under test on unsigned char data.  Each of the N / 3 iterations
   reads three consecutive bytes a, b, c from pInput and stores three
   consecutive bytes to pOutput: a + b + c plus 3, 12 and 1 respectively.
   Each sum is converted to unsigned char when stored.  */
void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput)
{
unsigned char i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Three adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Three adjacent stores; each one uses all three loads.  */
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Test driver: fill input with 0 .. N-1, clear output, compute the expected
   values at run time, run foo and abort on the first mismatch.  Returns 0
   on success.  */
int main (int argc, const char* argv[])
{
unsigned char input[N], output[N], i;
unsigned char check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* NOTE(review): this abort cannot fire (input[i] is an unsigned char);
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 256)
abort ();
}
/* With input[j] == j, group i holds 3i, 3i+1, 3i+2, whose sum is 9i + 3;
   adding foo's constants 3, 12 and 1 gives the three values below.  The
   assignments convert to unsigned char just as foo's stores do.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the first N - (N % 3) entries of check_results were initialized
   above, so the comparison stops there.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* Kernel under test on unsigned short data.  Each of the N / 3 iterations
   reads three consecutive values a, b, c from pInput and stores three
   consecutive values to pOutput: a + b + c plus 3, 12 and 1 respectively.
   Each sum is converted to unsigned short when stored.  */
void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
{
unsigned short i, a, b, c;
for (i = 0; i < N / 3; i++)
{
/* Three adjacent loads forming one group.  */
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Three adjacent stores; each one uses all three loads.  */
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Test driver: fill input with 0 .. N-1, clear output, compute the expected
   values at run time, run foo and abort on the first mismatch.  Returns 0
   on success.  */
int main (int argc, const char* argv[])
{
unsigned short input[N], output[N], i;
unsigned short check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* NOTE(review): this abort cannot fire for the values stored above;
   presumably it exists to keep this initialization loop from being
   vectorized -- confirm.  */
if (input[i] > 256)
abort ();
}
/* With input[j] == j, group i holds 3i, 3i+1, 3i+2, whose sum is 9i + 3;
   adding foo's constants 3, 12 and 1 gives the three values below.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the first N - (N % 3) entries of check_results were initialized
   above, so the comparison stops there.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
......@@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved
}
# Return 1 if the target plus current options supports vector permutation,
# 0 otherwise.
#
# This won't change for different subtargets so cache the result.
proc check_effective_target_vect_perm { } {
    # The cache variable has to be the global that is read and written
    # below.  Declaring a differently named global left
    # et_vect_perm_saved local to the proc, so the cached branch was never
    # taken and the target was re-tested on every call.
    global et_vect_perm_saved

    if [info exists et_vect_perm_saved] {
        verbose "check_effective_target_vect_perm: using cached result" 2
    } else {
        # Default to "unsupported"; only the targets listed here are
        # reported as providing vector permutation.
        set et_vect_perm_saved 0
        if { [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_saved 1
        }
    }
    verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
    return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
# A target can also support this widening summation if it can support
......
......@@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
vect_free_slp_instance (instance);
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
......
......@@ -105,6 +105,8 @@ typedef struct _slp_tree {
} cost;
} *slp_tree;
DEF_VEC_P(slp_tree);
DEF_VEC_ALLOC_P(slp_tree, heap);
/* SLP instance is a sequence of stmts in a loop that can be packed into
SIMD stmts. */
......@@ -124,6 +126,13 @@ typedef struct _slp_instance {
int outside_of_loop; /* Statements generated outside loop. */
int inside_of_loop; /* Statements generated inside loop. */
} cost;
/* Loads permutation relatively to the stores, NULL if there is no
permutation. */
VEC (int, heap) *load_permutation;
/* The group of nodes that contain loads of this SLP instance. */
VEC (slp_tree, heap) *loads;
} *slp_instance;
DEF_VEC_P(slp_instance);
......@@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation
#define SLP_INSTANCE_LOADS(S) (S)->loads
#define SLP_TREE_LEFT(S) (S)->left
#define SLP_TREE_RIGHT(S) (S)->right
......@@ -522,6 +533,11 @@ typedef struct _stmt_vec_info {
#define TARG_VEC_STORE_COST 1
#endif
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* The maximum number of intermediate steps required in multi-step type
conversion. */
#define MAX_INTERM_CVT_STEPS 3
......@@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt);
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
extern void vect_free_slp_tree (slp_tree);
extern void vect_free_slp_instance (slp_instance);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
HOST_WIDE_INT *);
......@@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info);
/** In tree-vect-transform.c **/
extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
slp_tree, slp_instance);
extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *,
......@@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
slp_tree);
extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *,
gimple_stmt_iterator *, int, slp_instance, bool);
/* Driver for transformation stage. */
extern void vect_transform_loop (loop_vec_info);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment