Commit 2c23db6d by Evgeny Stupachenko Committed by Kirill Yukhin

re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))

gcc/
	* tree-vect-data-refs.c (vect_grouped_load_supported): New
	check for loads group of length 3.
	(vect_permute_load_chain): New permutations for loads group of
	length 3.
	* tree-vect-stmts.c (vect_model_load_cost): Change cost
	of vec_perm_shuffle for the new permutations.

gcc/testsuite/
	PR tree-optimization/52252
	* gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3.

From-SVN: r210155
parent 586199f3
2014-05-07 Evgeny Stupachenko <evstupac@gmail.com>
* tree-vect-data-refs.c (vect_grouped_load_supported): New
check for loads group of length 3.
(vect_permute_load_chain): New permutations for loads group of
length 3.
* tree-vect-stmts.c (vect_model_load_cost): Change cost
of vec_perm_shuffle for the new permutations.
2014-05-07 Alan Lawrence <alan.lawrence@arm.com> 2014-05-07 Alan Lawrence <alan.lawrence@arm.com>
* config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8, * config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8,
......
2014-05-07 Evgeny Stupachenko <evstupac@gmail.com>
PR tree-optimization/52252
* gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3.
2014-05-07 Alan Lawrence <alan.lawrence@arm.com> 2014-05-07 Alan Lawrence <alan.lawrence@arm.com>
* gcc.target/aarch64/simd/vrev16p8_1.c: New file. * gcc.target/aarch64/simd/vrev16p8_1.c: New file.
......
/* { dg-do compile } */
/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
#define byte unsigned char
void
matrix_mul (byte *in, byte *out, int size)
{
int i;
for (i = 0; i < size; i++)
{
byte in0 = in[0];
byte in1 = in[1];
byte in2 = in[2];
byte out0, out1, out2, out3;
out0 = in0 + in1;
out1 = in0 + in2;
out2 = in1 + in2;
out3 = in0 + in1 + in2;
out[0] = out0;
out[1] = out1;
out[2] = out2;
out[3] = out3;
in += 3;
out += 4;
}
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
...@@ -4810,36 +4810,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) ...@@ -4810,36 +4810,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
{ {
enum machine_mode mode = TYPE_MODE (vectype); enum machine_mode mode = TYPE_MODE (vectype);
/* vect_permute_load_chain requires the group size to be a power of two. */ /* vect_permute_load_chain requires the group size to be equal to 3 or
if (exact_log2 (count) == -1) be a power of two. */
if (count != 3 && exact_log2 (count) == -1)
{ {
if (dump_enabled_p ()) if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"the size of the group of accesses" "the size of the group of accesses"
" is not a power of 2\n"); " is not a power of 2 or not equal to 3\n");
return false; return false;
} }
/* Check that the permutation is supported. */ /* Check that the permutation is supported. */
if (VECTOR_MODE_P (mode)) if (VECTOR_MODE_P (mode))
{ {
unsigned int i, nelt = GET_MODE_NUNITS (mode); unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
unsigned char *sel = XALLOCAVEC (unsigned char, nelt); unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
for (i = 0; i < nelt; i++) if (count == 3)
sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel))
{ {
unsigned int k;
for (k = 0; k < 3; k++)
{
for (i = 0; i < nelt; i++)
if (3 * i + k < 2 * nelt)
sel[i] = 3 * i + k;
else
sel[i] = 0;
if (!can_vec_perm_p (mode, false, sel))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"shuffle of 3 loads is not supported by"
" target\n");
return false;
}
for (i = 0, j = 0; i < nelt; i++)
if (3 * i + k < 2 * nelt)
sel[i] = i;
else
sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
if (!can_vec_perm_p (mode, false, sel))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"shuffle of 3 loads is not supported by"
" target\n");
return false;
}
}
return true;
}
else
{
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (exact_log2 (count) != -1);
for (i = 0; i < nelt; i++) for (i = 0; i < nelt; i++)
sel[i] = i * 2 + 1; sel[i] = i * 2;
if (can_vec_perm_p (mode, false, sel)) if (can_vec_perm_p (mode, false, sel))
return true; {
} for (i = 0; i < nelt; i++)
sel[i] = i * 2 + 1;
if (can_vec_perm_p (mode, false, sel))
return true;
}
}
} }
if (dump_enabled_p ()) if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"extract even/odd not supported by target\n"); "extract even/odd not supported by target\n");
return false; return false;
} }
...@@ -4857,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) ...@@ -4857,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
/* Function vect_permute_load_chain. /* Function vect_permute_load_chain.
Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
a power of 2, generate extract_even/odd stmts to reorder the input data a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
correctly. Return the final references for loads in RESULT_CHAIN. the input data correctly. Return the final references for loads in
RESULT_CHAIN.
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
The input is 4 vectors each containing 8 elements. We assign a number to each The input is 4 vectors each containing 8 elements. We assign a number to each
...@@ -4939,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, ...@@ -4939,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
{ {
tree data_ref, first_vect, second_vect; tree data_ref, first_vect, second_vect;
tree perm_mask_even, perm_mask_odd; tree perm_mask_even, perm_mask_odd;
tree perm3_mask_low, perm3_mask_high;
gimple perm_stmt; gimple perm_stmt;
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
unsigned int i, j, log_length = exact_log2 (length); unsigned int i, j, log_length = exact_log2 (length);
...@@ -4949,44 +4991,97 @@ vect_permute_load_chain (vec<tree> dr_chain, ...@@ -4949,44 +4991,97 @@ vect_permute_load_chain (vec<tree> dr_chain,
memcpy (result_chain->address (), dr_chain.address (), memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree)); length * sizeof (tree));
for (i = 0; i < nelt; ++i) if (length == 3)
sel[i] = i * 2;
perm_mask_even = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm_mask_even != NULL);
for (i = 0; i < nelt; ++i)
sel[i] = i * 2 + 1;
perm_mask_odd = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm_mask_odd != NULL);
for (i = 0; i < log_length; i++)
{ {
for (j = 0; j < length; j += 2) unsigned int k;
{
first_vect = dr_chain[j];
second_vect = dr_chain[j+1];
/* data_ref = permute_even (first_data_ref, second_data_ref); */ for (k = 0; k < 3; k++)
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); {
for (i = 0; i < nelt; i++)
if (3 * i + k < 2 * nelt)
sel[i] = 3 * i + k;
else
sel[i] = 0;
perm3_mask_low = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm3_mask_low != NULL);
for (i = 0, j = 0; i < nelt; i++)
if (3 * i + k < 2 * nelt)
sel[i] = i;
else
sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
perm3_mask_high = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm3_mask_high != NULL);
first_vect = dr_chain[0];
second_vect = dr_chain[1];
/* Create interleaving stmt (low part of):
low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
...}> */
data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect, first_vect, second_vect,
perm_mask_even); perm3_mask_low);
vect_finish_stmt_generation (stmt, perm_stmt, gsi); vect_finish_stmt_generation (stmt, perm_stmt, gsi);
(*result_chain)[j/2] = data_ref;
/* data_ref = permute_odd (first_data_ref, second_data_ref); */ /* Create interleaving stmt (high part of):
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
...}> */
first_vect = data_ref;
second_vect = dr_chain[2];
data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect, first_vect, second_vect,
perm_mask_odd); perm3_mask_high);
vect_finish_stmt_generation (stmt, perm_stmt, gsi); vect_finish_stmt_generation (stmt, perm_stmt, gsi);
(*result_chain)[j/2+length/2] = data_ref; (*result_chain)[k] = data_ref;
} }
memcpy (dr_chain.address (), result_chain->address (),
length * sizeof (tree));
} }
} else
{
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (exact_log2 (length) != -1);
for (i = 0; i < nelt; ++i)
sel[i] = i * 2;
perm_mask_even = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm_mask_even != NULL);
for (i = 0; i < nelt; ++i)
sel[i] = i * 2 + 1;
perm_mask_odd = vect_gen_perm_mask (vectype, sel);
gcc_assert (perm_mask_odd != NULL);
for (i = 0; i < log_length; i++)
{
for (j = 0; j < length; j += 2)
{
first_vect = dr_chain[j];
second_vect = dr_chain[j+1];
/* data_ref = permute_even (first_data_ref, second_data_ref); */
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
perm_mask_even);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
(*result_chain)[j/2] = data_ref;
/* data_ref = permute_odd (first_data_ref, second_data_ref); */
data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
first_vect, second_vect,
perm_mask_odd);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
(*result_chain)[j/2+length/2] = data_ref;
}
memcpy (dr_chain.address (), result_chain->address (),
length * sizeof (tree));
}
}
}
/* Function vect_transform_grouped_load. /* Function vect_transform_grouped_load.
......
...@@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, ...@@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
include the cost of the permutes. */ include the cost of the permutes. */
if (!load_lanes_p && group_size > 1) if (!load_lanes_p && group_size > 1)
{ {
/* Uses an even and odd extract operations for each needed permute. */ /* Uses an even and odd extract operations or shuffle operations
int nstmts = ncopies * exact_log2 (group_size) * group_size; for each needed permute. */
inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, int nstmts = ncopies * ceil_log2 (group_size) * group_size;
stmt_info, 0, vect_body); inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
stmt_info, 0, vect_body);
if (dump_enabled_p ()) if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, dump_printf_loc (MSG_NOTE, vect_location,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment