Commit a4ee446d by Jakub Jelinek, committed by Jakub Jelinek

re PR tree-optimization/57233 (Vector lowering of LROTATE_EXPR pessimizes code)

	PR tree-optimization/57233
	PR tree-optimization/61299
	* tree-vect-generic.c (get_compute_type, count_type_subparts): New
	functions.
	(expand_vector_operations_1): Use them.  If {L,R}ROTATE_EXPR
	would be lowered to scalar shifts, check if corresponding
	shifts and vector BIT_IOR_EXPR are supported and don't lower
	or lower just to narrower vector type in that case.
	* expmed.c (expand_shift_1): Fix up handling of vector
	shifts and rotates.

	* gcc.dg/pr57233.c: New test.
	* gcc.target/i386/pr57233.c: New test.
	* gcc.target/i386/sse2-pr57233.c: New test.
	* gcc.target/i386/avx-pr57233.c: New test.
	* gcc.target/i386/avx2-pr57233.c: New test.
	* gcc.target/i386/avx512f-pr57233.c: New test.
	* gcc.target/i386/xop-pr57233.c: New test.

From-SVN: r212063
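For context, the pessimization tracked by PR57233 concerns the generic vector rotate idiom below. This is a minimal sketch distilled from the new testcases (the function name is illustrative, not part of the commit):

```c
/* GCC generic vector extension: a 128-bit vector of four 32-bit lanes.  */
typedef unsigned V4 __attribute__((vector_size (4 * sizeof (int))));

V4
rotate_each_lane_left_2 (V4 x)
{
  /* fold recognizes this shift/or pair as a single LROTATE_EXPR by 2.
     Before this commit, on targets with vector shifts but no vector
     rotate instruction, the veclower pass split the rotate into scalar
     rotates on each lane; afterwards it re-expands the rotate as two
     vector shifts plus a vector OR, or moves to a narrower vector type
     that supports all three operations.  */
  return (x << 2) | (x >> 30);
}
```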
gcc/expmed.c
@@ -2128,9 +2128,12 @@ expand_shift_1 (enum tree_code code, enum machine_mode mode, rtx shifted,
   optab lrotate_optab = rotl_optab;
   optab rrotate_optab = rotr_optab;
   enum machine_mode op1_mode;
+  enum machine_mode scalar_mode = mode;
   int attempt;
   bool speed = optimize_insn_for_speed_p ();
 
+  if (VECTOR_MODE_P (mode))
+    scalar_mode = GET_MODE_INNER (mode);
+
   op1 = amount;
   op1_mode = GET_MODE (op1);
@@ -2153,9 +2156,9 @@ expand_shift_1 (enum tree_code code, enum machine_mode mode, rtx shifted,
     {
       if (CONST_INT_P (op1)
           && ((unsigned HOST_WIDE_INT) INTVAL (op1) >=
-              (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (mode)))
+              (unsigned HOST_WIDE_INT) GET_MODE_BITSIZE (scalar_mode)))
         op1 = GEN_INT ((unsigned HOST_WIDE_INT) INTVAL (op1)
-                       % GET_MODE_BITSIZE (mode));
+                       % GET_MODE_BITSIZE (scalar_mode));
       else if (GET_CODE (op1) == SUBREG
                && subreg_lowpart_p (op1)
                && SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (op1)))
@@ -2169,10 +2172,10 @@ expand_shift_1 (enum tree_code code, enum machine_mode mode, rtx shifted,
      amount instead.  */
   if (rotate
       && CONST_INT_P (op1)
-      && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (mode) / 2 + left,
-                   GET_MODE_BITSIZE (mode) - 1))
+      && IN_RANGE (INTVAL (op1), GET_MODE_BITSIZE (scalar_mode) / 2 + left,
+                   GET_MODE_BITSIZE (scalar_mode) - 1))
     {
-      op1 = GEN_INT (GET_MODE_BITSIZE (mode) - INTVAL (op1));
+      op1 = GEN_INT (GET_MODE_BITSIZE (scalar_mode) - INTVAL (op1));
       left = !left;
       code = left ? LROTATE_EXPR : RROTATE_EXPR;
     }
@@ -2185,7 +2188,7 @@ expand_shift_1 (enum tree_code code, enum machine_mode mode, rtx shifted,
   if (code == LSHIFT_EXPR
       && CONST_INT_P (op1)
       && INTVAL (op1) > 0
-      && INTVAL (op1) < GET_MODE_PRECISION (mode)
+      && INTVAL (op1) < GET_MODE_PRECISION (scalar_mode)
       && INTVAL (op1) < MAX_BITS_PER_WORD
       && (shift_cost (speed, mode, INTVAL (op1))
           > INTVAL (op1) * add_cost (speed, mode))
......@@ -2240,14 +2243,14 @@ expand_shift_1 (enum tree_code code, enum machine_mode mode, rtx shifted,
if (op1 == const0_rtx)
return shifted;
else if (CONST_INT_P (op1))
other_amount = GEN_INT (GET_MODE_BITSIZE (mode)
other_amount = GEN_INT (GET_MODE_BITSIZE (scalar_mode)
- INTVAL (op1));
else
{
other_amount
= simplify_gen_unary (NEG, GET_MODE (op1),
op1, GET_MODE (op1));
HOST_WIDE_INT mask = GET_MODE_PRECISION (mode) - 1;
HOST_WIDE_INT mask = GET_MODE_PRECISION (scalar_mode) - 1;
other_amount
= simplify_gen_binary (AND, GET_MODE (op1), other_amount,
gen_int_mode (mask, GET_MODE (op1)));
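The common thread in the expmed.c hunks: when MODE is a vector mode, shift and rotate amounts are counted per element, so the bit-size and precision checks must use the inner scalar mode. A hedged sketch of the arithmetic in plain C (constants chosen for the V4SI case; this is not GCC internals):

```c
#include <assert.h>

/* For V4SI, GET_MODE_BITSIZE (mode) is 128 (the whole vector) while
   GET_MODE_BITSIZE (scalar_mode) is 32 (one lane).  Reducing or
   canonicalizing a rotate count against 128 instead of 32 yields a
   nonsense per-lane amount; hence the mode -> scalar_mode changes.  */
static unsigned
rotl_lane (unsigned x, unsigned n)
{
  n %= 32;                      /* lane width == scalar_mode bitsize */
  return n ? (x << n) | (x >> (32 - n)) : x;
}

int
main (void)
{
  /* The IN_RANGE hunk: a rotate left by 30 is canonicalized into the
     opposite-direction rotate using the lane width: 32 - 30 = 2.  */
  unsigned v = 0xdeadbeefU;
  assert (rotl_lane (v, 30) == ((v >> 2) | (v << 30)));
  return 0;
}
```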
gcc/testsuite/gcc.dg/pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target { ilp32 || lp64 } } } */
/* { dg-options "-O2" } */

typedef unsigned V4 __attribute__((vector_size (4 * sizeof (int))));
typedef unsigned V8 __attribute__((vector_size (8 * sizeof (int))));
typedef unsigned V16 __attribute__((vector_size (16 * sizeof (int))));

V4 a, b, g;
V8 c, d, h;
V16 e, f, j;

__attribute__((noinline)) void
f1 (void)
{
  a = (a << 2) | (a >> 30);
}

__attribute__((noinline)) void
f2 (void)
{
  a = (a << 30) | (a >> 2);
}

__attribute__((noinline)) void
f3 (void)
{
  a = (a << b) | (a >> (32 - b));
}

__attribute__((noinline, noclone)) void
f4 (int x)
{
  a = (a << x) | (a >> (32 - x));
}

__attribute__((noinline)) void
f5 (void)
{
  c = (c << 2) | (c >> 30);
}

__attribute__((noinline)) void
f6 (void)
{
  c = (c << 30) | (c >> 2);
}

__attribute__((noinline)) void
f7 (void)
{
  c = (c << d) | (c >> (32 - d));
}

__attribute__((noinline, noclone)) void
f8 (int x)
{
  c = (c << x) | (c >> (32 - x));
}

__attribute__((noinline)) void
f9 (void)
{
  e = (e << 2) | (e >> 30);
}

__attribute__((noinline)) void
f10 (void)
{
  e = (e << 30) | (e >> 2);
}

__attribute__((noinline)) void
f11 (void)
{
  e = (e << f) | (e >> (32 - f));
}

__attribute__((noinline, noclone)) void
f12 (int x)
{
  e = (e << x) | (e >> (32 - x));
}
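/* Pseudo-random input generator; this appears to be Marsaglia's
   xorshift128 scheme, seeded with fixed constants so every run is
   reproducible.  */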
unsigned
r (void)
{
  static unsigned x = 0xdeadbeefU;
  static unsigned y = 0x12347654U;
  static unsigned z = 0x1a2b3c4dU;
  static unsigned w = 0x87654321U;
  unsigned t = x ^ (x << 11);
  x = y;
  y = z;
  z = w;
  w = w ^ (w >> 19) ^ t ^ (t >> 8);
  return w;
}

void
init (unsigned int *p, int count, int mod)
{
  int i;
  for (i = 0; i < count; i++)
    {
      unsigned int v = r ();
      if (mod)
        v = (v % 31) + 1;
      p[i] = v;
    }
}

void
check (unsigned int *p, unsigned int *q, int count, unsigned int *s, int ss)
{
  int i;
  for (i = 0; i < count; i++)
    {
      if (s)
        ss = s[i];
      if (p[i] != ((q[i] << ss) | (q[i] >> (32 - ss))))
        __builtin_abort ();
    }
}

int
main ()
{
  init ((unsigned int *) &a, 4, 0);
  init ((unsigned int *) &b, 4, 1);
  init ((unsigned int *) &c, 8, 0);
  init ((unsigned int *) &d, 8, 1);
  init ((unsigned int *) &e, 16, 0);
  init ((unsigned int *) &f, 16, 1);
  g = a;
  h = c;
  j = e;
  f1 ();
  f5 ();
  f9 ();
  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 2);
  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 2);
  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 2);
  g = a;
  h = c;
  j = e;
  f2 ();
  f6 ();
  f10 ();
  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 30);
  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 30);
  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 30);
  g = a;
  h = c;
  j = e;
  f3 ();
  f7 ();
  f11 ();
  check ((unsigned int *) &a, (unsigned int *) &g, 4, (unsigned int *) &b, 0);
  check ((unsigned int *) &c, (unsigned int *) &h, 8, (unsigned int *) &d, 0);
  check ((unsigned int *) &e, (unsigned int *) &j, 16, (unsigned int *) &f, 0);
  g = a;
  h = c;
  j = e;
  f4 (5);
  f8 (5);
  f12 (5);
  check ((unsigned int *) &a, (unsigned int *) &g, 4, 0, 5);
  check ((unsigned int *) &c, (unsigned int *) &h, 8, 0, 5);
  check ((unsigned int *) &e, (unsigned int *) &j, 16, 0, 5);
  return 0;
}
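The target-specific wrappers below all reuse this generic test: each supplies the `*_test` hook expected by its runtime-check harness (`avx-check.h` and friends), renames `main` to `do_main`, and textually includes `gcc.dg/pr57233.c`. Only `i386/pr57233.c` is a compile test, asserting via scan-assembler that the rotate really stays as vector shifts.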
gcc/testsuite/gcc.target/i386/avx-pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target avx } } */
/* { dg-options "-O2 -mavx" } */

#include "avx-check.h"

static void
avx_test (void)
{
  do_main ();
}

#undef main
#define main() do_main ()
#include "../../gcc.dg/pr57233.c"
gcc/testsuite/gcc.target/i386/avx2-pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target avx2 } } */
/* { dg-options "-O2 -mavx2" } */

#include "avx2-check.h"

static void
avx2_test (void)
{
  do_main ();
}

#undef main
#define main() do_main ()
#include "../../gcc.dg/pr57233.c"
gcc/testsuite/gcc.target/i386/avx512f-pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target avx512f } } */
/* { dg-options "-O2 -mavx512f" } */

#include "avx512f-check.h"

static void
avx512f_test (void)
{
  do_main ();
}

#undef main
#define main() do_main ()
#include "../../gcc.dg/pr57233.c"
gcc/testsuite/gcc.target/i386/pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do compile { target avx } } */
/* { dg-options "-O2 -mavx -mno-xop" } */

typedef unsigned V4 __attribute__((vector_size (4 * sizeof (int))));

V4 a;

__attribute__((noinline)) void
foo (void)
{
  a = (a << 2) | (a >> 30);
}

/* { dg-final { scan-assembler "vpsrld\[^\n\r]*30" } } */
/* { dg-final { scan-assembler "vpslld\[^\n\r]*2" } } */
gcc/testsuite/gcc.target/i386/sse2-pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target sse2 } } */
/* { dg-options "-O2 -msse2" } */

#include "sse2-check.h"

static void
sse2_test (void)
{
  do_main ();
}

#undef main
#define main() do_main ()
#include "../../gcc.dg/pr57233.c"
gcc/testsuite/gcc.target/i386/xop-pr57233.c [new file]

/* PR tree-optimization/57233 */
/* { dg-do run { target xop } } */
/* { dg-options "-O2 -mxop" } */

#include "xop-check.h"

static void
xop_test (void)
{
  do_main ();
}

#undef main
#define main() do_main ()
#include "../../gcc.dg/pr57233.c"
gcc/tree-vect-generic.c
@@ -1334,15 +1334,67 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
   update_stmt (gsi_stmt (*gsi));
 }
 
+/* Return type in which CODE operation with optab OP can be
+   computed.  */
+
+static tree
+get_compute_type (enum tree_code code, optab op, tree type)
+{
+  /* For very wide vectors, try using a smaller vector mode.  */
+  tree compute_type = type;
+  if (op
+      && (!VECTOR_MODE_P (TYPE_MODE (type))
+          || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing))
+    {
+      tree vector_compute_type
+        = type_for_widest_vector_mode (TREE_TYPE (type), op);
+      if (vector_compute_type != NULL_TREE
+          && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
+              < TYPE_VECTOR_SUBPARTS (compute_type))
+          && (optab_handler (op, TYPE_MODE (vector_compute_type))
+              != CODE_FOR_nothing))
+        compute_type = vector_compute_type;
+    }
+
+  /* If we are breaking a BLKmode vector into smaller pieces,
+     type_for_widest_vector_mode has already looked into the optab,
+     so skip these checks.  */
+  if (compute_type == type)
+    {
+      enum machine_mode compute_mode = TYPE_MODE (compute_type);
+      if (VECTOR_MODE_P (compute_mode))
+        {
+          if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
+            return compute_type;
+          if (code == MULT_HIGHPART_EXPR
+              && can_mult_highpart_p (compute_mode,
+                                      TYPE_UNSIGNED (compute_type)))
+            return compute_type;
+        }
+      /* There is no operation in hardware, so fall back to scalars.  */
+      compute_type = TREE_TYPE (type);
+    }
+
+  return compute_type;
+}
+
+/* Helper function of expand_vector_operations_1.  Return number of
+   vector elements for vector types or 1 for other types.  */
+
+static inline int
+count_type_subparts (tree type)
+{
+  return VECTOR_TYPE_P (type) ? TYPE_VECTOR_SUBPARTS (type) : 1;
+}
+
 /* Process one statement.  If we identify a vector operation, expand it.  */
 
 static void
 expand_vector_operations_1 (gimple_stmt_iterator *gsi)
 {
   gimple stmt = gsi_stmt (*gsi);
-  tree lhs, rhs1, rhs2 = NULL, type, compute_type;
+  tree lhs, rhs1, rhs2 = NULL, type, compute_type = NULL_TREE;
   enum tree_code code;
-  enum machine_mode compute_mode;
   optab op = unknown_optab;
   enum gimple_rhs_class rhs_class;
   tree new_rhs;
@@ -1455,11 +1507,76 @@ expand_vector_operations_1 (gimple_stmt_iterator *gsi)
         {
           op = optab_for_tree_code (code, type, optab_scalar);
 
+          compute_type = get_compute_type (code, op, type);
+          if (compute_type == type)
+            return;
           /* The rtl expander will expand vector/scalar as vector/vector
-             if necessary.  Don't bother converting the stmt here.  */
-          if (optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing
-              && optab_handler (opv, TYPE_MODE (type)) != CODE_FOR_nothing)
+             if necessary.  Pick one with wider vector type.  */
+          tree compute_vtype = get_compute_type (code, opv, type);
+          if (count_type_subparts (compute_vtype)
+              > count_type_subparts (compute_type))
+            {
+              compute_type = compute_vtype;
+              op = opv;
+            }
+        }
+
+      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
+        {
+          if (compute_type == NULL_TREE)
+            compute_type = get_compute_type (code, op, type);
+          if (compute_type == type)
             return;
+          /* Before splitting vector rotates into scalar rotates,
+             see if we can't use vector shifts and BIT_IOR_EXPR
+             instead.  For vector by vector rotates we'd also
+             need to check BIT_AND_EXPR and NEGATE_EXPR, punt there
+             for now, fold doesn't seem to create such rotates anyway.  */
+          if (compute_type == TREE_TYPE (type)
+              && !VECTOR_INTEGER_TYPE_P (TREE_TYPE (rhs2)))
+            {
+              optab oplv = vashl_optab, opl = ashl_optab;
+              optab oprv = vlshr_optab, opr = lshr_optab, opo = ior_optab;
+              tree compute_lvtype = get_compute_type (LSHIFT_EXPR, oplv, type);
+              tree compute_rvtype = get_compute_type (RSHIFT_EXPR, oprv, type);
+              tree compute_otype = get_compute_type (BIT_IOR_EXPR, opo, type);
+              tree compute_ltype = get_compute_type (LSHIFT_EXPR, opl, type);
+              tree compute_rtype = get_compute_type (RSHIFT_EXPR, opr, type);
+              /* The rtl expander will expand vector/scalar as vector/vector
+                 if necessary.  Pick one with wider vector type.  */
+              if (count_type_subparts (compute_lvtype)
+                  > count_type_subparts (compute_ltype))
+                {
+                  compute_ltype = compute_lvtype;
+                  opl = oplv;
+                }
+              if (count_type_subparts (compute_rvtype)
+                  > count_type_subparts (compute_rtype))
+                {
+                  compute_rtype = compute_rvtype;
+                  opr = oprv;
+                }
+              /* Pick the narrowest type from LSHIFT_EXPR, RSHIFT_EXPR and
+                 BIT_IOR_EXPR.  */
+              compute_type = compute_ltype;
+              if (count_type_subparts (compute_type)
+                  > count_type_subparts (compute_rtype))
+                compute_type = compute_rtype;
+              if (count_type_subparts (compute_type)
+                  > count_type_subparts (compute_otype))
+                compute_type = compute_otype;
+              /* Verify all 3 operations can be performed in that type.  */
+              if (compute_type != TREE_TYPE (type))
+                {
+                  if (optab_handler (opl, TYPE_MODE (compute_type))
+                      == CODE_FOR_nothing
+                      || optab_handler (opr, TYPE_MODE (compute_type))
+                         == CODE_FOR_nothing
+                      || optab_handler (opo, TYPE_MODE (compute_type))
+                         == CODE_FOR_nothing)
+                    compute_type = TREE_TYPE (type);
+                }
+            }
         }
     }
   else
@@ -1473,38 +1590,10 @@ expand_vector_operations_1 (gimple_stmt_iterator *gsi)
       && INTEGRAL_TYPE_P (TREE_TYPE (type)))
     op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
 
-  /* For very wide vectors, try using a smaller vector mode.  */
-  compute_type = type;
-  if (!VECTOR_MODE_P (TYPE_MODE (type)) && op)
-    {
-      tree vector_compute_type
-        = type_for_widest_vector_mode (TREE_TYPE (type), op);
-      if (vector_compute_type != NULL_TREE
-          && (TYPE_VECTOR_SUBPARTS (vector_compute_type)
-              < TYPE_VECTOR_SUBPARTS (compute_type))
-          && (optab_handler (op, TYPE_MODE (vector_compute_type))
-              != CODE_FOR_nothing))
-        compute_type = vector_compute_type;
-    }
-
-  /* If we are breaking a BLKmode vector into smaller pieces,
-     type_for_widest_vector_mode has already looked into the optab,
-     so skip these checks.  */
+  if (compute_type == NULL_TREE)
+    compute_type = get_compute_type (code, op, type);
   if (compute_type == type)
-    {
-      compute_mode = TYPE_MODE (compute_type);
-      if (VECTOR_MODE_P (compute_mode))
-        {
-          if (op && optab_handler (op, compute_mode) != CODE_FOR_nothing)
-            return;
-          if (code == MULT_HIGHPART_EXPR
-              && can_mult_highpart_p (compute_mode,
-                                      TYPE_UNSIGNED (compute_type)))
-            return;
-        }
-      /* There is no operation in hardware, so fall back to scalars.  */
-      compute_type = TREE_TYPE (type);
-    }
+    return;
 
   gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
   new_rhs = expand_vector_operation (gsi, type, compute_type, stmt, code);
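Putting the new veclower logic together: for a V4SI rotate on, say, plain SSE2, get_compute_type reports that the rotate itself would fall back to scalars, so the new LROTATE_EXPR/RROTATE_EXPR branch probes the shift and IOR optabs instead and keeps the operation in vector form. A hedged sketch of what the resulting lowering is equivalent to (ordinary C standing in for the GIMPLE the pass emits; names are illustrative):

```c
typedef unsigned V4 __attribute__((vector_size (4 * sizeof (int))));

V4
lowered_rotate_left_2 (V4 a)
{
  V4 hi = a << 2;    /* ashl/vashl optab: vector shift left      */
  V4 lo = a >> 30;   /* lshr/vlshr optab: vector logical shift   */
  return hi | lo;    /* ior optab: vector bitwise OR             */
}
```

All three operations must be supported in the chosen compute type (the optab_handler checks above); only if one of them is missing does the pass still fall back to TREE_TYPE (type), i.e. scalar code.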