Commit e9619a8a by Richard Henderson

arm: Implement vec_perm and vec_perm_const for NEON.

From-SVN: r182299
parent 8c75d41b
......@@ -244,4 +244,7 @@ extern const struct tune_params *current_tune;
extern int vfp3_const_double_for_fract_bits (rtx);
#endif /* RTX_CODE */
extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
extern bool arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
#endif /* ! GCC_ARM_PROTOS_H */
......@@ -267,6 +267,9 @@ static unsigned int arm_autovectorize_vector_sizes (void);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel);
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
......@@ -604,6 +607,10 @@ static const struct attribute_spec arm_attribute_table[] =
#define TARGET_PREFERRED_RENAME_CLASS \
arm_preferred_rename_class
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
arm_vectorize_vec_perm_const_ok
struct gcc_target targetm = TARGET_INITIALIZER;
/* Obstack for minipool constant handling. */
......@@ -25065,5 +25072,522 @@ vfp3_const_double_for_fract_bits (rtx operand)
return 0;
}
#include "gt-arm.h"
/* Maximum number of vector lanes we may need to permute
   (V16QImode has 16 byte-sized elements).  */
#define MAX_VECT_LEN 16

/* Working record describing one vector permutation to be expanded
   (or merely tested for expandability).  */
struct expand_vec_perm_d
{
  /* Destination and the two source vectors.  */
  rtx target, op0, op1;
  /* Selector: perm[i] is the source lane for result lane i,
     in the range 0 .. 2*nelt-1.  */
  unsigned char perm[MAX_VECT_LEN];
  /* Mode of TARGET/OP0/OP1.  */
  enum machine_mode vmode;
  /* Number of elements in VMODE.  */
  unsigned char nelt;
  /* True if OP0 and OP1 are the same vector.  */
  bool one_vector_p;
  /* True when we only test feasibility and must emit no insns.  */
  bool testing_p;
};
/* Generate a variable permutation: expand a VTBL-based permutation of
   OP0/OP1 into TARGET using SEL as the byte-index vector.  SEL is used
   as-is; callers are responsible for any modulo masking of indices
   (arm_expand_vec_perm masks, arm_evpc_neon_vtbl relies on the constant
   selector already being in range).  */

static void
arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  enum machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  /* Only byte vectors are handled; wider element modes are lowered to
     QImode by the generic vec_perm machinery before reaching here.  */
  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_NEON);

  if (one_vector_p)
    {
      /* Single-table lookup.  */
      if (vmode == V8QImode)
	emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
      else
	emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
    }
  else
    {
      enum machine_mode mode1, mode2;
      rtx pair, part;

      /* Two-table lookup: build a double-width pseudo (DI+DI -> TI for
	 V8QI, TI+TI -> OI for V16QI) holding OP0 in the low half and
	 OP1 in the high half, as the table operand.  */
      if (vmode == V8QImode)
	mode1 = DImode, mode2 = TImode;
      else
	mode1 = TImode, mode2 = OImode;

      pair = gen_reg_rtx (mode2);
      /* Clobber the pair first so the two partial writes below are not
	 seen as uses of an uninitialized register.  */
      emit_insn (gen_rtx_CLOBBER (VOIDmode, pair));

      part = simplify_gen_subreg (mode1, pair, mode2,
				  subreg_lowpart_offset (mode1, mode2));
      emit_move_insn (part, gen_lowpart (mode1, op0));
      part = simplify_gen_subreg (mode1, pair, mode2,
				  subreg_highpart_offset (mode1, mode2));
      emit_move_insn (part, gen_lowpart (mode1, op1));

      if (vmode == V8QImode)
	emit_insn (gen_neon_vtbl2v8qi (target, pair, sel));
      else
	emit_insn (gen_neon_vtbl2v16qi (target, pair, sel));
    }
}
/* Expand a variable vector permutation of OP0/OP1 into TARGET, with SEL
   holding the lane indices in a register.  */

void
arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  enum machine_mode mode = GET_MODE (target);
  unsigned int count = GET_MODE_NUNITS (mode);
  bool single_input = rtx_equal_p (op0, op1);
  rtx elts[MAX_VECT_LEN];
  rtx max_idx, mask_vec, masked_sel;
  unsigned int i;

  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
     numbering of elements for big-endian, we must reverse the order.  */
  gcc_checking_assert (!BYTES_BIG_ENDIAN);

  /* The VTBL instruction does not use a modulo index, so we must take
     care of that ourselves: AND every index against the largest valid
     lane number.  */
  max_idx = GEN_INT (single_input ? count - 1 : 2 * count - 1);
  for (i = 0; i < count; ++i)
    elts[i] = max_idx;
  mask_vec = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (count, elts));

  masked_sel = expand_simple_binop (mode, AND, sel, mask_vec,
				    NULL, 0, OPTAB_LIB_WIDEN);
  arm_expand_vec_perm_1 (target, op0, op1, masked_sel);
}
/* Generate or test for an insn that supports a constant permutation. */
/* Recognize patterns for the VUZP insns. */
static bool
arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
{
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1, x;
rtx (*gen)(rtx, rtx, rtx, rtx);
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
/* Note that these are little-endian tests. Adjust for big-endian later. */
if (d->perm[0] == 0)
odd = 0;
else if (d->perm[0] == 1)
odd = 1;
else
return false;
mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt; i++)
{
unsigned elt = (i * 2 + odd) & mask;
if (d->perm[i] != elt)
return false;
}
/* Success! */
if (d->testing_p)
return true;
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
case V4SFmode: gen = gen_neon_vuzpv4sf_internal; break;
default:
gcc_unreachable ();
}
in0 = d->op0;
in1 = d->op1;
if (BYTES_BIG_ENDIAN)
{
x = in0, in0 = in1, in1 = x;
odd = !odd;
}
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
if (odd)
x = out0, out0 = out1, out1 = x;
emit_insn (gen (out0, in0, in1, out1));
return true;
}
/* Recognize patterns for the VZIP insns, which interleave the low (or
   high) halves of the two input vectors.  */

static bool
arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
{
  unsigned int i, high, mask, nelt = d->nelt;
  rtx out0, out1, in0, in1, x;
  rtx (*gen)(rtx, rtx, rtx, rtx);

  /* No VZIP variant exists for 64-bit elements.  */
  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
    return false;

  /* Note that these are little-endian tests.  Adjust for big-endian later.
     HIGH is the first lane selected: nelt/2 when zipping the high halves,
     0 when zipping the low halves.  */
  high = nelt / 2;
  if (d->perm[0] == high)
    ;
  else if (d->perm[0] == 0)
    high = 0;
  else
    return false;
  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);

  /* A zip alternates lane i+HIGH of the first input with the matching
     lane (offset by nelt) of the second.  The AND with MASK folds
     second-vector indices when both inputs are the same register.  */
  for (i = 0; i < nelt / 2; i++)
    {
      unsigned elt = (i + high) & mask;
      if (d->perm[i * 2] != elt)
	return false;
      elt = (elt + nelt) & mask;
      if (d->perm[i * 2 + 1] != elt)
	return false;
    }

  /* Success!  */
  if (d->testing_p)
    return true;

  switch (d->vmode)
    {
    case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
    case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
    case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
    case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
    case V4SImode: gen = gen_neon_vzipv4si_internal; break;
    case V2SImode: gen = gen_neon_vzipv2si_internal; break;
    case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
    case V4SFmode: gen = gen_neon_vzipv4sf_internal; break;
    default:
      gcc_unreachable ();
    }

  in0 = d->op0;
  in1 = d->op1;
  if (BYTES_BIG_ENDIAN)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }

  /* VZIP writes both halves of the interleaving; keep the requested
     half in D->TARGET and discard the other.  */
  out0 = d->target;
  out1 = gen_reg_rtx (d->vmode);
  if (high)
    x = out0, out0 = out1, out1 = x;

  emit_insn (gen (out0, in0, in1, out1));
  return true;
}
/* Recognize patterns for the VREV insns, which reverse the elements
   within each group of (DIFF + 1) elements.  */

static bool
arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
{
  unsigned int i, j, diff, nelt = d->nelt;
  rtx (*gen)(rtx, rtx, rtx);

  /* VREV is a single-input operation.  */
  if (!d->one_vector_p)
    return false;

  /* In a group reversal, result lane 0 holds the last lane of the first
     group, so PERM[0] determines the group size (DIFF + 1 lanes).  The
     only group sizes with a VREV encoding are 8, 4 and 2 elements.  */
  diff = d->perm[0];
  switch (diff)
    {
    case 7:
      /* Groups of 8: only byte elements within 64-bit chunks.  */
      switch (d->vmode)
	{
	case V16QImode: gen = gen_neon_vrev64v16qi; break;
	case V8QImode: gen = gen_neon_vrev64v8qi; break;
	default:
	  return false;
	}
      break;
    case 3:
      /* Groups of 4: bytes within 32-bit chunks, halfwords within
	 64-bit chunks.  */
      switch (d->vmode)
	{
	case V16QImode: gen = gen_neon_vrev32v16qi; break;
	case V8QImode: gen = gen_neon_vrev32v8qi; break;
	case V8HImode: gen = gen_neon_vrev64v8hi; break;
	case V4HImode: gen = gen_neon_vrev64v4hi; break;
	default:
	  return false;
	}
      break;
    case 1:
      /* Adjacent pairs: bytes in 16-bit, halfwords in 32-bit, or
	 words in 64-bit chunks.  */
      switch (d->vmode)
	{
	case V16QImode: gen = gen_neon_vrev16v16qi; break;
	case V8QImode: gen = gen_neon_vrev16v8qi; break;
	case V8HImode: gen = gen_neon_vrev32v8hi; break;
	case V4HImode: gen = gen_neon_vrev32v4hi; break;
	case V4SImode: gen = gen_neon_vrev64v4si; break;
	case V2SImode: gen = gen_neon_vrev64v2si; break;
	case V4SFmode: gen = gen_neon_vrev64v4sf; break;
	case V2SFmode: gen = gen_neon_vrev64v2sf; break;
	default:
	  return false;
	}
      break;
    default:
      return false;
    }

  /* Verify each group of DIFF + 1 lanes is reversed in place.  The step
     must be the group size DIFF + 1, not DIFF: stepping by DIFF re-tests
     overlapping indices against impossible expected values (e.g. for
     DIFF == 1 it demands PERM[1] == 2 after having demanded PERM[1] == 0),
     so no VREV pattern could ever be matched and every group reversal
     fell through to the slower VTBL expansion.  */
  for (i = 0; i < nelt; i += diff + 1)
    for (j = 0; j <= diff; j += 1)
      if (d->perm[i + j] != i + diff - j)
	return false;

  /* Success! */
  if (d->testing_p)
    return true;

  /* ??? The third operand is an artifact of the builtin infrastructure
     and is ignored by the actual instruction.  */
  emit_insn (gen (d->target, d->op0, const0_rtx));
  return true;
}
/* Recognize patterns for the VTRN insns. */
static bool
arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
{
unsigned int i, odd, mask, nelt = d->nelt;
rtx out0, out1, in0, in1, x;
rtx (*gen)(rtx, rtx, rtx, rtx);
if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
return false;
/* Note that these are little-endian tests. Adjust for big-endian later. */
if (d->perm[0] == 0)
odd = 0;
else if (d->perm[0] == 1)
odd = 1;
else
return false;
mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
for (i = 0; i < nelt; i += 2)
{
if (d->perm[i] != i + odd)
return false;
if (d->perm[i + 1] != ((i + nelt + odd) & mask))
return false;
}
/* Success! */
if (d->testing_p)
return true;
switch (d->vmode)
{
case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
case V4SFmode: gen = gen_neon_vtrnv4sf_internal; break;
default:
gcc_unreachable ();
}
in0 = d->op0;
in1 = d->op1;
if (BYTES_BIG_ENDIAN)
{
x = in0, in0 = in1, in1 = x;
odd = !odd;
}
out0 = d->target;
out1 = gen_reg_rtx (d->vmode);
if (odd)
x = out0, out0 = out1, out1 = x;
emit_insn (gen (out0, in0, in1, out1));
return true;
}
/* The NEON VTBL instruction is a fully variable permuation that's even
stronger than what we expose via VEC_PERM_EXPR. What it doesn't do
is mask the index operand as VEC_PERM_EXPR requires. Therefore we
can do slightly better by expanding this as a constant where we don't
have to apply a mask. */
static bool
arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
{
rtx rperm[MAX_VECT_LEN], sel;
enum machine_mode vmode = d->vmode;
unsigned int i, nelt = d->nelt;
/* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
numbering of elements for big-endian, we must reverse the order. */
if (BYTES_BIG_ENDIAN)
return false;
if (d->testing_p)
return true;
/* Generic code will try constant permutation twice. Once with the
original mode and again with the elements lowered to QImode.
So wait and don't do the selector expansion ourselves. */
if (vmode != V8QImode && vmode != V16QImode)
return false;
for (i = 0; i < nelt; ++i)
rperm[i] = GEN_INT (d->perm[i]);
sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
sel = force_reg (vmode, sel);
arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
return true;
}
/* Try to expand (or, when D->TESTING_P, merely validate) the constant
   permutation described by D.  Return true on success.  */

static bool
arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  if (d->perm[0] >= d->nelt)
    {
      unsigned int count = d->nelt;
      unsigned int j;
      rtx tmp;

      for (j = 0; j < count; ++j)
	d->perm[j] = (d->perm[j] + count) & (2 * count - 1);

      tmp = d->op0;
      d->op0 = d->op1;
      d->op1 = tmp;
    }

  if (!TARGET_NEON)
    return false;

  /* Prefer the fixed shuffles; fall back to the fully general VTBL.  */
  return (arm_evpc_neon_vuzp (d)
	  || arm_evpc_neon_vzip (d)
	  || arm_evpc_neon_vrev (d)
	  || arm_evpc_neon_vtrn (d)
	  || arm_evpc_neon_vtbl (d));
}
/* Expand a vec_perm_const pattern: permute OP0/OP1 into TARGET according
   to SEL, a CONST_VECTOR of lane indices.  Return true (having emitted
   the insns) on success, false if no NEON sequence implements it.  */

bool
arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
{
  struct expand_vec_perm_d d;
  int i, nelt, which;

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = GET_MODE (target);
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = false;

  /* Copy the selector, reducing each index modulo 2*nelt, and record in
     WHICH which inputs are referenced: bit 0 for OP0, bit 1 for OP1.  */
  for (i = which = 0; i < nelt; ++i)
    {
      rtx e = XVECEXP (sel, 0, i);
      int ei = INTVAL (e) & (2 * nelt - 1);
      which |= (ei < nelt ? 1 : 2);
      d.perm[i] = ei;
    }

  switch (which)
    {
    default:
      gcc_unreachable();

    case 3:
      /* Both inputs referenced.  */
      d.one_vector_p = false;
      if (!rtx_equal_p (op0, op1))
	break;

      /* The elements of PERM do not suggest that only the first operand
	 is used, but both operands are identical.  Allow easier matching
	 of the permutation by folding the permutation into the single
	 input vector.  */
      /* FALLTHRU */
    case 2:
      /* Only the second input referenced: fold indices into the first
	 range and treat as a one-vector permutation of OP1.  */
      for (i = 0; i < nelt; ++i)
	d.perm[i] &= nelt - 1;
      d.op0 = op1;
      d.one_vector_p = true;
      break;

    case 1:
      /* Only the first input referenced.  */
      d.op1 = op0;
      d.one_vector_p = true;
      break;
    }

  return arm_expand_vec_perm_const_1 (&d);
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  Return true if the
   permutation of VMODE described by the index array SEL (NELT entries,
   each < 2*nelt) can be expanded, without emitting any insns.  */

static bool
arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
				 const unsigned char *sel)
{
  struct expand_vec_perm_d d;
  unsigned int i, nelt, which;
  bool ret;

  d.vmode = vmode;
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* Recognizers return before emitting when testing_p is set.  */
  d.testing_p = true;
  memcpy (d.perm, sel, nelt);

  /* Categorize the set of elements in the selector: bit 0 of WHICH is
     set if any index selects the first vector, bit 1 for the second.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = d.perm[i];
      gcc_assert (e < 2 * nelt);
      which |= (e < nelt ? 1 : 2);
    }

  /* For all elements from second vector, fold the elements to first.  */
  if (which == 2)
    for (i = 0; i < nelt; ++i)
      d.perm[i] -= nelt;

  /* Check whether the mask can be applied to the vector type.  */
  d.one_vector_p = (which != 3);

  /* Placeholder operands: raw registers with the right modes, distinct
     (or shared) so that rtx_equal_p tests in the recognizers see the
     intended one-vector/two-vector shape.  */
  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
  if (!d.one_vector_p)
    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

  /* Collect any insns into a throwaway sequence, so nothing leaks into
     the insn stream even if something is emitted despite testing_p.  */
  start_sequence ();
  ret = arm_expand_vec_perm_const_1 (&d);
  end_sequence ();

  return ret;
}
#include "gt-arm.h"
......@@ -3876,6 +3876,65 @@
[(set_attr "neon_type" "neon_bp_3cycle")]
)
;; These two are used by the vec_perm infrastructure for V16QImode.
;; A 16-entry single-table lookup has no single-insn form for a Q-reg
;; result; after reload, split into two VTBL2s, one per 8-byte half of
;; the destination, both indexing the full 16-byte table.
(define_insn_and_split "neon_vtbl1v16qi"
  [(set (match_operand:V16QI 0 "s_register_operand" "=w")
	(unspec:V16QI [(match_operand:V16QI 1 "s_register_operand" "w")
		       (match_operand:V16QI 2 "s_register_operand" "w")]
		      UNSPEC_VTBL))]
  "TARGET_NEON"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rtx op0, op1, op2, part0, part2;
  unsigned ofs;

  op0 = operands[0];
  /* View the 16-byte table as a TImode register pair for VTBL2.  */
  op1 = gen_lowpart (TImode, operands[1]);
  op2 = operands[2];

  /* Low half of destination and selector.  */
  ofs = subreg_lowpart_offset (V8QImode, V16QImode);
  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));

  /* High half of destination and selector.  */
  ofs = subreg_highpart_offset (V8QImode, V16QImode);
  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
  DONE;
})
;; Two-table lookup for a V16QI result: operand 1 is an OImode register
;; group holding the table pair.  After reload, split into one lookup
;; per 8-byte half of the destination, both sharing the table operand.
;; NOTE(review): each half is emitted via gen_neon_vtbl2v8qi with the
;; OImode group passed straight through as the table operand -- confirm
;; that pattern's table operand accepts this mode.
(define_insn_and_split "neon_vtbl2v16qi"
  [(set (match_operand:V16QI 0 "s_register_operand" "=w")
	(unspec:V16QI [(match_operand:OI 1 "s_register_operand" "w")
		       (match_operand:V16QI 2 "s_register_operand" "w")]
		      UNSPEC_VTBL))]
  "TARGET_NEON"
  "#"
  "&& reload_completed"
  [(const_int 0)]
{
  rtx op0, op1, op2, part0, part2;
  unsigned ofs;

  op0 = operands[0];
  op1 = operands[1];
  op2 = operands[2];

  /* Low half of destination and selector.  */
  ofs = subreg_lowpart_offset (V8QImode, V16QImode);
  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));

  /* High half of destination and selector.  */
  ofs = subreg_highpart_offset (V8QImode, V16QImode);
  part0 = simplify_subreg (V8QImode, op0, V16QImode, ofs);
  part2 = simplify_subreg (V8QImode, op2, V16QImode, ofs);
  emit_insn (gen_neon_vtbl2v8qi (part0, op1, part2));
  DONE;
})
(define_insn "neon_vtbx1v8qi"
[(set (match_operand:V8QI 0 "s_register_operand" "=w")
(unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
......
......@@ -108,3 +108,29 @@
|| (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
{
})
;; Constant permutation: operand 3 is a CONST_VECTOR of lane indices.
;; FAIL when arm_expand_vec_perm_const finds no suitable sequence, so
;; the middle end can fall back to other expansion strategies.
(define_expand "vec_perm_const<mode>"
  [(match_operand:VALL 0 "s_register_operand" "")
   (match_operand:VALL 1 "s_register_operand" "")
   (match_operand:VALL 2 "s_register_operand" "")
   (match_operand:<V_cmp_result> 3 "" "")]
  "TARGET_NEON
   || (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (<MODE>mode))"
{
  if (arm_expand_vec_perm_const (operands[0], operands[1],
				 operands[2], operands[3]))
    DONE;
  else
    FAIL;
})
;; Variable permutation: operand 3 holds the lane indices in a register.
;; Restricted to little-endian because VTBL indexing is little-endian
;; (see arm_expand_vec_perm).  VE presumably covers the modes the VTBL
;; expansion handles -- confirm against the iterator definition.
(define_expand "vec_perm<mode>"
  [(match_operand:VE 0 "s_register_operand" "")
   (match_operand:VE 1 "s_register_operand" "")
   (match_operand:VE 2 "s_register_operand" "")
   (match_operand:VE 3 "s_register_operand" "")]
  "TARGET_NEON && !BYTES_BIG_ENDIAN"
{
  arm_expand_vec_perm (operands[0], operands[1], operands[2], operands[3]);
  DONE;
})
......@@ -2725,7 +2725,8 @@ proc check_effective_target_vect_perm { } {
verbose "check_effective_target_vect_perm: using cached result" 2
} else {
set et_vect_perm_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*]
|| [istarget i?86-*-*]
|| [istarget x86_64-*-*] } {
......@@ -2748,7 +2749,8 @@ proc check_effective_target_vect_perm_byte { } {
verbose "check_effective_target_vect_perm_byte: using cached result" 2
} else {
set et_vect_perm_byte_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*] } {
set et_vect_perm_byte_saved 1
}
......@@ -2769,7 +2771,8 @@ proc check_effective_target_vect_perm_short { } {
verbose "check_effective_target_vect_perm_short: using cached result" 2
} else {
set et_vect_perm_short_saved 0
if { [istarget powerpc*-*-*]
if { [is-effective-target arm_neon_ok]
|| [istarget powerpc*-*-*]
|| [istarget spu-*-*] } {
set et_vect_perm_short_saved 1
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment