Commit 9371aecc by Christophe Lyon

[AArch64_be] Fix vtbl[34] and vtbx4

2015-10-12  Christophe Lyon  <christophe.lyon@linaro.org>

	* config/aarch64/aarch64-simd-builtins.def: Update builtins
	tables: add tbl3 and tbx4.
	* config/aarch64/aarch64-simd.md (aarch64_tbl3v8qi): New.
	(aarch64_tbx4v8qi): New.
	* config/aarch64/arm_neon.h (vtbl3_s8, vtbl3_u8, vtbl3_p8)
	(vtbl4_s8, vtbl4_u8, vtbl4_p8, vtbx4_s8, vtbx4_u8, vtbx4_p8):
	Rewrite using builtin functions.
	* config/aarch64/iterators.md (UNSPEC_TBX): New.

From-SVN: r228716
parent 4f59f9f2
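
For readers unfamiliar with these intrinsics, a brief illustrative sketch (not part of the commit): vtbl3/vtbl4 perform a byte-wise table lookup in a 24- or 32-byte table and return 0 for out-of-range indices, while vtbx4 instead leaves the destination byte unchanged in that case. Both behaviours must be identical on little- and big-endian, which is what this patch fixes. The function names below are made up for the example:

#include <arm_neon.h>

/* vtbl4_u8: indices 0-31 select a table byte; larger indices yield 0.  */
uint8x8_t
lookup_or_zero (uint8x8x4_t tab, uint8x8_t idx)
{
  return vtbl4_u8 (tab, idx);
}

/* vtbx4_u8: out-of-range indices keep the corresponding byte of 'r'.  */
uint8x8_t
lookup_or_keep (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
{
  return vtbx4_u8 (r, tab, idx);
}
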
config/aarch64/aarch64-simd-builtins.def
@@ -407,3 +407,8 @@
   VAR1 (BINOPP, crypto_pmull, 0, di)
   VAR1 (BINOPP, crypto_pmull, 0, v2di)
+
+  /* Implemented by aarch64_tbl3v8qi.  */
+  VAR1 (BINOP, tbl3, 0, v8qi)
+  /* Implemented by aarch64_tbx4v8qi.  */
+  VAR1 (TERNOP, tbx4, 0, v8qi)
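
Judging from the call sites added in arm_neon.h below, these VAR1 entries make the builtin machinery expose functions with effectively the following prototypes (a sketch inferred from the patch, not declarations found in it):

/* Inferred signatures for the new builtins (sketch).  */
int8x8_t __builtin_aarch64_tbl3v8qi (__builtin_aarch64_simd_oi __tab,
                                     int8x8_t __idx);
int8x8_t __builtin_aarch64_tbx4v8qi (int8x8_t __r,
                                     __builtin_aarch64_simd_oi __tab,
                                     int8x8_t __idx);
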
config/aarch64/aarch64-simd.md
@@ -4729,6 +4729,27 @@
   [(set_attr "type" "neon_tbl2_q")]
 )
+
+(define_insn "aarch64_tbl3v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+        (unspec:V8QI [(match_operand:OI 1 "register_operand" "w")
+                      (match_operand:V8QI 2 "register_operand" "w")]
+                     UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%S0.8b, {%S1.16b - %T1.16b}, %S2.8b"
+  [(set_attr "type" "neon_tbl3")]
+)
+
+(define_insn "aarch64_tbx4v8qi"
+  [(set (match_operand:V8QI 0 "register_operand" "=w")
+        (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0")
+                      (match_operand:OI 2 "register_operand" "w")
+                      (match_operand:V8QI 3 "register_operand" "w")]
+                     UNSPEC_TBX))]
+  "TARGET_SIMD"
+  "tbx\\t%S0.8b, {%S2.16b - %T2.16b}, %S3.8b"
+  [(set_attr "type" "neon_tbl4")]
+)
+
 (define_insn_and_split "aarch64_combinev16qi"
   [(set (match_operand:OI 0 "register_operand" "=w")
     (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
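
A gloss on the two patterns above (my note, not the patch's): OImode is GCC's mode for a pair of consecutive 128-bit vector registers, and the %S/%T operand modifiers print the first and second register of that pair. With the table halves kept in registers, a vtbl3 call should now compile to a single instruction instead of the old ld1-from-memory sequence whose element order was wrong on big-endian. A hedged example:

#include <arm_neon.h>

/* Expected to assemble to one instruction along the lines of
   "tbl v0.8b, {v30.16b - v31.16b}, v1.8b" (register numbers are
   illustrative).  */
int8x8_t
three_table_lookup (int8x8x3_t tab, int8x8_t idx)
{
  return vtbl3_s8 (tab, idx);
}
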
config/aarch64/arm_neon.h
@@ -11274,13 +11274,14 @@ vtbl3_s8 (int8x8x3_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
@@ -11289,13 +11290,14 @@ vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
@@ -11304,13 +11306,14 @@ vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
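
One detail in the vtbl3 rewrites above (my annotation): the intrinsic's table holds only 24 bytes, but tbl always reads two full 16-byte registers, so the upper half is zero-filled via vcreate_*; indices 24-31 hit that padding and still return 0, the same result the architecture defines for indices past the end of the table. A small hedged check:

#include <arm_neon.h>

/* Index 24 lands in the zero padding, index 200 is architecturally out
   of range; both lanes must come back as 0.  */
int
padding_matches_out_of_range (uint8x8x3_t tab)
{
  uint8x8_t a = vtbl3_u8 (tab, vdup_n_u8 (24));
  uint8x8_t b = vtbl3_u8 (tab, vdup_n_u8 (200));
  return vget_lane_u8 (a, 0) == 0 && vget_lane_u8 (b, 0) == 0;
}
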
@@ -11319,13 +11322,14 @@ vtbl4_s8 (int8x8x4_t tab, int8x8_t idx)
 {
   int8x8_t result;
   int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbl3v8qi (__o, idx);
   return result;
 }
@@ -11334,13 +11338,14 @@ vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx)
 {
   uint8x8_t result;
   uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
@@ -11349,13 +11354,14 @@ vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx)
 {
   poly8x8_t result;
   poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
   temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
   temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "=w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx);
   return result;
 }
@@ -11395,51 +11401,6 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx)
-{
-  int8x8_t result = r;
-  int8x16x2_t temp;
-  temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
-{
-  uint8x8_t result = r;
-  uint8x16x2_t temp;
-  temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx)
-{
-  poly8x8_t result = r;
-  poly8x16x2_t temp;
-  temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]);
-  temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]);
-  __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t"
-           "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t"
-           : "+w"(result)
-           : "Q"(temp), "w"(idx)
-           : "v16", "v17", "memory");
-  return result;
-}
-
 /* End of temporary inline asm.  */
 
 /* Start of optimal implementations in approved order.  */
@@ -23904,6 +23865,58 @@ vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
   return vbsl_p8 (__mask, __tbl, __r);
 }
 
+/* vtbx4  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx)
+{
+  int8x8_t result;
+  int8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx);
+  return result;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx)
+{
+  uint8x8_t result;
+  uint8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+                                                  (int8x8_t)__idx);
+  return result;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
+{
+  poly8x8_t result;
+  poly8x16x2_t temp;
+  __builtin_aarch64_simd_oi __o;
+  temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]);
+  temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o,
+                                           (int8x16_t) temp.val[1], 1);
+  result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+                                                  (int8x8_t)__idx);
+  return result;
+}
+
 /* vtrn */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
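
For comparison with the new tbx4 builtin (a hedged sketch, not code from the patch): the same keep-or-replace behaviour can be emulated with vtbl4 plus a bitwise select, which is essentially how the vtbx3 context above uses vbsl_p8:

#include <arm_neon.h>

/* Hypothetical emulation of vtbx4_u8: lanes with idx < 32 take the
   table result, the rest keep 'r'.  */
uint8x8_t
vtbx4_u8_emulated (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx)
{
  uint8x8_t mask = vclt_u8 (idx, vdup_n_u8 (32)); /* 0xff where in range */
  uint8x8_t tbl = vtbl4_u8 (tab, idx);            /* 0 where out of range */
  return vbsl_u8 (mask, tbl, r);
}

The dedicated tbx instruction does this in one step, which is why the patch adds a separate UNSPEC_TBX pattern with the fallback value tied to the output register (the "0" constraint on operand 1).
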
config/aarch64/iterators.md
@@ -273,6 +273,7 @@
     UNSPEC_USHLL       ; Used in aarch64-simd.md.
     UNSPEC_ADDP        ; Used in aarch64-simd.md.
     UNSPEC_TBL         ; Used in vector permute patterns.
+    UNSPEC_TBX         ; Used in vector permute patterns.
     UNSPEC_CONCAT      ; Used in vector permute patterns.
     UNSPEC_ZIP1        ; Used in vector permute patterns.
     UNSPEC_ZIP2        ; Used in vector permute patterns.