Commit 39235689 by Kyrylo Tkachov

neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64.

[gcc/]
2013-12-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* config/arm/neon.ml (crypto_intrinsics): Add vceq_p64 and vtst_p64.
	* config/arm/arm_neon.h: Regenerate.
	* config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
	* doc/arm-neon-intrinsics.texi: Regenerate.


[gcc/testsuite/]
2013-12-20  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

	* gcc.target/arm/neon-vceq_p64.c: New test.
	* gcc.target/arm/neon-vtst_p64.c: Likewise.

From-SVN: r206151
@@ -13278,6 +13278,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
#endif
}
/* The vceq_p64 intrinsic does not map to a single instruction.
   Instead we emulate it by performing a 32-bit variant of the vceq
   and applying a pairwise min reduction to the result.
   vceq_u32 will produce two 32-bit halves, each of which will contain either
   all ones or all zeros depending on whether the corresponding 32-bit
   halves of the poly64_t were equal.  The whole poly64_t values are equal
   if and only if both halves are equal, i.e. vceq_u32 returns all ones.
   If the result is all zeroes for any half then the whole result is zeroes.
   This is what the pairwise min reduction achieves.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vceq_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmin_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}
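As a quick illustration of the min-reduction trick described above (an editorial sketch, not part of the patch; the function name vceq_p64_example and the constants are invented), equal inputs yield all ones while inputs differing in either 32-bit half yield zero. As in the tests below, assigning the uint64x1_t result to a uint64_t relies on the AArch32 definition of uint64x1_t:

#include <arm_neon.h>

int
vceq_p64_example (void)
{
  poly64x1_t x = vreinterpret_p64_u64 (0xdeadbeefcafef00dULL);
  poly64x1_t y = vreinterpret_p64_u64 (0xdeadbeefcafef00dULL);
  poly64x1_t z = vreinterpret_p64_u64 (0xdeadbeef00000000ULL);
  uint64_t eq = vceq_p64 (x, y);  /* both halves equal: all ones */
  uint64_t ne = vceq_p64 (x, z);  /* low halves differ: all zeros */
  return eq == ~0x0ULL && ne == 0x0ULL;  /* returns 1 */
}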
/* The vtst_p64 intrinsic does not map to a single instruction.
   We emulate it in a way similar to vceq_p64 above, but here we do
   a reduction with max, since if any pair of corresponding bits is set
   in both poly64_t values then the whole result must be all ones.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vtst_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vtst_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmax_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}
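Similarly, a hypothetical snippet (editorial, not part of the patch; vtst_p64_example is an invented name) contrasting vtst_p64 with vceq_p64 on inputs that share set bits in only one 32-bit half; the max reduction propagates that half's all-ones lane to the whole 64-bit result:

#include <arm_neon.h>

int
vtst_p64_example (void)
{
  poly64x1_t x = vreinterpret_p64_u64 (0x1ULL);  /* bit 0 set */
  poly64x1_t y = vreinterpret_p64_u64 (0x3ULL);  /* bits 0 and 1 set */
  uint64_t t = vtst_p64 (x, y);  /* all ones: a common set bit exists */
  uint64_t e = vceq_p64 (x, y);  /* all zeros: the values differ */
  return t == ~0x0ULL && e == 0x0ULL;  /* returns 1 */
}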
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
@@ -340,6 +340,14 @@ let crypto_doc =
@end itemize
@itemize @bullet
@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
@end itemize
@itemize @bullet
@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
@end itemize
@itemize @bullet
@item uint32_t vsha1h_u32 (uint32_t)
@*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
@end itemize
@@ -2208,6 +2208,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val)
#endif
}
/* The vceq_p64 intrinsic does not map to a single instruction.
   Instead we emulate it by performing a 32-bit variant of the vceq
   and applying a pairwise min reduction to the result.
   vceq_u32 will produce two 32-bit halves, each of which will contain either
   all ones or all zeros depending on whether the corresponding 32-bit
   halves of the poly64_t were equal.  The whole poly64_t values are equal
   if and only if both halves are equal, i.e. vceq_u32 returns all ones.
   If the result is all zeroes for any half then the whole result is zeroes.
   This is what the pairwise min reduction achieves.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vceq_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vceq_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmin_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}
/* The vtst_p64 intrinsic does not map to a single instruction.
   We emulate it in a way similar to vceq_p64 above, but here we do
   a reduction with max, since if any pair of corresponding bits is set
   in both poly64_t values then the whole result must be all ones.  */
__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
vtst_p64 (poly64x1_t __a, poly64x1_t __b)
{
  uint32x2_t __t_a = vreinterpret_u32_p64 (__a);
  uint32x2_t __t_b = vreinterpret_u32_p64 (__b);
  uint32x2_t __c = vtst_u32 (__t_a, __t_b);
  uint32x2_t __m = vpmax_u32 (__c, __c);
  return vreinterpret_u64_u32 (__m);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
{
@@ -11939,6 +11939,14 @@
@end itemize
@itemize @bullet
@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
@end itemize
@itemize @bullet
@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
@end itemize
@itemize @bullet
@item uint32_t vsha1h_u32 (uint32_t)
@*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
@end itemize
/* { dg-do run } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-require-effective-target arm_neon_hw } */
/* { dg-add-options arm_crypto } */

#include "arm_neon.h"
#include <stdio.h>

extern void abort (void);

int
main (void)
{
  uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
                      ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
  int i, j;

  for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
    {
      for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
        {
          uint64_t a1 = args[i];
          uint64_t a2 = args[j];
          uint64_t res = vceq_p64 (vreinterpret_p64_u64 (a1),
                                   vreinterpret_p64_u64 (a2));
          uint64_t exp = (a1 == a2) ? ~0x0 : 0x0;

          if (res != exp)
            {
              /* uint64_t is unsigned long long on AArch32, so use %llx.  */
              fprintf (stderr, "vceq_p64 (a1= %llx, a2= %llx)"
                       " returned %llx, expected %llx\n",
                       (unsigned long long) a1, (unsigned long long) a2,
                       (unsigned long long) res, (unsigned long long) exp);
              abort ();
            }
        }
    }

  return 0;
}

/* { dg-do run } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-require-effective-target arm_neon_hw } */
/* { dg-add-options arm_crypto } */

#include "arm_neon.h"
#include <stdio.h>

extern void abort (void);

int
main (void)
{
  uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
                      ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
  int i, j;

  for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
    {
      for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
        {
          uint64_t a1 = args[i];
          uint64_t a2 = args[j];
          uint64_t res = vtst_p64 (vreinterpret_p64_u64 (a1),
                                   vreinterpret_p64_u64 (a2));
          uint64_t exp = (a1 & a2) ? ~0x0 : 0x0;

          if (res != exp)
            {
              /* uint64_t is unsigned long long on AArch32, so use %llx.  */
              fprintf (stderr, "vtst_p64 (a1= %llx, a2= %llx)"
                       " returned %llx, expected %llx\n",
                       (unsigned long long) a1, (unsigned long long) a2,
                       (unsigned long long) res, (unsigned long long) exp);
              abort ();
            }
        }
    }

  return 0;
}