Commit 9a0317e7 by Paul A. Clarke, committed by Paul Clarke

[rs6000] Fix x86 SSSE3 compatibility implementations and testcases

This patch is the analog to r266868-r266870, but for SSSE3.
The SSSE3 tests had been inadvertently made to PASS without actually running
the test code. Actually running the code turned up some previously undetected
issues.

This patch fixes some issues in the implementations, fixes up the tests
to use a union for the test data, which avoids strict aliasing issues,
and enables the tests to actually run (by removing a dependency on
__BUILTIN_CPU_SUPPORTS).
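
To illustrate the aliasing fix, here is a condensed sketch (not the full patch;
the real union and test changes are in the diffs below).  The inputs move from
a flat int array accessed through casts to __m64 *, __m128i *, short *, and
char *, to a union that exposes each 128-bit test vector through a properly
typed member:

  #include <tmmintrin.h>

  union data {
    int w[4];                  /* four 32-bit words */
    signed char b[16];         /* sixteen signed bytes */
    unsigned char ub[16];      /* sixteen unsigned bytes */
    short h[8];                /* eight halfwords */
    unsigned long long ll[2];  /* two 64-bit halves, used for the __m64 forms */
    __m128i m[1];              /* the whole 128-bit vector */
  };

  /* Array size chosen only for this sketch.  */
  static union data vals[2] __attribute__ ((aligned (16)));
  static union data r __attribute__ ((aligned (16)));

  static void
  example (void)
  {
    /* Old style: __m128i t1 = *(__m128i *) &flat_int_array[i];
       New style: read and write through the union members directly.  */
    r.m[0] = _mm_abs_epi8 (vals[0].m[0]);
  }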

Also, there is a fairly insignificant change in how the testcases walk
through the data as pairs of vectors, from:
  [0] and [1]
  [2] and [3]
  ...
  [n-4] and [n-3]
  [n-2] and [n-1]

to:
  [0] and [1]
  [1] and [2]
  ...
  [n-3] and [n-2]
  [n-2] and [n-1]

Since the testcases compute the correct answers based on the input, no
other changes were necessary to effect the change.
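
In loop form, the shape of that change is roughly the following sketch
(ARRAY_SIZE is assumed to be the usual sizeof-based helper used by these
tests; vals is the union array sketched above):

  #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))

  static void
  walk_pairs (void)
  {
    int i;
    /* Before: for (i = 0; i < 256; i += 8) paired the flat ints at
       vals[i..i+3] with vals[i+4..i+7], i.e. vector pairs [0]&[1],
       [2]&[3], ...  */
    for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
      {
        /* After: each iteration pairs the adjacent vectors vals[i] and
           vals[i + 1]; the expected result is recomputed from the same
           pair, so nothing else has to change.  */
      }
  }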

2018-12-19  Paul A. Clarke  <pc@us.ibm.com>

[gcc]

	* config/rs6000/tmmintrin.h (_mm_hadds_epi16): Vector lanes swapped.
	(_mm_hsub_epi32): Likewise.
	(_mm_shuffle_epi8): Fix reversed interpretation of parameters.
	(_mm_shuffle_pi8): Likewise.
	(_mm_maddubs_pi16): Likewise.

[gcc/testsuite]

	* gcc.target/powerpc/ssse3-check.h: Enable tests to run.
	* gcc.target/powerpc/ssse3-pabsb.c: Code fixes for strict aliasing
	issues.
	* gcc.target/powerpc/ssse3-pabsd.c: Likewise.
	* gcc.target/powerpc/ssse3-palignr.c: Likewise.
	* gcc.target/powerpc/ssse3-phaddd.c: Likewise.
	* gcc.target/powerpc/ssse3-phaddsw.c: Likewise.
	* gcc.target/powerpc/ssse3-phaddw.c: Likewise.
	* gcc.target/powerpc/ssse3-phsubd.c: Likewise.
	* gcc.target/powerpc/ssse3-phsubw.c: Likewise.
	* gcc.target/powerpc/ssse3-pmulhrsw.c: Likewise.
	* gcc.target/powerpc/ssse3-pshufb.c: Likewise.
	* gcc.target/powerpc/ssse3-psignb.c: Likewise.
	* gcc.target/powerpc/ssse3-psignd.c: Likewise.
	* gcc.target/powerpc/ssse3-psignw.c: Likewise.
	* gcc.target/powerpc/ssse3-vals.h: Provide input data as a union.

From-SVN: r267271
gcc/config/rs6000/tmmintrin.h

@@ -228,7 +228,7 @@ _mm_hadds_epi16 (__m128i __A, __m128i __B)
   __v4si __C = { 0 }, __D = { 0 };
   __C = vec_sum4s ((__v8hi) __A, __C);
   __D = vec_sum4s ((__v8hi) __B, __D);
-  __C = (__v4si) vec_packs (__D, __C);
+  __C = (__v4si) vec_packs (__C, __D);
   return (__m128i) __C;
 }

@@ -264,8 +264,8 @@ _mm_hsub_epi32 (__m128i __A, __m128i __B)
     { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
   const __v16qu __Q =
     { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
-  __v4si __C = vec_perm ((__v4si) __B, (__v4si) __A, __P);
-  __v4si __D = vec_perm ((__v4si) __B, (__v4si) __A, __Q);
+  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
+  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
   return (__m128i) vec_sub (__C, __D);
 }

@@ -332,7 +332,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_epi8 (__m128i __A, __m128i __B)
 {
   const __v16qi __zero = { 0 };
-  __vector __bool char __select = vec_cmplt ((__v16qi) __A, __zero);
+  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
   __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
   return (__m128i) vec_sel (__C, __zero, __select);
 }

@@ -344,7 +344,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B)
   const __v16qi __zero = { 0 };
   __v16qi __C = (__v16qi) (__v2du) { __A, __A };
   __v16qi __D = (__v16qi) (__v2du) { __B, __B };
-  __vector __bool char __select = vec_cmplt ((__v16qi) __C, __zero);
+  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
   __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
   __C = vec_sel (__C, __zero, __select);
   return (__m64) ((__v2du) (__C))[0];

@@ -423,11 +423,11 @@ extern __inline __m128i
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_maddubs_epi16 (__m128i __A, __m128i __B)
 {
-  __v8hi __C = vec_unpackh ((__v16qi) __A);
-  __v8hi __D = vec_unpackl ((__v16qi) __A);
   __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
-  __v8hi __E = vec_and (vec_unpackh ((__v16qi) __B), __unsigned);
-  __v8hi __F = vec_and (vec_unpackl ((__v16qi) __B), __unsigned);
+  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
+  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
+  __v8hi __E = vec_unpackh ((__v16qi) __B);
+  __v8hi __F = vec_unpackl ((__v16qi) __B);
   __C = vec_mul (__C, __E);
   __D = vec_mul (__D, __F);
   const __v16qu __odds =

@@ -445,10 +445,10 @@ _mm_maddubs_pi16 (__m64 __A, __m64 __B)
   __v8hi __C = (__v8hi) (__v2du) { __A, __A };
   __C = vec_unpackl ((__v16qi) __C);
+  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
+  __C = vec_and (__C, __unsigned);
   __v8hi __D = (__v8hi) (__v2du) { __B, __B };
   __D = vec_unpackl ((__v16qi) __D);
-  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
-  __D = vec_and (__D, __unsigned);
   __D = vec_mul (__C, __D);
   const __v16qu __odds =
     { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
gcc/testsuite/gcc.target/powerpc/ssse3-check.h

@@ -19,24 +19,9 @@ do_test (void)
 int
 main ()
 {
-#ifdef __BUILTIN_CPU_SUPPORTS__
-  /* Most SSE intrinsic operations can be implemented via VMX
-     instructions, but some operations may be faster / simpler
-     using the POWER8 VSX instructions.  This is especially true
-     when we are transferring / converting to / from __m64 types.
-     The direct register transfer instructions from POWER8 are
-     especially important.  So we test for arch_2_07.  */
-  if (__builtin_cpu_supports ("arch_2_07"))
-    {
-      do_test ();
+  do_test ();
 #ifdef DEBUG
-      printf ("PASSED\n");
+  printf ("PASSED\n");
 #endif
-    }
-#ifdef DEBUG
-  else
-    printf ("SKIPPED\n");
-#endif
-#endif /* __BUILTIN_CPU_SUPPORTS__ */
   return 0;
 }
gcc/testsuite/gcc.target/powerpc/ssse3-pabsb.c

@@ -18,61 +18,57 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pabsb (int *i1, int *r)
+ssse3_test_pabsb (__m64 *i1, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  *(__m64 *) r = _mm_abs_pi8 (t1);
+  *r = _mm_abs_pi8 (*i1);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pabsb128 (int *i1, int *r)
+ssse3_test_pabsb128 (__m128i *i1, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  *(__m128i *) r = _mm_abs_epi8 (t1);
+  *r = _mm_abs_epi8 (*i1);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *r)
+compute_correct_result (signed char *i1, signed char *r)
 {
-  char *b1 = (char *) i1;
-  char *bout = (char *) r;
   int i;
   for (i = 0; i < 16; i++)
-    if (b1[i] < 0)
-      bout[i] = -b1[i];
+    if (i1[i] < 0)
+      r[i] = -i1[i];
     else
-      bout[i] = b1[i];
+      r[i] = i1[i];
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 4)
+  for (i = 0; i < ARRAY_SIZE (vals); i++)
     {
       /* Manually compute the result */
-      compute_correct_result(&vals[i + 0], ck);
+      compute_correct_result(&vals[i].b[0], &ck.b[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_pabsb (&vals[i + 0], &r[0]);
-      ssse3_test_pabsb (&vals[i + 2], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pabsb (&vals[i].ll[0], &r.ll[0]);
+      ssse3_test_pabsb (&vals[i].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_pabsb128 (&vals[i + 0], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_pabsb128 (&vals[i].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-pabsd.c

@@ -19,21 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pabsd (int *i1, int *r)
+ssse3_test_pabsd (__m64 *i1, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  *(__m64 *) r = _mm_abs_pi32 (t1);
+  *r = _mm_abs_pi32 (*i1);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pabsd128 (int *i1, int *r)
+ssse3_test_pabsd128 (__m128i *i1, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  *(__m128i *) r = _mm_abs_epi32 (t1);
+  *r = _mm_abs_epi32 (*i1);
 }

 /* Routine to manually compute the results */

@@ -53,25 +51,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 4)
+  for (i = 0; i < ARRAY_SIZE (vals); i++)
     {
       /* Manually compute the result */
-      compute_correct_result(&vals[i + 0], ck);
+      compute_correct_result(&vals[i].w[0], &ck.w[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_pabsd (&vals[i + 0], &r[0]);
-      ssse3_test_pabsd (&vals[i + 2], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pabsd (&vals[i].ll[0], &r.ll[0]);
+      ssse3_test_pabsd (&vals[i].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_pabsd128 (&vals[i + 0], r);
-      fail += chk_128(ck, r);
+      ssse3_test_pabsd128 (&vals[i].m[0], &r.m[0]);
+      fail += chk_128(ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-pabsw.c

@@ -19,61 +19,57 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pabsw (int *i1, int *r)
+ssse3_test_pabsw (__m64 *i1, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  *(__m64 *) r = _mm_abs_pi16 (t1);
+  *r = _mm_abs_pi16 (*i1);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pabsw128 (int *i1, int *r)
+ssse3_test_pabsw128 (__m128i *i1, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  *(__m128i *) r = _mm_abs_epi16 (t1);
+  *r = _mm_abs_epi16 (*i1);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *r)
+compute_correct_result (short *i1, short *r)
 {
-  short *s1 = (short *) i1;
-  short *sout = (short *) r;
   int i;
   for (i = 0; i < 8; i++)
-    if (s1[i] < 0)
-      sout[i] = -s1[i];
+    if (i1[i] < 0)
+      r[i] = -i1[i];
     else
-      sout[i] = s1[i];
+      r[i] = i1[i];
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 4)
+  for (i = 0; i < ARRAY_SIZE (vals); i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], ck);
+      compute_correct_result (&vals[i].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_pabsw (&vals[i + 0], &r[0]);
-      ssse3_test_pabsw (&vals[i + 2], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pabsw (&vals[i].ll[0], &r.ll[0]);
+      ssse3_test_pabsw (&vals[i].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_pabsw128 (&vals[i + 0], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_pabsw128 (&vals[i].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-palignr.c

@@ -20,63 +20,60 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_palignr (int *i1, int *i2, unsigned int imm, int *r)
+ssse3_test_palignr (__m64 *i1, __m64 *i2, unsigned int imm, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
   switch (imm)
     {
     case 0:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 0);
+      *r = _mm_alignr_pi8 (*i1, *i2, 0);
       break;
     case 1:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 1);
+      *r = _mm_alignr_pi8 (*i1, *i2, 1);
       break;
     case 2:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 2);
+      *r = _mm_alignr_pi8 (*i1, *i2, 2);
       break;
     case 3:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 3);
+      *r = _mm_alignr_pi8 (*i1, *i2, 3);
      break;
    case 4:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 4);
+      *r = _mm_alignr_pi8 (*i1, *i2, 4);
       break;
     case 5:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 5);
+      *r = _mm_alignr_pi8 (*i1, *i2, 5);
       break;
     case 6:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 6);
+      *r = _mm_alignr_pi8 (*i1, *i2, 6);
       break;
     case 7:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 7);
+      *r = _mm_alignr_pi8 (*i1, *i2, 7);
       break;
     case 8:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 8);
+      *r = _mm_alignr_pi8 (*i1, *i2, 8);
       break;
     case 9:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 9);
+      *r = _mm_alignr_pi8 (*i1, *i2, 9);
       break;
     case 10:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 10);
+      *r = _mm_alignr_pi8 (*i1, *i2, 10);
       break;
     case 11:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 11);
+      *r = _mm_alignr_pi8 (*i1, *i2, 11);
       break;
     case 12:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 12);
+      *r = _mm_alignr_pi8 (*i1, *i2, 12);
       break;
     case 13:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 13);
+      *r = _mm_alignr_pi8 (*i1, *i2, 13);
       break;
     case 14:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 14);
+      *r = _mm_alignr_pi8 (*i1, *i2, 14);
       break;
     case 15:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 15);
+      *r = _mm_alignr_pi8 (*i1, *i2, 15);
       break;
     default:
-      *(__m64 *) r = _mm_alignr_pi8 (t1, t2, 16);
+      *r = _mm_alignr_pi8 (*i1, *i2, 16);
       break;
     }

@@ -86,122 +83,120 @@ ssse3_test_palignr (int *i1, int *i2, unsigned int imm, int *r)
 /* Test the 128-bit form */
 static void
-ssse3_test_palignr128 (int *i1, int *i2, unsigned int imm, int *r)
+ssse3_test_palignr128 (__m128i *i1, __m128i *i2, unsigned int imm, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
   switch (imm)
     {
     case 0:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 0);
+      *r = _mm_alignr_epi8 (*i1, *i2, 0);
       break;
     case 1:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 1);
+      *r = _mm_alignr_epi8 (*i1, *i2, 1);
       break;
     case 2:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 2);
+      *r = _mm_alignr_epi8 (*i1, *i2, 2);
       break;
     case 3:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 3);
+      *r = _mm_alignr_epi8 (*i1, *i2, 3);
       break;
     case 4:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 4);
+      *r = _mm_alignr_epi8 (*i1, *i2, 4);
       break;
     case 5:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 5);
+      *r = _mm_alignr_epi8 (*i1, *i2, 5);
       break;
     case 6:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 6);
+      *r = _mm_alignr_epi8 (*i1, *i2, 6);
       break;
     case 7:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 7);
+      *r = _mm_alignr_epi8 (*i1, *i2, 7);
       break;
     case 8:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 8);
+      *r = _mm_alignr_epi8 (*i1, *i2, 8);
       break;
     case 9:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 9);
+      *r = _mm_alignr_epi8 (*i1, *i2, 9);
       break;
     case 10:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 10);
+      *r = _mm_alignr_epi8 (*i1, *i2, 10);
       break;
     case 11:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 11);
+      *r = _mm_alignr_epi8 (*i1, *i2, 11);
       break;
     case 12:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 12);
+      *r = _mm_alignr_epi8 (*i1, *i2, 12);
       break;
     case 13:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 13);
+      *r = _mm_alignr_epi8 (*i1, *i2, 13);
       break;
     case 14:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 14);
+      *r = _mm_alignr_epi8 (*i1, *i2, 14);
       break;
     case 15:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 15);
+      *r = _mm_alignr_epi8 (*i1, *i2, 15);
       break;
     case 16:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 16);
+      *r = _mm_alignr_epi8 (*i1, *i2, 16);
       break;
     case 17:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 17);
+      *r = _mm_alignr_epi8 (*i1, *i2, 17);
       break;
     case 18:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 18);
+      *r = _mm_alignr_epi8 (*i1, *i2, 18);
       break;
     case 19:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 19);
+      *r = _mm_alignr_epi8 (*i1, *i2, 19);
       break;
     case 20:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 20);
+      *r = _mm_alignr_epi8 (*i1, *i2, 20);
       break;
     case 21:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 21);
+      *r = _mm_alignr_epi8 (*i1, *i2, 21);
       break;
     case 22:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 22);
+      *r = _mm_alignr_epi8 (*i1, *i2, 22);
       break;
     case 23:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 23);
+      *r = _mm_alignr_epi8 (*i1, *i2, 23);
       break;
     case 24:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 24);
+      *r = _mm_alignr_epi8 (*i1, *i2, 24);
       break;
     case 25:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 25);
+      *r = _mm_alignr_epi8 (*i1, *i2, 25);
       break;
     case 26:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 26);
+      *r = _mm_alignr_epi8 (*i1, *i2, 26);
       break;
     case 27:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 27);
+      *r = _mm_alignr_epi8 (*i1, *i2, 27);
       break;
     case 28:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 28);
+      *r = _mm_alignr_epi8 (*i1, *i2, 28);
       break;
     case 29:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 29);
+      *r = _mm_alignr_epi8 (*i1, *i2, 29);
       break;
     case 30:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 30);
+      *r = _mm_alignr_epi8 (*i1, *i2, 30);
       break;
     case 31:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 31);
+      *r = _mm_alignr_epi8 (*i1, *i2, 31);
       break;
     default:
-      *(__m128i *) r = _mm_alignr_epi8 (t1, t2, 32);
+      *r = _mm_alignr_epi8 (*i1, *i2, 32);
       break;
     }
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result_128 (int *i1, int *i2, unsigned int imm, int *r)
+compute_correct_result_128 (signed char *i1, signed char *i2, unsigned int imm,
+			    signed char *r)
 {
-  char buf [32];
-  char *bout = (char *) r;
+  signed char buf [32];
   int i;

   memcpy (&buf[0], i2, 16);

@@ -209,38 +204,38 @@ compute_correct_result_128 (int *i1, int *i2, unsigned int imm, int *r)
   for (i = 0; i < 16; i++)
     if (imm >= 32 || imm + i >= 32)
-      bout[i] = 0;
+      r[i] = 0;
     else
-      bout[i] = buf[imm + i];
+      r[i] = buf[imm + i];
 }

 #ifndef __AVX__
 static void
-compute_correct_result_64 (int *i1, int *i2, unsigned int imm, int *r)
+compute_correct_result_64 (signed char *i1, signed char *i2, unsigned int imm,
+			   signed char *r)
 {
-  char buf [16];
-  char *bout = (char *)r;
+  signed char buf [16];
   int i;

   /* Handle the first half */
-  memcpy (&buf[0], i2, 8);
-  memcpy (&buf[8], i1, 8);
+  memcpy (&buf[0], &i2[0], 8);
+  memcpy (&buf[8], &i1[0], 8);

   for (i = 0; i < 8; i++)
     if (imm >= 16 || imm + i >= 16)
-      bout[i] = 0;
+      r[i] = 0;
     else
-      bout[i] = buf[imm + i];
+      r[i] = buf[imm + i];

   /* Handle the second half */
-  memcpy (&buf[0], &i2[2], 8);
-  memcpy (&buf[8], &i1[2], 8);
+  memcpy (&buf[0], &i2[8], 8);
+  memcpy (&buf[8], &i1[8], 8);

   for (i = 0; i < 8; i++)
     if (imm >= 16 || imm + i >= 16)
-      bout[i + 8] = 0;
+      r[i + 8] = 0;
     else
-      bout[i + 8] = buf[imm + i];
+      r[i + 8] = buf[imm + i];
 }
 #endif

@@ -248,30 +243,35 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   unsigned int imm;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     for (imm = 0; imm < 100; imm++)
       {
 #ifndef __AVX__
	/* Manually compute the result */
-	compute_correct_result_64 (&vals[i + 0], &vals[i + 4], imm, ck);
+	compute_correct_result_64 (&vals[i + 0].b[0],
+				   &vals[i + 1].b[0], imm, &ck.b[0]);

	/* Run the 64-bit tests */
-	ssse3_test_palignr (&vals[i + 0], &vals[i + 4], imm, &r[0]);
-	ssse3_test_palignr (&vals[i + 2], &vals[i + 6], imm, &r[2]);
-	fail += chk_128 (ck, r);
+	ssse3_test_palignr (&vals[i + 0].ll[0],
+			    &vals[i + 1].ll[0], imm, &r.ll[0]);
+	ssse3_test_palignr (&vals[i + 0].ll[1],
+			    &vals[i + 1].ll[1], imm, &r.ll[1]);
+	fail += chk_128 (ck.m[0], r.m[0]);
 #endif

	/* Recompute the results for 128-bits */
-	compute_correct_result_128 (&vals[i + 0], &vals[i + 4], imm, ck);
+	compute_correct_result_128 (&vals[i + 0].b[0],
+				    &vals[i + 1].b[0], imm, &ck.b[0]);

	/* Run the 128-bit tests */
-	ssse3_test_palignr128 (&vals[i + 0], &vals[i + 4], imm, r);
-	fail += chk_128 (ck, r);
+	ssse3_test_palignr128 (&vals[i + 0].m[0],
+			       &vals[i + 1].m[0], imm, &r.m[0]);
+	fail += chk_128 (ck.m[0], r.m[0]);
       }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phaddd.c

@@ -19,23 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phaddd (int *i1, int *i2, int *r)
+ssse3_test_phaddd (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hadd_pi32 (t1, t2);
+  *r = _mm_hadd_pi32 (*i1, *i2);
   _mm_empty();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phaddd128 (int *i1, int *i2, int *r)
+ssse3_test_phaddd128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hadd_epi32 (t1, t2);
+  *r = _mm_hadd_epi32 (*i1, *i2);
 }

 /* Routine to manually compute the results */

@@ -54,26 +50,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].w[0], &vals[i + 1].w[0], &ck.w[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phaddd (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phaddd (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddd (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phaddd (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phaddd128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddd128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phaddsw.c

@@ -19,23 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phaddsw (int *i1, int *i2, int *r)
+ssse3_test_phaddsw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hadds_pi16 (t1, t2);
+  *r = _mm_hadds_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phaddsw128 (int *i1, int *i2, int *r)
+ssse3_test_phaddsw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hadds_epi16 (t1, t2);
+  *(__m128i *) r = _mm_hadds_epi16 (*i1, *i2);
 }

 static short

@@ -52,42 +48,39 @@ signed_saturate_to_word (int x)
 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (short *i1, short *i2, short *r)
 {
-  short *s1 = (short *) i1;
-  short *s2 = (short *) i2;
-  short *sout = (short *) r;
   int i;

   for (i = 0; i < 4; i++)
-    sout[i] = signed_saturate_to_word(s1[2 * i] + s1[2 * i + 1]);
+    r[i + 0] = signed_saturate_to_word(i1[2 * i] + i1[2 * i + 1]);

   for (i = 0; i < 4; i++)
-    sout[i + 4] = signed_saturate_to_word(s2[2 * i] + s2[2 * i + 1]);
+    r[i + 4] = signed_saturate_to_word(i2[2 * i] + i2[2 * i + 1]);
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phaddsw (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phaddsw (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddsw (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phaddsw (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phaddsw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddsw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phaddw.c

@@ -19,64 +19,57 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phaddw (int *i1, int *i2, int *r)
+ssse3_test_phaddw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hadd_pi16 (t1, t2);
+  *r = _mm_hadd_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phaddw128 (int *i1, int *i2, int *r)
+ssse3_test_phaddw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hadd_epi16 (t1, t2);
+  *r = _mm_hadd_epi16 (*i1, *i2);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result(int *i1, int *i2, int *r)
+compute_correct_result(short *i1, short *i2, short *r)
 {
-  short *s1 = (short *) i1;
-  short *s2 = (short *) i2;
-  short *sout = (short *) r;
   int i;

   for (i = 0; i < 4; i++)
-    sout[i] = s1[2 * i] + s1[2 * i + 1];
+    r[i] = i1[2 * i] + i1[2 * i + 1];

   for (i = 0; i < 4; i++)
-    sout[i + 4] = s2[2 * i] + s2[2 * i + 1];
+    r[i + 4] = i2[2 * i] + i2[2 * i + 1];
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phaddw (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phaddw (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddw (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phaddw (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phaddw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phaddw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phsubd.c

@@ -19,23 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phsubd (int *i1, int *i2, int *r)
+ssse3_test_phsubd (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hsub_pi32(t1, t2);
+  *r = _mm_hsub_pi32 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phsubd128 (int *i1, int *i2, int *r)
+ssse3_test_phsubd128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hsub_epi32 (t1, t2);
+  *(__m128i *) r = _mm_hsub_epi32 (*i1, *i2);
 }

 /* Routine to manually compute the results */

@@ -54,25 +50,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].w[0], &vals[i + 1].w[0], &ck.w[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phsubd (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phsubd (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubd (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phsubd (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phsubd128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubd128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phsubsw.c

@@ -19,25 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phsubsw (int *i1, int *i2, int *r)
+ssse3_test_phsubsw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hsubs_pi16 (t1, t2);
+  *(__m64 *) r = _mm_hsubs_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phsubsw128 (int *i1, int *i2, int *r)
+ssse3_test_phsubsw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hsubs_epi16 (t1, t2);
+  *r = _mm_hsubs_epi16 (*i1, *i2);
 }

 static short

@@ -54,43 +48,40 @@ signed_saturate_to_word (int x)
 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (short *i1, short *i2, short *r)
 {
-  short *s1 = (short *) i1;
-  short *s2 = (short *) i2;
-  short *sout = (short *) r;
   int i;

   for (i = 0; i < 4; i++)
-    sout[i] = signed_saturate_to_word (s1[2 * i] - s1[2 * i + 1]);
+    r[i] = signed_saturate_to_word (i1[2 * i] - i1[2 * i + 1]);

   for (i = 0; i < 4; i++)
-    sout[i + 4] = signed_saturate_to_word (s2[2 * i] - s2[2 * i + 1]);
+    r[i + 4] = signed_saturate_to_word (i2[2 * i] - i2[2 * i + 1]);
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phsubsw (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phsubsw (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubsw (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phsubsw (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phsubsw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubsw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-phsubw.c

@@ -18,64 +18,56 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_phsubw (int *i1, int *i2, int *r)
+ssse3_test_phsubw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_hsub_pi16 (t1, t2);
+  *(__m64 *) r = _mm_hsub_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_phsubw128 (int *i1, int *i2, int *r)
+ssse3_test_phsubw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_hsub_epi16 (t1, t2);
+  *(__m128i *) r = _mm_hsub_epi16 (*i1, *i2);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (short *i1, short *i2, short *r)
 {
-  short *s1 = (short *) i1;
-  short *s2 = (short *) i2;
-  short *sout = (short *) r;
   int i;

   for (i = 0; i < 4; i++)
-    sout[i] = s1[2 * i] - s1[2 * i + 1];
+    r[i] = i1[2 * i] - i1[2 * i + 1];

   for (i = 0; i < 4; i++)
-    sout[i + 4] = s2[2 * i] - s2[2 * i + 1];
+    r[i + 4] = i2[2 * i] - i2[2 * i + 1];
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_phsubw (&vals[i + 0], &vals[i + 2], &r[0]);
-      ssse3_test_phsubw (&vals[i + 4], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubw (&vals[i + 0].ll[0], &vals[i + 0].ll[1], &r.ll[0]);
+      ssse3_test_phsubw (&vals[i + 1].ll[0], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_phsubw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_phsubw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-pmaddubsw.c

@@ -19,23 +19,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pmaddubsw (int *i1, int *i2, int *r)
+ssse3_test_pmaddubsw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_maddubs_pi16 (t1, t2);
+  *(__m64 *) r = _mm_maddubs_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pmaddubsw128 (int *i1, int *i2, int *r)
+ssse3_test_pmaddubsw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_maddubs_epi16 (t1, t2);
+  *r = _mm_maddubs_epi16 (*i1, *i2);
 }

 static short

@@ -52,19 +48,16 @@ signed_saturate_to_word(int x)
 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (unsigned char *i1, signed char *i2, short *r)
 {
-  unsigned char *ub1 = (unsigned char *) i1;
-  char *sb2 = (char *) i2;
-  short *sout = (short *) r;
   int t0;
   int i;

   for (i = 0; i < 8; i++)
     {
-      t0 = ((int) ub1[2 * i] * (int) sb2[2 * i] +
-            (int) ub1[2 * i + 1] * (int) sb2[2 * i + 1]);
-      sout[i] = signed_saturate_to_word (t0);
+      t0 = ((int) i1[2 * i] * (int) i2[2 * i] +
+            (int) i1[2 * i + 1] * (int) i2[2 * i + 1]);
+      r[i] = signed_saturate_to_word (t0);
     }
 }

@@ -72,25 +65,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].ub[0], &vals[i + 1].b[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_pmaddubsw (&vals[i + 0], &vals[i + 4], &r[0]);
-      ssse3_test_pmaddubsw (&vals[i + 2], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pmaddubsw (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
+      ssse3_test_pmaddubsw (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_pmaddubsw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_pmaddubsw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-pmulhrsw.c

@@ -19,39 +19,32 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pmulhrsw (int *i1, int *i2, int *r)
+ssse3_test_pmulhrsw (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_mulhrs_pi16 (t1, t2);
+  *r = _mm_mulhrs_pi16 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pmulhrsw128 (int *i1, int *i2, int *r)
+ssse3_test_pmulhrsw128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_mulhrs_epi16 (t1, t2);
+  *r = _mm_mulhrs_epi16 (*i1, *i2);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (short *i1, short *i2, short *r)
 {
-  short *s1 = (short *) i1;
-  short *s2 = (short *) i2;
-  short *sout = (short *) r;
   int t0;
   int i;

   for (i = 0; i < 8; i++)
     {
-      t0 = (((int) s1[i] * (int) s2[i]) >> 14) + 1;
-      sout[i] = (short) (t0 >> 1);
+      t0 = (((int) i1[i] * (int) i2[i]) >> 14) + 1;
+      r[i] = (short) (t0 >> 1);
     }
 }

@@ -59,25 +52,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_pmulhrsw (&vals[i + 0], &vals[i + 4], &r[0]);
-      ssse3_test_pmulhrsw (&vals[i + 2], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pmulhrsw (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
+      ssse3_test_pmulhrsw (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_pmulhrsw128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_pmulhrsw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-pshufb.c

@@ -19,65 +19,55 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_pshufb (int *i1, int *i2, int *r)
+ssse3_test_pshufb (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *)r = _mm_shuffle_pi8 (t1, t2);
+  *r = _mm_shuffle_pi8 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_pshufb128 (int *i1, int *i2, int *r)
+ssse3_test_pshufb128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *)r = _mm_shuffle_epi8 (t1, t2);
+  *r = _mm_shuffle_epi8 (*i1, *i2);
 }

 #ifndef __AVX__
 /* Routine to manually compute the results */
 static void
-compute_correct_result_64 (int *i1, int *i2, int *r)
+compute_correct_result_64 (signed char *i1, signed char *i2, signed char *r)
 {
-  char *b1 = (char *) i1;
-  char *b2 = (char *) i2;
-  char *bout = (char *) r;
   int i;
   char select;

   for (i = 0; i < 16; i++)
     {
-      select = b2[i];
+      select = i2[i];
       if (select & 0x80)
-        bout[i] = 0;
+        r[i] = 0;
       else if (i < 8)
-        bout[i] = b1[select & 0x7];
+        r[i] = i1[select & 0x7];
       else
-        bout[i] = b1[8 + (select & 0x7)];
+        r[i] = i1[8 + (select & 0x7)];
     }
 }
 #endif

 static void
-compute_correct_result_128 (int *i1, int *i2, int *r)
+compute_correct_result_128 (signed char *i1, signed char *i2, signed char *r)
 {
-  char *b1 = (char *) i1;
-  char *b2 = (char *) i2;
-  char *bout = (char *) r;
   int i;
   char select;

   for (i = 0; i < 16; i++)
     {
-      select = b2[i];
+      select = i2[i];
       if (select & 0x80)
-        bout[i] = 0;
+        r[i] = 0;
       else
-        bout[i] = b1[select & 0xf];
+        r[i] = i1[select & 0xf];
     }
 }

@@ -85,28 +75,28 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
 #ifndef __AVX__
       /* Manually compute the result */
-      compute_correct_result_64 (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result_64 (&vals[i + 0].b[0], &vals[i + 1].b[0], &ck.b[0]);

       /* Run the 64-bit tests */
-      ssse3_test_pshufb (&vals[i + 0], &vals[i + 4], &r[0]);
-      ssse3_test_pshufb (&vals[i + 2], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_pshufb (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
+      ssse3_test_pshufb (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Recompute the result for 128-bits */
-      compute_correct_result_128 (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result_128 (&vals[i + 0].b[0], &vals[i + 1].b[0], &ck.b[0]);

       /* Run the 128-bit tests */
-      ssse3_test_pshufb128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_pshufb128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-psignb.c

@@ -18,66 +18,59 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_psignb (int *i1, int *i2, int *r)
+ssse3_test_psignb (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_sign_pi8 (t1, t2);
+  *r = _mm_sign_pi8 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_psignb128 (int *i1, int *i2, int *r)
+ssse3_test_psignb128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *) r = _mm_sign_epi8 (t1, t2);
+  *r = _mm_sign_epi8 (*i1, *i2);
 }

 /* Routine to manually compute the results */
 static void
-compute_correct_result (int *i1, int *i2, int *r)
+compute_correct_result (signed char *i1, signed char *i2, signed char *r)
 {
-  char *b1 = (char *) i1;
-  char *b2 = (char *) i2;
-  char *bout = (char *) r;
   int i;

   for (i = 0; i < 16; i++)
-    if (b2[i] < 0)
-      bout[i] = -b1[i];
-    else if (b2[i] == 0)
-      bout[i] = 0;
+    if (i2[i] < 0)
+      r[i] = -i1[i];
+    else if (i2[i] == 0)
+      r[i] = 0;
     else
-      bout[i] = b1[i];
+      r[i] = i1[i];
 }

 static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].b[0], &vals[i + 1].b[0], &ck.b[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_psignb (&vals[i + 0], &vals[i + 4], &r[0]);
-      ssse3_test_psignb (&vals[i + 2], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_psignb (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
+      ssse3_test_psignb (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_psignb128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_psignb128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
gcc/testsuite/gcc.target/powerpc/ssse3-psignd.c

@@ -18,23 +18,19 @@
 #ifndef __AVX__
 /* Test the 64-bit form */
 static void
-ssse3_test_psignd (int *i1, int *i2, int *r)
+ssse3_test_psignd (__m64 *i1, __m64 *i2, __m64 *r)
 {
-  __m64 t1 = *(__m64 *) i1;
-  __m64 t2 = *(__m64 *) i2;
-  *(__m64 *) r = _mm_sign_pi32 (t1, t2);
+  *r = _mm_sign_pi32 (*i1, *i2);
   _mm_empty ();
 }
 #endif

 /* Test the 128-bit form */
 static void
-ssse3_test_psignd128 (int *i1, int *i2, int *r)
+ssse3_test_psignd128 (__m128i *i1, __m128i *i2, __m128i *r)
 {
   /* Assumes incoming pointers are 16-byte aligned */
-  __m128i t1 = *(__m128i *) i1;
-  __m128i t2 = *(__m128i *) i2;
-  *(__m128i *)r = _mm_sign_epi32 (t1, t2);
+  *r = _mm_sign_epi32 (*i1, *i2);
 }

 /* Routine to manually compute the results */

@@ -56,25 +52,25 @@ static void
 TEST (void)
 {
   int i;
-  int r [4] __attribute__ ((aligned(16)));
-  int ck [4];
+  union data r __attribute__ ((aligned(16)));
+  union data ck;
   int fail = 0;

-  for (i = 0; i < 256; i += 8)
+  for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
     {
       /* Manually compute the result */
-      compute_correct_result (&vals[i + 0], &vals[i + 4], ck);
+      compute_correct_result (&vals[i + 0].w[0], &vals[i + 1].w[0], &ck.w[0]);

 #ifndef __AVX__
       /* Run the 64-bit tests */
-      ssse3_test_psignd (&vals[i + 0], &vals[i + 4], &r[0]);
-      ssse3_test_psignd (&vals[i + 2], &vals[i + 6], &r[2]);
-      fail += chk_128 (ck, r);
+      ssse3_test_psignd (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
+      ssse3_test_psignd (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
+      fail += chk_128 (ck.m[0], r.m[0]);
 #endif

       /* Run the 128-bit tests */
-      ssse3_test_psignd128 (&vals[i + 0], &vals[i + 4], r);
-      fail += chk_128 (ck, r);
+      ssse3_test_psignd128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
+      fail += chk_128 (ck.m[0], r.m[0]);
     }

   if (fail != 0)
...@@ -18,66 +18,59 @@ ...@@ -18,66 +18,59 @@
#ifndef __AVX__ #ifndef __AVX__
/* Test the 64-bit form */ /* Test the 64-bit form */
static void static void
ssse3_test_psignw (int *i1, int *i2, int *r) ssse3_test_psignw (__m64 *i1, __m64 *i2, __m64 *r)
{ {
__m64 t1 = *(__m64 *) i1; *r = _mm_sign_pi16 (*i1, *i2);
__m64 t2 = *(__m64 *) i2;
*(__m64 *) r = _mm_sign_pi16 (t1, t2);
_mm_empty (); _mm_empty ();
} }
#endif #endif
/* Test the 128-bit form */ /* Test the 128-bit form */
static void static void
ssse3_test_psignw128 (int *i1, int *i2, int *r) ssse3_test_psignw128 (__m128i *i1, __m128i *i2, __m128i *r)
{ {
/* Assumes incoming pointers are 16-byte aligned */ /* Assumes incoming pointers are 16-byte aligned */
__m128i t1 = *(__m128i *) i1; *r = _mm_sign_epi16 (*i1, *i2);
__m128i t2 = *(__m128i *) i2;
*(__m128i *) r = _mm_sign_epi16 (t1, t2);
} }
/* Routine to manually compute the results */ /* Routine to manually compute the results */
static void static void
compute_correct_result (int *i1, int *i2, int *r) compute_correct_result (short *i1, short *i2, short *r)
{ {
short *s1 = (short *) i1;
short *s2 = (short *) i2;
short *sout = (short *) r;
int i; int i;
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
if (s2[i] < 0) if (i2[i] < 0)
sout[i] = -s1[i]; r[i] = -i1[i];
else if (s2[i] == 0) else if (i2[i] == 0)
sout[i] = 0; r[i] = 0;
else else
sout[i] = s1[i]; r[i] = i1[i];
} }
static void static void
TEST (void) TEST (void)
{ {
int i; int i;
int r [4] __attribute__ ((aligned(16))); union data r __attribute__ ((aligned(16)));
int ck [4]; union data ck;
int fail = 0; int fail = 0;
for (i = 0; i < 256; i += 8) for (i = 0; i < ARRAY_SIZE (vals) - 1; i++)
{ {
/* Manually compute the result */ /* Manually compute the result */
compute_correct_result (&vals[i + 0], &vals[i + 4], ck); compute_correct_result (&vals[i + 0].h[0], &vals[i + 1].h[0], &ck.h[0]);
#ifndef __AVX__ #ifndef __AVX__
/* Run the 64-bit tests */ /* Run the 64-bit tests */
ssse3_test_psignw (&vals[i + 0], &vals[i + 4], &r[0]); ssse3_test_psignw (&vals[i + 0].ll[0], &vals[i + 1].ll[0], &r.ll[0]);
ssse3_test_psignw (&vals[i + 2], &vals[i + 6], &r[2]); ssse3_test_psignw (&vals[i + 0].ll[1], &vals[i + 1].ll[1], &r.ll[1]);
fail += chk_128 (ck, r); fail += chk_128 (ck.m[0], r.m[0]);
#endif #endif
/* Run the 128-bit tests */ /* Run the 128-bit tests */
ssse3_test_psignw128 (&vals[i + 0], &vals[i + 4], r); ssse3_test_psignw128 (&vals[i + 0].m[0], &vals[i + 1].m[0], &r.m[0]);
fail += chk_128 (ck, r); fail += chk_128 (ck.m[0], r.m[0]);
} }
if (fail != 0) if (fail != 0)
#include <tmmintrin.h>
/* Routine to check correctness of the results */ /* Routine to check correctness of the results */
static int static int
chk_128 (int *v1, int *v2) chk_128 (__m128i v1, __m128i v2)
{ {
int i; return (v1[0] != v2[0]) || (v1[1] != v2[1]);
int n_fails = 0;
for (i = 0; i < 4; i++)
if (v1[i] != v2[i])
n_fails += 1;
return n_fails;
} }
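The rewritten checker relies on GCC's generic vector subscripting: __m128i is a vector of two 64-bit elements, so v1[0] and v1[1] name the low and high halves, and two comparisons cover all 128 bits. A minimal sketch of that extension (hypothetical values, GNU C only):

#include <emmintrin.h>
#include <assert.h>

int
main (void)
{
  /* _mm_set_epi64x takes the high element first and the low element second.  */
  __m128i x = _mm_set_epi64x (0x1122334455667788LL, 0x0123456789abcdefLL);
  /* GCC permits subscripting vector types; element 0 is the low 64 bits.  */
  assert (x[0] == 0x0123456789abcdefLL);
  assert (x[1] == 0x1122334455667788LL);
  return 0;
}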
static int vals [256] __attribute__ ((aligned(16))) = static union data {
int w[4];
signed char b[16];
unsigned char ub[16];
short h[8];
unsigned long long ll[2];
__m128i m[1];
} vals[] __attribute__ ((aligned(16))) =
{ {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x5be800ee, 0x4f2d7b15, { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
0x409d9291, 0xdd95f27f, 0x423986e3, 0x21a4d2cd, 0xa7056d84, 0x4f4e5a3b, { { 0x5be800ee, 0x4f2d7b15, 0x409d9291, 0xdd95f27f } },
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, { { 0x423986e3, 0x21a4d2cd, 0xa7056d84, 0x4f4e5a3b } },
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
0x73ef0244, 0xcd836329, 0x847f634f, 0xa7e3abcf, 0xb4c14764, 0x1ef42c06, { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
0x504f29ac, 0x4ae7ca73, 0xaddde3c9, 0xf63ded2e, 0xa5d3553d, 0xa52ae05f, { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 } },
0x6fd3c83a, 0x7dc2b300, 0x76b05de7, 0xea8ebae5, 0x549568dd, 0x172f0358, { { 0x73ef0244, 0xcd836329, 0x847f634f, 0xa7e3abcf } },
0x917eadf0, 0x796fb0a7, 0xb39381af, 0xd0591d61, 0x731d2f17, 0xbc4b6f5d, { { 0xb4c14764, 0x1ef42c06, 0x504f29ac, 0x4ae7ca73 } },
0x8ec664c2, 0x3c199c19, 0x9c81db12, 0x6d85913b, 0x486107a9, 0xab6f4b26, { { 0xaddde3c9, 0xf63ded2e, 0xa5d3553d, 0xa52ae05f } },
0x5630d37c, 0x20836e85, 0x40d4e746, 0xdfbaba36, 0xbeacaa69, 0xb3c84083, { { 0x6fd3c83a, 0x7dc2b300, 0x76b05de7, 0xea8ebae5 } },
0x8a688eb4, 0x08cde481, 0x66e7a190, 0x74ee1639, 0xb3942a19, 0xe0c40471, { { 0x549568dd, 0x172f0358, 0x917eadf0, 0x796fb0a7 } },
0x9b789489, 0x9751207a, 0x543a1524, 0x41da7ad6, 0x614bb563, 0xf86f57b1, { { 0xb39381af, 0xd0591d61, 0x731d2f17, 0xbc4b6f5d } },
0x69e62199, 0x2150cb12, 0x9ed74062, 0x429471f4, 0xad28502b, 0xf2e2d4d5, { { 0x8ec664c2, 0x3c199c19, 0x9c81db12, 0x6d85913b } },
0x45b6ce09, 0xaaa5e649, 0xb46da484, 0x0a637515, 0xae7a3212, 0x5afc784c, { { 0x486107a9, 0xab6f4b26, 0x5630d37c, 0x20836e85 } },
0x776cfbbe, 0x9c542bb2, 0x64193aa8, 0x16e8a655, 0x4e3d2f92, 0xe05d7b72, { { 0x40d4e746, 0xdfbaba36, 0xbeacaa69, 0xb3c84083 } },
0x89854ebc, 0x8c318814, 0xb81e76e0, 0x3f2625f5, 0x61b44852, 0x5209d7ad, { { 0x8a688eb4, 0x08cde481, 0x66e7a190, 0x74ee1639 } },
0x842fe317, 0xd3cfcca1, 0x8d287cc7, 0x80f0c9a8, 0x4215f4e5, 0x563993d6, { { 0xb3942a19, 0xe0c40471, 0x9b789489, 0x9751207a } },
0x5d627433, 0xc4449e35, 0x5b4fe009, 0x3ef92286, 0xacbc8927, 0x549ab870, { { 0x543a1524, 0x41da7ad6, 0x614bb563, 0xf86f57b1 } },
0x9ac5b959, 0xed8f1c91, 0x7ecf02cd, 0x989c0e8b, 0xa31d6918, 0x1dc2bcc1, { { 0x69e62199, 0x2150cb12, 0x9ed74062, 0x429471f4 } },
0x99d3f3cc, 0x6857acc8, 0x45d7324a, 0xaebdf2e6, 0x7af2f2ae, 0x09716f73, { { 0xad28502b, 0xf2e2d4d5, 0x45b6ce09, 0xaaa5e649 } },
0x7816e694, 0xc65493c0, 0x9f7e87bc, 0xaa96cd40, 0xbfb5bfc6, 0x01a2cce7, { { 0xb46da484, 0x0a637515, 0xae7a3212, 0x5afc784c } },
0x5f1d8c46, 0x45303efb, 0xb24607c3, 0xef2009a7, 0xba873753, 0xbefb14bc, { { 0x776cfbbe, 0x9c542bb2, 0x64193aa8, 0x16e8a655 } },
0x74e53cd3, 0x70124708, 0x6eb4bdbd, 0xf3ba5e43, 0x4c94085f, 0x0c03e7e0, { { 0x4e3d2f92, 0xe05d7b72, 0x89854ebc, 0x8c318814 } },
0x9a084931, 0x62735424, 0xaeee77c5, 0xdb34f90f, 0x6860cbdd, 0xaf77cf9f, { { 0xb81e76e0, 0x3f2625f5, 0x61b44852, 0x5209d7ad } },
0x95b28158, 0x23bd70d7, 0x9fbc3d88, 0x742e659e, 0x53bcfb48, 0xb8a63f6c, { { 0x842fe317, 0xd3cfcca1, 0x8d287cc7, 0x80f0c9a8 } },
0x4dcf3373, 0x2b168627, 0x4fe20745, 0xd0af5e94, 0x22514e6a, 0xb8ef25c2, { { 0x4215f4e5, 0x563993d6, 0x5d627433, 0xc4449e35 } },
0x89ec781a, 0x13d9002b, 0x6d724500, 0x7fdbf63f, 0xb0e9ced5, 0xf919e0f3, { { 0x5b4fe009, 0x3ef92286, 0xacbc8927, 0x549ab870 } },
0x00fef203, 0x8905d47a, 0x434e7517, 0x4aef8e2c, 0x689f51e8, 0xe513b7c3, { { 0x9ac5b959, 0xed8f1c91, 0x7ecf02cd, 0x989c0e8b } },
0x72bbc5d2, 0x3a222f74, 0x05c3a0f9, 0xd5489d82, 0xb41fbe83, 0xec5d305f, { { 0xa31d6918, 0x1dc2bcc1, 0x99d3f3cc, 0x6857acc8 } },
0x5ea02b0b, 0xb176065b, 0xa8eb404e, 0x80349117, 0x210fd49e, 0x43898d0e, { { 0x45d7324a, 0xaebdf2e6, 0x7af2f2ae, 0x09716f73 } },
0x6c151b9c, 0x8742df18, 0x7b64de73, 0x1dbf52b2, 0x55c9cb19, 0xeb841f10, { { 0x7816e694, 0xc65493c0, 0x9f7e87bc, 0xaa96cd40 } },
0x10b8ae76, 0x0764ecb6, 0xb7479018, 0x2672cb3f, 0x7ac9ac90, 0x4be5332c, { { 0xbfb5bfc6, 0x01a2cce7, 0x5f1d8c46, 0x45303efb } },
0x8f1a0615, 0x4efb7a77, 0x16551a85, 0xdb2c3d66, 0x49179c07, 0x5dc4657e, { { 0xb24607c3, 0xef2009a7, 0xba873753, 0xbefb14bc } },
0x5e76907e, 0xd7486a9c, 0x445204a4, 0x65cdc426, 0x33f86ded, 0xcba95dda, { { 0x74e53cd3, 0x70124708, 0x6eb4bdbd, 0xf3ba5e43 } },
0x83351f16, 0xfedefad9, 0x639b620f, 0x86896a64, 0xba4099ba, 0x965f4a21, { { 0x4c94085f, 0x0c03e7e0, 0x9a084931, 0x62735424 } },
0x1247154f, 0x25604c42, 0x5862d692, 0xb1e9149e, 0x612516a5, 0x02c49bf8, { { 0xaeee77c5, 0xdb34f90f, 0x6860cbdd, 0xaf77cf9f } },
0x631212bf, 0x9f69f54e, 0x168b63b0, 0x310a25ba, 0xa42a59cd, 0x084f0af9, { { 0x95b28158, 0x23bd70d7, 0x9fbc3d88, 0x742e659e } },
0x44a06cec, 0x5c0cda40, 0xb932d721, 0x7c42bb0d, 0x213cd3f0, 0xedc7f5a4, { { 0x53bcfb48, 0xb8a63f6c, 0x4dcf3373, 0x2b168627 } },
0x7fb85859, 0x6b3da5ea, 0x61cd591e, 0xe8e9aa08, 0x4361fc34, 0x53d40d2a, { { 0x4fe20745, 0xd0af5e94, 0x22514e6a, 0xb8ef25c2 } },
0x0511ad1b, 0xf996b44c, 0xb5ead756, 0xc022138d, 0x6172adf1, 0xa4a0a3b4, { { 0x89ec781a, 0x13d9002b, 0x6d724500, 0x7fdbf63f } },
0x8c2977b8, 0xa8e482ed, 0x04fcdd6b, 0x3f7b85d4, 0x4fca1e46, 0xa392ddca, { { 0xb0e9ced5, 0xf919e0f3, 0x00fef203, 0x8905d47a } },
0x569fc791, 0x346a706c, 0x543bf3eb, 0x895b3cde, 0x2146bb80, 0x26b3c168, { { 0x434e7517, 0x4aef8e2c, 0x689f51e8, 0xe513b7c3 } },
0x929998db, 0x1ea472c9, 0x7207b36b, 0x6a8f10d4 { { 0x72bbc5d2, 0x3a222f74, 0x05c3a0f9, 0xd5489d82 } },
{ { 0xb41fbe83, 0xec5d305f, 0x5ea02b0b, 0xb176065b } },
{ { 0xa8eb404e, 0x80349117, 0x210fd49e, 0x43898d0e } },
{ { 0x6c151b9c, 0x8742df18, 0x7b64de73, 0x1dbf52b2 } },
{ { 0x55c9cb19, 0xeb841f10, 0x10b8ae76, 0x0764ecb6 } },
{ { 0xb7479018, 0x2672cb3f, 0x7ac9ac90, 0x4be5332c } },
{ { 0x8f1a0615, 0x4efb7a77, 0x16551a85, 0xdb2c3d66 } },
{ { 0x49179c07, 0x5dc4657e, 0x5e76907e, 0xd7486a9c } },
{ { 0x445204a4, 0x65cdc426, 0x33f86ded, 0xcba95dda } },
{ { 0x83351f16, 0xfedefad9, 0x639b620f, 0x86896a64 } },
{ { 0xba4099ba, 0x965f4a21, 0x1247154f, 0x25604c42 } },
{ { 0x5862d692, 0xb1e9149e, 0x612516a5, 0x02c49bf8 } },
{ { 0x631212bf, 0x9f69f54e, 0x168b63b0, 0x310a25ba } },
{ { 0xa42a59cd, 0x084f0af9, 0x44a06cec, 0x5c0cda40 } },
{ { 0xb932d721, 0x7c42bb0d, 0x213cd3f0, 0xedc7f5a4 } },
{ { 0x7fb85859, 0x6b3da5ea, 0x61cd591e, 0xe8e9aa08 } },
{ { 0x4361fc34, 0x53d40d2a, 0x0511ad1b, 0xf996b44c } },
{ { 0xb5ead756, 0xc022138d, 0x6172adf1, 0xa4a0a3b4 } },
{ { 0x8c2977b8, 0xa8e482ed, 0x04fcdd6b, 0x3f7b85d4 } },
{ { 0x4fca1e46, 0xa392ddca, 0x569fc791, 0x346a706c } },
{ { 0x543bf3eb, 0x895b3cde, 0x2146bb80, 0x26b3c168 } },
{ { 0x929998db, 0x1ea472c9, 0x7207b36b, 0x6a8f10d4 } }
}; };
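The union is what removes the strict-aliasing hazard: the old tests declared the data as plain int arrays and then re-read that storage through short (and other) lvalues in the helpers, which is undefined behavior under C's aliasing rules, whereas reading the same bytes through another member of the union the object was declared with is documented to work in GNU C. A trimmed-down sketch of the two access patterns (only two members kept, hypothetical names and values):

/* Hypothetical two-member stand-in for the real 'union data' above.  */
union data2 {
  int w[4];
  short h[8];
};

static int plain[4] = { 1, 2, 3, 4 };
static union data2 u = { { 1, 2, 3, 4 } };

short
old_read (void)
{
  /* Undefined behavior: an int object read through a short lvalue.  */
  return ((short *) plain)[0];
}

short
new_read (void)
{
  /* Defined in GNU C: the same bytes read through another union member.  */
  return u.h[0];
}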