Commit 05629956 by H.J. Lu Committed by H.J. Lu

i386-cpuid.h (bit_SSE4_1): New.

2007-05-22  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.dg/i386-cpuid.h (bit_SSE4_1): New.
	(bit_SSE4_2): Likewise.
	(bit_POPCNT): Likewise.

	* gcc.target/i386/i386.exp (check_effective_target_sse4): New.
	Check if assembler supports SSE4 instructions.

	* gcc.target/i386/sse4_1-blendpd.c: New file.
	* gcc.target/i386/sse4_1-blendps.c: Likewise.
	* gcc.target/i386/sse4_1-blendvpd.c: Likewise.
	* gcc.target/i386/sse4_1-blendvps.c: Likewise.
	* gcc.target/i386/sse4_1-check.h: Likewise.
	* gcc.target/i386/sse4_1-dppd-1.c: Likewise.
	* gcc.target/i386/sse4_1-dppd-2.c: Likewise.
	* gcc.target/i386/sse4_1-dpps-1.c: Likewise.
	* gcc.target/i386/sse4_1-dpps-2.c: Likewise.
	* gcc.target/i386/sse4_1-extractps.c: Likewise.
	* gcc.target/i386/sse4_1-insertps-1.c: Likewise.
	* gcc.target/i386/sse4_1-insertps-2.c: Likewise.
	* gcc.target/i386/sse4_1-movntdqa.c: Likewise.
	* gcc.target/i386/sse4_1-mpsadbw.c: Likewise.
	* gcc.target/i386/sse4_1-packusdw.c: Likewise.
	* gcc.target/i386/sse4_1-pblendvb.c: Likewise.
	* gcc.target/i386/sse4_1-pblendw.c: Likewise.
	* gcc.target/i386/sse4_1-pcmpeqq.c: Likewise.
	* gcc.target/i386/sse4_1-pextrb.c: Likewise.
	* gcc.target/i386/sse4_1-pextrd.c: Likewise.
	* gcc.target/i386/sse4_1-pextrq.c: Likewise.
	* gcc.target/i386/sse4_1-pextrw.c: Likewise.
	* gcc.target/i386/sse4_1-phminposuw.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrb.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrd.c: Likewise.
	* gcc.target/i386/sse4_1-pinsrq.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxsb.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxsd.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxud.c: Likewise.
	* gcc.target/i386/sse4_1-pmaxuw.c: Likewise.
	* gcc.target/i386/sse4_1-pminsb.c: Likewise.
	* gcc.target/i386/sse4_1-pminsd.c: Likewise.
	* gcc.target/i386/sse4_1-pminud.c: Likewise.
	* gcc.target/i386/sse4_1-pminuw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxbw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxdq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxwd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovsxwq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxbw.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxdq.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxwd.c: Likewise.
	* gcc.target/i386/sse4_1-pmovzxwq.c: Likewise.
	* gcc.target/i386/sse4_1-pmuldq.c: Likewise.
	* gcc.target/i386/sse4_1-pmulld.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-1.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-2.c: Likewise.
	* gcc.target/i386/sse4_1-ptest-3.c: Likewise.
	* gcc.target/i386/sse4_1-round.h: Likewise.
	* gcc.target/i386/sse4_1-roundpd-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundpd-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundpd-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundps-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundsd-4.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-1.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-2.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-3.c: Likewise.
	* gcc.target/i386/sse4_1-roundss-4.c: Likewise.

From-SVN: r124947
parent 9a5cee02
2007-05-22 H.J. Lu <hongjiu.lu@intel.com>
* gcc.dg/i386-cpuid.h (bit_SSE4_1): New.
(bit_SSE4_2): Likewise.
(bit_POPCNT): Likewise.
* gcc.target/i386/i386.exp (check_effective_target_sse4): New.
Check if assembler supports SSE4 instructions.
* gcc.target/i386/sse4_1-blendpd.c: New file.
* gcc.target/i386/sse4_1-blendps.c: Likewise.
* gcc.target/i386/sse4_1-blendvpd.c: Likewise.
* gcc.target/i386/sse4_1-blendvps.c: Likewise.
* gcc.target/i386/sse4_1-check.h: Likewise.
* gcc.target/i386/sse4_1-dppd-1.c: Likewise.
* gcc.target/i386/sse4_1-dppd-2.c: Likewise.
* gcc.target/i386/sse4_1-dpps-1.c: Likewise.
* gcc.target/i386/sse4_1-dpps-2.c: Likewise.
* gcc.target/i386/sse4_1-extractps.c: Likewise.
* gcc.target/i386/sse4_1-insertps-1.c: Likewise.
* gcc.target/i386/sse4_1-insertps-2.c: Likewise.
* gcc.target/i386/sse4_1-movntdqa.c: Likewise.
* gcc.target/i386/sse4_1-mpsadbw.c: Likewise.
* gcc.target/i386/sse4_1-packusdw.c: Likewise.
* gcc.target/i386/sse4_1-pblendvb.c: Likewise.
* gcc.target/i386/sse4_1-pblendw.c: Likewise.
* gcc.target/i386/sse4_1-pcmpeqq.c: Likewise.
* gcc.target/i386/sse4_1-pextrb.c: Likewise.
* gcc.target/i386/sse4_1-pextrd.c: Likewise.
* gcc.target/i386/sse4_1-pextrq.c: Likewise.
* gcc.target/i386/sse4_1-pextrw.c: Likewise.
* gcc.target/i386/sse4_1-phminposuw.c: Likewise.
* gcc.target/i386/sse4_1-pinsrb.c: Likewise.
* gcc.target/i386/sse4_1-pinsrd.c: Likewise.
* gcc.target/i386/sse4_1-pinsrq.c: Likewise.
* gcc.target/i386/sse4_1-pmaxsb.c: Likewise.
* gcc.target/i386/sse4_1-pmaxsd.c: Likewise.
* gcc.target/i386/sse4_1-pmaxud.c: Likewise.
* gcc.target/i386/sse4_1-pmaxuw.c: Likewise.
* gcc.target/i386/sse4_1-pminsb.c: Likewise.
* gcc.target/i386/sse4_1-pminsd.c: Likewise.
* gcc.target/i386/sse4_1-pminud.c: Likewise.
* gcc.target/i386/sse4_1-pminuw.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxbd.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxbq.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxbw.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxdq.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxwd.c: Likewise.
* gcc.target/i386/sse4_1-pmovsxwq.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxbd.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxbq.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxbw.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxdq.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxwd.c: Likewise.
* gcc.target/i386/sse4_1-pmovzxwq.c: Likewise.
* gcc.target/i386/sse4_1-pmuldq.c: Likewise.
* gcc.target/i386/sse4_1-pmulld.c: Likewise.
* gcc.target/i386/sse4_1-ptest-1.c: Likewise.
* gcc.target/i386/sse4_1-ptest-2.c: Likewise.
* gcc.target/i386/sse4_1-ptest-3.c: Likewise.
* gcc.target/i386/sse4_1-round.h: Likewise.
* gcc.target/i386/sse4_1-roundpd-1.c: Likewise.
* gcc.target/i386/sse4_1-roundpd-2.c: Likewise.
* gcc.target/i386/sse4_1-roundpd-3.c: Likewise.
* gcc.target/i386/sse4_1-roundps-1.c: Likewise.
* gcc.target/i386/sse4_1-roundps-2.c: Likewise.
* gcc.target/i386/sse4_1-roundps-3.c: Likewise.
* gcc.target/i386/sse4_1-roundsd-1.c: Likewise.
* gcc.target/i386/sse4_1-roundsd-2.c: Likewise.
* gcc.target/i386/sse4_1-roundsd-3.c: Likewise.
* gcc.target/i386/sse4_1-roundsd-4.c: Likewise.
* gcc.target/i386/sse4_1-roundss-1.c: Likewise.
* gcc.target/i386/sse4_1-roundss-2.c: Likewise.
* gcc.target/i386/sse4_1-roundss-3.c: Likewise.
* gcc.target/i386/sse4_1-roundss-4.c: Likewise.
2007-05-22 Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
PR fortran/31627
......@@ -5,6 +5,9 @@
/* %ecx */
/* Each macro is a single-bit mask; the number in the shift is the bit
   position.  sse4_1-check.h tests bit_SSE4_1 against the value returned by
   i386_cpuid_ecx ().  */
#define bit_SSE3 (1 << 0)
#define bit_SSSE3 (1 << 9)
#define bit_SSE4_1 (1 << 19)
#define bit_SSE4_2 (1 << 20)
#define bit_POPCNT (1 << 23)
/* %edx */
#define bit_CMOV (1 << 15)
......
......@@ -37,6 +37,20 @@ proc check_effective_target_ssse3 { } {
} "-O2 -mssse3" ]
}
# Return 1 if sse4 instructions can be compiled.
#
# The probe hands the C snippet below to check_no_compiler_messages under
# the name "sse4.1", asks for an "object" build, and compiles it with
# "-O2 -msse4.1".  The snippet wraps __builtin_ia32_pmulld128, so the probe
# passes only when that builtin compiles without any compiler messages.
# NOTE(review): per the ChangeLog entry this is meant to detect assembler
# support for SSE4 as well -- confirm that the "object" build kind runs
# the assembler.
proc check_effective_target_sse4 { } {
return [check_no_compiler_messages sse4.1 object {
typedef long long __m128i __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
__m128i _mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X,
(__v4si)__Y);
}
} "-O2 -msse4.1" ]
}
# Return 1 if sse4a instructions can be compiled.
proc check_effective_target_sse4a { } {
return [check_no_compiler_messages sse4a object {
......
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

#ifndef MASK
#define MASK 0x03
#endif

/* Fill SRC1 and SRC2 (NUM * 2 doubles each) with test data.  Element I of
   SRC1 is I * I and element I of SRC2 is I + 20; the sign alternates from
   one element to the next, starting positive.  */
static void
init_blendpd (double *src1, double *src2)
{
  int idx;

  for (idx = 0; idx < NUM * 2; idx++)
    {
      /* Even elements are positive, odd elements negative.  */
      const int sgn = (idx & 1) ? -1 : 1;

      src1[idx] = idx * idx * sgn;
      src2[idx] = (idx + 20) * sgn;
    }
}
/* Scalar reference for blendpd.  Start from the two doubles at SRC1,
   replace lane J with SRC2[J] wherever bit J of MASK is set, and compare
   the result bytewise with *DST.  Returns the memcmp result, i.e. 0 when
   *DST matches the expected value.  */
static int
check_blendpd (__m128d *dst, double *src1, double *src2)
{
  double expected[2];
  int lane = 0;

  memcpy (expected, src1, sizeof (expected));

  while (lane < 2)
    {
      if ((MASK & (1 << lane)))
        expected[lane] = src2[lane];
      lane++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blend_pd with the compile-time MASK and verify every result
   against the scalar reference check_blendpd; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
__m128d x, y;
/* Vector view and scalar view of the same NUM-vector buffers.  */
union
{
__m128d x[NUM];
double d[NUM * 2];
} dst, src1, src2;
/* A single extra operand, zeroed below, for the second check.  */
union
{
__m128d x;
double d[2];
} src3;
int i;
init_blendpd (src1.d, src2.d);
/* Check blendpd imm8, m128, xmm */
for (i = 0; i < NUM; i++)
{
dst.x[i] = _mm_blend_pd (src1.x[i], src2.x[i], MASK);
if (check_blendpd (&dst.x[i], &src1.d[i * 2], &src2.d[i * 2]))
abort ();
}
/* Check blendpd imm8, xmm, xmm */
/* dst.x[2] (scalar elements dst.d[4..5]) is blended with zero in both
   operand orders.  */
src3.x = _mm_setzero_pd ();
x = _mm_blend_pd (dst.x[2], src3.x, MASK);
y = _mm_blend_pd (src3.x, dst.x[2], MASK);
if (check_blendpd (&x, &dst.d[4], &src3.d[0]))
abort ();
if (check_blendpd (&y, &src3.d[0], &dst.d[4]))
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

#ifndef MASK
#define MASK 0x0f
#endif

/* Fill SRC1 and SRC2 (NUM * 4 floats each) with test data.  Element I of
   SRC1 is I * I and element I of SRC2 is I + 20; the sign alternates from
   one element to the next, starting positive.  */
static void
init_blendps (float *src1, float *src2)
{
  int idx;

  for (idx = 0; idx < NUM * 4; idx++)
    {
      /* Even elements are positive, odd elements negative.  */
      const int sgn = (idx & 1) ? -1 : 1;

      src1[idx] = idx * idx * sgn;
      src2[idx] = (idx + 20) * sgn;
    }
}
/* Scalar reference for blendps.  Start from the four floats at SRC1,
   replace lane J with SRC2[J] wherever bit J of MASK is set, and compare
   the result bytewise with *DST.  Returns the memcmp result, i.e. 0 when
   *DST matches the expected value.  */
static int
check_blendps (__m128 *dst, float *src1, float *src2)
{
  float expected[4];
  int lane = 0;

  memcpy (expected, src1, sizeof (expected));

  while (lane < 4)
    {
      if ((MASK & (1 << lane)))
        expected[lane] = src2[lane];
      lane++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blend_ps with the compile-time MASK and verify every result
   against the scalar reference check_blendps; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
  __m128 x, y;
  /* Vector view and scalar view of the same NUM-vector buffers.  */
  union
    {
      __m128 x[NUM];
      float f[NUM * 4];
    } dst, src1, src2;
  /* A single extra operand for the second check.  */
  union
    {
      __m128 x;
      float f[4];
    } src3;
  int i;

  init_blendps (src1.f, src2.f);

  /* Check blendps imm8, m128, xmm */
  for (i = 0; i < NUM; i++)
    {
      dst.x[i] = _mm_blend_ps (src1.x[i], src2.x[i], MASK);
      if (check_blendps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4]))
        abort ();
    }

  /* Check blendps imm8, xmm, xmm */
  /* src3 must hold a defined value before it is blended and compared;
     zero it, as the blendpd and pblendw tests do, instead of reading an
     uninitialized object.  */
  src3.x = _mm_setzero_ps ();
  x = _mm_blend_ps (dst.x[2], src3.x, MASK);
  y = _mm_blend_ps (src3.x, dst.x[2], MASK);

  /* dst.x[2] corresponds to scalar elements dst.f[8..11].  */
  if (check_blendps (&x, &dst.f[8], &src3.f[0]))
    abort ();

  if (check_blendps (&y, &src3.f[0], &dst.f[8]))
    abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

/* Fill SRC1, SRC2 and MASK (NUM * 2 doubles each) with test data.
   SRC1[I] is I * (I + 1) and SRC2[I] is I + 20, with the sign alternating
   per element starting positive.  MASK[I] is (I + 120) * I, negated when
   the bit for this lane is set in the per-vector pattern I / 2, so the
   sign of each mask element selects the blend source.  */
static void
init_blendvpd (double *src1, double *src2, double *mask)
{
  int idx;

  for (idx = 0; idx < NUM * 2; idx++)
    {
      const int sgn = (idx & 1) ? -1 : 1;
      /* One pattern value per two-lane vector: 0, 1, 2, ...  */
      const int pattern = idx / 2;
      const int lane = idx % 2;

      src1[idx] = idx * (idx + 1) * sgn;
      src2[idx] = (idx + 20) * sgn;
      mask[idx] = (idx + 120) * idx;
      if ((pattern & (1 << lane)))
        mask[idx] = -mask[idx];
    }
}
/* Scalar reference for blendvpd.  Start from the two doubles at SRC1,
   replace lane J with SRC2[J] wherever MASK[J] is negative, and compare
   the result bytewise with *DST.  Returns the memcmp result, i.e. 0 when
   *DST matches the expected value.  */
static int
check_blendvpd (__m128d *dst, double *src1, double *src2,
                double *mask)
{
  double expected[2];
  int lane = 0;

  memcpy (expected, src1, sizeof (expected));

  while (lane < 2)
    {
      if (mask[lane] < 0.0)
        expected[lane] = src2[lane];
      lane++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blendv_pd on NUM vectors generated by init_blendvpd and
   verify each result against the scalar reference check_blendvpd;
   abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and scalar view of the same NUM-vector buffers.  */
union
{
__m128d x[NUM];
double d[NUM * 2];
} dst, src1, src2, mask;
int i;
init_blendvpd (src1.d, src2.d, mask.d);
for (i = 0; i < NUM; i++)
{
dst.x[i] = _mm_blendv_pd (src1.x[i], src2.x[i], mask.x[i]);
/* Vector i corresponds to scalar elements [i * 2, i * 2 + 1].  */
if (check_blendvpd (&dst.x[i], &src1.d[i * 2], &src2.d[i * 2],
&mask.d[i * 2]))
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

/* Fill SRC1, SRC2 and MASK (NUM * 4 floats each) with test data.
   SRC1[I] is I * (I + 1) and SRC2[I] is I + 20, with the sign alternating
   per element starting positive.  MASK[I] is (I + 120) * I, negated when
   the bit for this lane is set in the per-vector pattern I / 4, so the
   sign of each mask element selects the blend source.  */
static void
init_blendvps (float *src1, float *src2, float *mask)
{
  int idx;

  for (idx = 0; idx < NUM * 4; idx++)
    {
      const int sgn = (idx & 1) ? -1 : 1;
      /* One pattern value per four-lane vector: 0, 1, 2, ...  */
      const int pattern = idx / 4;
      const int lane = idx % 4;

      src1[idx] = idx * (idx + 1) * sgn;
      src2[idx] = (idx + 20) * sgn;
      mask[idx] = (idx + 120) * idx;
      if ((pattern & (1 << lane)))
        mask[idx] = -mask[idx];
    }
}
/* Scalar reference for blendvps.  Start from the four floats at SRC1,
   replace lane J with SRC2[J] wherever MASK[J] is negative, and compare
   the result bytewise with *DST.  Returns the memcmp result, i.e. 0 when
   *DST matches the expected value.  */
static int
check_blendvps (__m128 *dst, float *src1, float *src2,
                float *mask)
{
  float expected[4];
  int lane = 0;

  memcpy (expected, src1, sizeof (expected));

  while (lane < 4)
    {
      if (mask[lane] < 0.0)
        expected[lane] = src2[lane];
      lane++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blendv_ps on NUM vectors generated by init_blendvps and
   verify each result against the scalar reference check_blendvps;
   abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and scalar view of the same NUM-vector buffers.  */
union
{
__m128 x[NUM];
float f[NUM * 4];
} dst, src1, src2, mask;
int i;
init_blendvps (src1.f, src2.f, mask.f);
for (i = 0; i < NUM; i++)
{
dst.x[i] = _mm_blendv_ps (src1.x[i], src2.x[i], mask.x[i]);
/* Vector i corresponds to scalar elements [i * 4, i * 4 + 3].  */
if (check_blendvps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4],
&mask.f[i * 4]))
abort ();
}
}
#include <stdio.h>
#include <stdlib.h>
#include "../auto-host.h"
#include "../../gcc.dg/i386-cpuid.h"
static void sse4_1_test (void);
/* Test driver shared by the SSE4.1 run tests: call the including file's
   sse4_1_test () when the bit_SSE4_1 bit is set in the value returned by
   i386_cpuid_ecx (), then exit successfully either way.  */
int
main ()
{
  const unsigned long features = i386_cpuid_ecx ();

  /* Run SSE4.1 test only if host has SSE4.1 support.  */
  if ((features & bit_SSE4_1) != 0)
    sse4_1_test ();

  exit (0);
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define lmskN 0x00
#define lmsk0 0x01
#define lmsk1 0x02
#define lmsk01 0x03
#define hmskA 0x30
#define hmsk0 0x10
#define hmsk1 0x20
#define hmsk01 0x30
#define hmskN 0x00
#ifndef HIMASK
#define HIMASK hmskA
#endif
/* Exercise _mm_dp_pd with a fixed high (input-select) mask HIMASK and each
   of the four low (output-select) masks, and compare the selected result
   lanes with a scalar dot product; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
union
{
__m128d x;
double d[2];
} val1, val2, res[4];
int masks[4];
int i, j;
val1.d[0] = 2.;
val1.d[1] = 3.;
val2.d[0] = 10.;
val2.d[1] = 100.;
/* The immediate must be a compile-time constant, so each mask is spelled
   out rather than looped over.  */
res[0].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmskN);
res[1].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk0);
res[2].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk1);
res[3].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk01);
/* masks[i] mirrors the immediate used for res[i].  */
masks[0] = HIMASK | lmskN;
masks[1] = HIMASK | lmsk0;
masks[2] = HIMASK | lmsk1;
masks[3] = HIMASK | lmsk01;
for (i = 0; i < 4; i++)
{
double tmp = 0.;
/* Sum the products of the lanes enabled by bits 4 and 5 of HIMASK.  */
for (j = 0; j < 2; j++)
if (HIMASK & (0x10 << j))
tmp = tmp + (val1.d[j] * val2.d[j]);
/* Only lanes selected by the low mask bits are compared; unselected
   lanes are not checked here.  */
for (j = 0; j < 2; j++)
if ((masks[i] & (1 << j)) && res[i].d[j] != tmp)
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define lmskN 0x00
#define lmsk0 0x01
#define lmsk1 0x02
#define lmsk01 0x03
#define hmskA 0x30
#define hmsk0 0x10
#define hmsk1 0x20
#define hmsk01 0x30
#define hmskN 0x00
#ifndef HIMASK
#define HIMASK hmskA
#endif
#ifndef LOMASK
#define LOMASK lmsk01
#endif
/* Exercise _mm_dp_pd on operands held in arrays, using the immediate
   HIMASK | LOMASK, and compare each full result vector bytewise with a
   scalar reference; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128d x;
      double d[2];
    } val1[4], val2[4], res[4], chk[4];
  int i, j;
  double tmp;

  for (i = 0; i < 4; i++)
    {
      val1[i].d [0] = 2.;
      val1[i].d [1] = 3.;

      val2[i].d [0] = 10.;
      val2[i].d [1] = 100.;

      /* Sum the products of the lanes enabled by bits 4 and 5 of HIMASK.  */
      tmp = 0.;
      for (j = 0; j < 2; j++)
        if ((HIMASK & (0x10 << j)))
          tmp += val1[i].d [j] * val2[i].d [j];

      /* Lanes selected by LOMASK receive the sum; dppd writes zero to the
         others.  Every lane of chk must be defined because the whole
         vector is compared with memcmp below, so a LOMASK override other
         than lmsk01 must not leave part of chk uninitialized.  */
      for (j = 0; j < 2; j++)
        chk[i].d[j] = (LOMASK & (1 << j)) ? tmp : 0.;
    }

  for (i = 0; i < 4; i++)
    {
      res[i].x = _mm_dp_pd (val1[i].x, val2[i].x, HIMASK | LOMASK);
      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define lmskN 0x00
#define lmsk0 0x01
#define lmsk1 0x02
#define lmsk2 0x04
#define lmsk3 0x08
#define lmsk01 0x03
#define lmsk02 0x05
#define lmsk03 0x09
#define lmsk12 0x06
#define lmsk13 0x0A
#define lmsk23 0x0C
#define lmskA 0x0F
#define hmskN 0x00
#define hmskA 0xF0
#define hmsk0 0x10
#define hmsk1 0x20
#define hmsk2 0x40
#define hmsk3 0x80
#define hmsk01 0x30
#define hmsk02 0x50
#define hmsk03 0x90
#define hmsk12 0x60
#define hmsk13 0xA0
#define hmsk23 0xC0
#ifndef HIMASK
#define HIMASK hmskA
#endif
/* Exercise _mm_dp_ps with a fixed high (input-select) mask HIMASK and all
   sixteen low (output-select) masks, and compare the selected result lanes
   with a scalar dot product; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
union
{
__m128 x;
float f[4];
} val1, val2, res[16];
int masks[16];
int i, j;
val1.f[0] = 2.;
val1.f[1] = 3.;
val1.f[2] = 4.;
val1.f[3] = 5.;
val2.f[0] = 10.;
val2.f[1] = 100.;
val2.f[2] = 1000.;
val2.f[3] = 10000.;
/* The immediate must be a compile-time constant, so each mask is spelled
   out: single lanes, lane pairs, lane triples (0x0F & ~lmskK), none, all.  */
res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0);
res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1);
res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2);
res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3);
res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01);
res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02);
res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03);
res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12);
res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13);
res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23);
res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0));
res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1));
res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2));
res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3));
res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN);
res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA);
/* masks[i] mirrors the immediate used for res[i].  */
masks[0] = HIMASK | lmsk0;
masks[1] = HIMASK | lmsk1;
masks[2] = HIMASK | lmsk2;
masks[3] = HIMASK | lmsk3;
masks[4] = HIMASK | lmsk01;
masks[5] = HIMASK | lmsk02;
masks[6] = HIMASK | lmsk03;
masks[7] = HIMASK | lmsk12;
masks[8] = HIMASK | lmsk13;
masks[9] = HIMASK | lmsk23;
masks[10] = HIMASK | (0x0F & ~lmsk0);
masks[11] = HIMASK | (0x0F & ~lmsk1);
masks[12] = HIMASK | (0x0F & ~lmsk2);
masks[13] = HIMASK | (0x0F & ~lmsk3);
masks[14] = HIMASK | lmskN;
masks[15] = HIMASK | lmskA;
for (i = 0; i <= 15; i++)
{
float tmp = 0.;
/* Sum the products of the lanes enabled by bits 4..7 of HIMASK.  */
for (j = 0; j < 4; j++)
if ((HIMASK & (0x10 << j)))
tmp += val1.f[j] * val2.f[j];
/* Only lanes selected by the low mask bits are compared; unselected
   lanes are not checked here.  */
for (j = 0; j < 4; j++)
if ((masks[i] & (1 << j)) && res[i].f[j] != tmp)
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define lmskN 0x00
#define lmsk0 0x01
#define lmsk1 0x02
#define lmsk2 0x04
#define lmsk3 0x08
#define lmsk01 0x03
#define lmsk02 0x05
#define lmsk03 0x09
#define lmsk12 0x06
#define lmsk13 0x0A
#define lmsk23 0x0C
#define lmskA 0x0F
#define hmskN 0x00
#define hmskA 0xF0
#define hmsk0 0x10
#define hmsk1 0x20
#define hmsk2 0x40
#define hmsk3 0x80
#define hmsk01 0x30
#define hmsk02 0x50
#define hmsk03 0x90
#define hmsk12 0x60
#define hmsk13 0xA0
#define hmsk23 0xC0
#ifndef HIMASK
#define HIMASK hmskA
#endif
#ifndef LOMASK
#define LOMASK lmskA
#endif
/* Exercise _mm_dp_ps on operands held in arrays, using the immediate
   HIMASK | LOMASK, and compare each full result vector bytewise with a
   scalar reference; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128 x;
      float f[4];
    } val1[16], val2[16], res[16], chk[16];
  int i,j;
  float tmp;

  for (i = 0; i < 16; i++)
    {
      val1[i].f[0] = 2.;
      val1[i].f[1] = 3.;
      val1[i].f[2] = 4.;
      val1[i].f[3] = 5.;

      val2[i].f[0] = 10.;
      val2[i].f[1] = 100.;
      val2[i].f[2] = 1000.;
      val2[i].f[3] = 10000.;

      /* Sum the products of the lanes enabled by bits 4..7 of HIMASK.  */
      tmp = 0.;
      for (j = 0; j < 4; j++)
        if ((HIMASK & (0x10 << j)))
          tmp += val1[i].f [j] * val2[i].f [j];

      /* Lanes selected by LOMASK receive the sum; dpps writes zero to the
         others.  Every lane of chk must be defined because the whole
         vector is compared with memcmp below, so a LOMASK override other
         than lmskA must not leave part of chk uninitialized.  */
      for (j = 0; j < 4; j++)
        chk[i].f[j] = (LOMASK & (1 << j)) ? tmp : 0.f;
    }

  for (i = 0; i < 16; i++)
    {
      res[i].x = _mm_dp_ps (val1[i].x, val2[i].x, HIMASK | LOMASK);
      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
int masks[4];
#define msk0 0x00
#define msk1 0x01
#define msk2 0x02
#define msk3 0x03
/* Exercise _mm_extract_ps and _MM_EXTRACT_FLOAT for each of the four lane
   indices and compare the extracted value with the matching element of the
   source vector; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
union
{
__m128 x;
float f[4];
} val1, val2;
/* _mm_extract_ps results are stored through the int member and read back
   through the float member for the comparison below.  */
union
{
int i;
float f;
} res[4];
float resm[4];
int i;
val1.f[0] = 10.;
val1.f[1] = 2.;
val1.f[2] = 3.;
val1.f[3] = 40.;
val2.f[0] = 77.;
val2.f[1] = 21.;
val2.f[2] = 34.;
val2.f[3] = 49.;
res[0].i = _mm_extract_ps (val1.x, msk0);
res[1].i = _mm_extract_ps (val1.x, msk1);
res[2].i = _mm_extract_ps (val1.x, msk2);
res[3].i = _mm_extract_ps (val1.x, msk3);
_MM_EXTRACT_FLOAT (resm[0], val2.x, msk0);
_MM_EXTRACT_FLOAT (resm[1], val2.x, msk1);
_MM_EXTRACT_FLOAT (resm[2], val2.x, msk2);
_MM_EXTRACT_FLOAT (resm[3], val2.x, msk3);
/* masks[i] records the lane index used for res[i] and resm[i].  */
masks[0] = msk0;
masks[1] = msk1;
masks[2] = msk2;
masks[3] = msk3;
for( i=0; i < 4; i++ )
{
if (res[i].f != val1.f[masks[i]])
abort ();
if (resm[i] != val2.f[masks[i]])
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define msk0 0x01
#define msk1 0x10
#define msk2 0x29
#define msk3 0x30
#define msk4 0xFC
#define msk5 0x05
#define msk6 0x0A
#define msk7 0x0F
/* Exercise _mm_insert_ps with several immediates and compare each result
   bytewise with a scalar model of the operation; abort () on any
   mismatch.  Only msk0..msk4 are used in this function.  */
static void
sse4_1_test (void)
{
union
{
__m128 x;
float f[4];
} res[8], val1, val2, tmp;
int masks[8];
int i, j;
/* val2 is the destination operand (all 55.0); val1 supplies the element
   to insert.  */
val2.f[0] = 55.0;
val2.f[1] = 55.0;
val2.f[2] = 55.0;
val2.f[3] = 55.0;
val1.f[0] = 1.;
val1.f[1] = 2.;
val1.f[2] = 3.;
val1.f[3] = 4.;
res[0].x = _mm_insert_ps (val2.x, val1.x, msk0);
res[1].x = _mm_insert_ps (val2.x, val1.x, msk1);
res[2].x = _mm_insert_ps (val2.x, val1.x, msk2);
res[3].x = _mm_insert_ps (val2.x, val1.x, msk3);
masks[0] = msk0;
masks[1] = msk1;
masks[2] = msk2;
masks[3] = msk3;
/* res[4..7] all use the same immediate, msk4, inside a loop.  */
for (i = 0; i < 4; i++)
res[i + 4].x = _mm_insert_ps (val2.x, val1.x, msk4);
masks[4] = msk4;
masks[5] = msk4;
masks[6] = msk4;
masks[7] = msk4;
for (i=0; i < 8; i++)
{
/* Scalar model: bits 7:6 of the mask pick the source element of val1,
   bits 5:4 pick the destination slot, and each set bit in 3:0 zeroes
   the corresponding result element.  */
tmp = val2;
tmp.f[(masks[i] & 0x30) >> 4] = val1.f[(masks[i] & 0xC0) >> 6];
for (j = 0; j < 4; j++)
if (masks[i] & (0x1 << j))
tmp.f[j] = 0.f;
if (memcmp (&res[i], &tmp, sizeof (tmp)))
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
/* Check the helper macros _MM_PICK_OUT_PS and _MM_MK_INSERTPS_NDX;
   abort () on any unexpected result.  */
static void
sse4_1_test (void)
{
union
{
__m128 x;
float f[4];
} vals[4], val;
int i, j;
val.f[0]= 1.;
val.f[1]= 2.;
val.f[2]= 3.;
val.f[3]= 4.;
vals[0].x = _MM_PICK_OUT_PS (val.x, 0);
vals[1].x = _MM_PICK_OUT_PS (val.x, 1);
vals[2].x = _MM_PICK_OUT_PS (val.x, 2);
vals[3].x = _MM_PICK_OUT_PS (val.x, 3);
/* Expect vals[i] to hold val.f[i] in element 0 and zero in elements
   1..3.  */
for (i = 0; i < 4; i++)
for (j = 0; j < 4; j++)
if ((j != 0 && vals[i].f[j] != 0)
|| (j == 0 && vals[i].f[j] != val.f[i]))
abort ();
/* The expected constants place the first argument in bits 7:6, the
   second in bits 5:4 and the third in bits 3:0.  */
if (_MM_MK_INSERTPS_NDX(0, 0, 0x1) != 0x01
|| _MM_MK_INSERTPS_NDX(0, 1, 0x2) != 0x12
|| _MM_MK_INSERTPS_NDX(0, 2, 0x3) != 0x23
|| _MM_MK_INSERTPS_NDX(0, 3, 0x4) != 0x34
|| _MM_MK_INSERTPS_NDX(1, 0, 0x5) != 0x45
|| _MM_MK_INSERTPS_NDX(1, 1, 0x6) != 0x56
|| _MM_MK_INSERTPS_NDX(2, 2, 0x7) != 0xA7
|| _MM_MK_INSERTPS_NDX(3, 3, 0x8) != 0xF8)
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

/* Fill SRC (NUM * 4 ints) with test data.  Element J of row I is
   J * I * I, negated for odd J.  */
static void
init_movntdqa (int *src)
{
  int pos;

  for (pos = 0; pos < NUM * 4; pos++)
    {
      const int row = pos / 4;
      const int col = pos % 4;
      /* The sign flips on every element; each row has four elements, so
         it depends only on the column parity.  */
      const int sgn = (col & 1) ? -1 : 1;

      src[pos] = col * row * row * sgn;
    }
}
/* Load NUM vectors with _mm_stream_load_si128 and check that each loaded
   vector is bytewise identical to its source; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and scalar view of the same NUM-vector buffers.  */
union
{
__m128i x[NUM];
int i[NUM * 4];
} dst, src;
int i;
init_movntdqa (src.i);
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_stream_load_si128 (&src.x[i]);
for (i = 0; i < NUM; i++)
if (memcmp (&dst.x[i], &src.x[i], sizeof(src.x[i])))
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define msk0 0xC0
#define msk1 0x01
#define msk2 0xF2
#define msk3 0x03
#define msk4 0x84
#define msk5 0x05
#define msk6 0xE6
#define msk7 0x67
/* Scalar reference for mpsadbw.  Bits 1:0 of MASK select a 4-byte group of
   V2 (at byte offset 4 * (MASK & 3)); bit 2 selects the starting byte of
   V1 (0 or 4).  Result word J is the sum of absolute differences between
   V1[start + J .. start + J + 3] and the selected V2 group.  Returns the
   eight 16-bit sums packed in an __m128i.  */
static __m128i
compute_mpsadbw (unsigned char *v1, unsigned char *v2, int mask)
{
  union
    {
      __m128i x;
      unsigned short s[8];
    } out;
  unsigned char ref[4];
  const int ref_start = 4 * (mask & 3);
  const int win_start = (mask & 4) ? 4 : 0;
  int pos, k;

  for (k = 0; k < 4; k++)
    ref[k] = v2[ref_start + k];

  for (pos = 0; pos < 8; pos++)
    {
      unsigned short sum = 0;

      for (k = 0; k < 4; k++)
        sum += abs (v1[win_start + pos + k] - ref[k]);

      out.s[pos] = sum;
    }

  return out.x;
}
/* Exercise _mm_mpsadbw_epu8 with eight different immediates on fixed
   operands, then with one immediate on eight second operands held in an
   array, comparing every result bytewise with compute_mpsadbw; abort () on
   any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector, 32-bit and byte views of the same 16 bytes.  */
union
{
__m128i x;
unsigned int i[4];
unsigned char c[16];
} val1, val2, val3 [8];
__m128i res[8], tmp;
unsigned char masks[8];
int i;
val1.i[0] = 0x35251505;
val1.i[1] = 0x75655545;
val1.i[2] = 0xB5A59585;
val1.i[3] = 0xF5E5D5C5;
val2.i[0] = 0x31211101;
val2.i[1] = 0x71615141;
val2.i[2] = 0xB1A19181;
val2.i[3] = 0xF1E1D1C1;
/* val3[i] is val2 with its four words in reverse order when i % 3 == 1,
   and a plain copy of val2 otherwise.  */
for (i=0; i < 8; i++)
switch (i % 3)
{
case 1:
val3[i].i[0] = 0xF1E1D1C1;
val3[i].i[1] = 0xB1A19181;
val3[i].i[2] = 0x71615141;
val3[i].i[3] = 0x31211101;
break;
default:
val3[i].x = val2.x;
break;
}
/* Check mpsadbw imm8, xmm, xmm. */
res[0] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk0);
res[1] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk1);
res[2] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk2);
res[3] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk3);
res[4] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk4);
res[5] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk5);
res[6] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk6);
res[7] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk7);
/* masks[i] records the immediate used for res[i].  */
masks[0] = msk0;
masks[1] = msk1;
masks[2] = msk2;
masks[3] = msk3;
masks[4] = msk4;
masks[5] = msk5;
masks[6] = msk6;
masks[7] = msk7;
for (i=0; i < 8; i++)
{
tmp = compute_mpsadbw (val1.c, val2.c, masks[i]);
if (memcmp (&tmp, &res[i], sizeof (tmp)))
abort ();
}
/* Check mpsadbw imm8, m128, xmm. */
for (i=0; i < 8; i++)
{
res[i] = _mm_mpsadbw_epu8 (val1.x, val3[i].x, msk4);
masks[i] = msk4;
}
for (i=0; i < 8; i++)
{
tmp = compute_mpsadbw (val1.c, val3[i].c, masks[i]);
if (memcmp (&tmp, &res[i], sizeof (tmp)))
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Saturate IVAL to the unsigned 16-bit range: negative values become 0,
   values above 0xffff become 0xffff, everything else is returned
   unchanged.  */
static unsigned short
int_to_ushort (int iVal)
{
  if (iVal < 0)
    return 0;

  if (iVal > 0xffff)
    return 0xffff;

  return iVal;
}
/* Exercise _mm_packus_epi32 on NUM / 4 vector pairs and compare every
   packed 16-bit element with int_to_ushort applied to the corresponding
   source int; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Sources: vector view and 32-bit view of the same storage.  */
union
{
__m128i x[NUM / 4];
int i[NUM];
} src1, src2;
/* Destination: vector view and 16-bit view of the same storage.  */
union
{
__m128i x[NUM / 4];
unsigned short s[NUM * 2];
} dst;
int i, sign = 1;
/* Sign alternates per element, so negative inputs are included.  */
for (i = 0; i < NUM; i++)
{
src1.i[i] = i * i * sign;
src2.i[i] = (i + 20) * sign;
sign = -sign;
}
for (i = 0; i < NUM; i += 4)
dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
for (i = 0; i < NUM; i ++)
{
int dstIndex;
unsigned short sVal;
sVal = int_to_ushort (src1.i[i]);
/* Each result vector holds eight shorts: the four values from src1 are
   expected first, followed by the four from src2.  */
dstIndex = (i % 4) + (i / 4) * 8;
if (sVal != dst.s[dstIndex])
abort ();
sVal = int_to_ushort (src2.i[i]);
dstIndex += 4;
if (sVal != dst.s[dstIndex])
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

/* Fill SRC1, SRC2 and MASK (NUM * 16 bytes each) with test data.
   SRC1[I] is I * I and SRC2[I] is I + 20, with the sign alternating per
   element starting positive; both are truncated to unsigned char.
   MASK[I] is a mix of the index and the two source bytes.  */
static void
init_pblendvb (unsigned char *src1, unsigned char *src2,
               unsigned char *mask)
{
  int idx;

  for (idx = 0; idx < NUM * 16; idx++)
    {
      /* Even elements use +1, odd elements -1.  */
      const int sgn = (idx & 1) ? -1 : 1;

      src1[idx] = idx * idx * sgn;
      src2[idx] = (idx + 20) * sgn;
      mask[idx] = (idx % 3) + ((idx * (14 + sgn))
                               ^ (src1[idx] | src2[idx] | (idx * 3)));
    }
}
/* Scalar reference for pblendvb.  Start from the sixteen bytes at SRC1,
   replace byte J with SRC2[J] wherever the top bit (0x80) of MASK[J] is
   set, and compare the result bytewise with *DST.  Returns the memcmp
   result, i.e. 0 when *DST matches the expected value.  */
static int
check_pblendvb (__m128i *dst, unsigned char *src1,
                unsigned char *src2, unsigned char *mask)
{
  unsigned char expected[16];
  int pos = 0;

  memcpy (expected, src1, sizeof (expected));

  while (pos < 16)
    {
      if (mask[pos] & 0x80)
        expected[pos] = src2[pos];
      pos++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blendv_epi8 on NUM vectors generated by init_pblendvb and
   verify each result against the scalar reference check_pblendvb;
   abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and byte view of the same NUM-vector buffers.  */
union
{
__m128i x[NUM];
unsigned char c[NUM * 16];
} dst, src1, src2, mask;
int i;
init_pblendvb (src1.c, src2.c, mask.c);
for (i = 0; i < NUM; i++)
{
dst.x[i] = _mm_blendv_epi8 (src1.x[i], src2.x[i], mask.x[i]);
/* Vector i corresponds to bytes [i * 16, i * 16 + 15].  */
if (check_pblendvb (&dst.x[i], &src1.c[i * 16], &src2.c[i * 16],
&mask.c[i * 16]))
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define NUM 20

#ifndef MASK
#define MASK 0x0f
#endif

/* Fill SRC1 and SRC2 (NUM * 8 shorts each) with test data.  Element I of
   SRC1 is I * I and element I of SRC2 is I + 20; the sign alternates from
   one element to the next, starting positive.  */
static void
init_pblendw (short *src1, short *src2)
{
  int idx;

  for (idx = 0; idx < NUM * 8; idx++)
    {
      /* Even elements are positive, odd elements negative.  */
      const int sgn = (idx & 1) ? -1 : 1;

      src1[idx] = idx * idx * sgn;
      src2[idx] = (idx + 20) * sgn;
    }
}
/* Scalar reference for pblendw.  Start from the eight shorts at SRC1,
   replace element J with SRC2[J] wherever bit J of MASK is set, and
   compare the result bytewise with *DST.  Returns the memcmp result,
   i.e. 0 when *DST matches the expected value.  */
static int
check_pblendw (__m128i *dst, short *src1, short *src2)
{
  short expected[8];
  int lane = 0;

  memcpy (expected, src1, sizeof (expected));

  while (lane < 8)
    {
      if ((MASK & (1 << lane)))
        expected[lane] = src2[lane];
      lane++;
    }

  return memcmp (dst, expected, sizeof (expected));
}
/* Exercise _mm_blend_epi16 with the compile-time MASK and verify every
   result against the scalar reference check_pblendw; abort () on any
   mismatch.  */
static void
sse4_1_test (void)
{
__m128i x, y;
/* Vector view and 16-bit view of the same NUM-vector buffers.  */
union
{
__m128i x[NUM];
short s[NUM * 8];
} dst, src1, src2;
/* A single extra operand, zeroed below, for the second check.  */
union
{
__m128i x;
short s[8];
} src3;
int i;
init_pblendw (src1.s, src2.s);
/* Check pblendw imm8, m128, xmm */
for (i = 0; i < NUM; i++)
{
dst.x[i] = _mm_blend_epi16 (src1.x[i], src2.x[i], MASK);
if (check_pblendw (&dst.x[i], &src1.s[i * 8], &src2.s[i * 8]))
abort ();
}
/* Check pblendw imm8, xmm, xmm */
/* dst.x[2] (elements dst.s[16..23]) is blended with zero in both operand
   orders.  */
src3.x = _mm_setzero_si128 ();
x = _mm_blend_epi16 (dst.x[2], src3.x, MASK);
y = _mm_blend_epi16 (src3.x, dst.x[2], MASK);
if (check_pblendw (&x, &dst.s[16], &src3.s[0]))
abort ();
if (check_pblendw (&y, &src3.s[0], &dst.s[16]))
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Exercise _mm_cmpeq_epi64 on NUM / 2 vector pairs and compare every
   64-bit result element with a scalar equality test; abort () on any
   mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and 64-bit view of the same storage.  */
union
{
__m128i x[NUM / 2];
long long ll[NUM];
} dst, src1, src2;
int i, sign=1;
long long is_eq;
/* src1[i] = i * i and src2[i] = i + 20 with alternating sign; the two
   are equal only where i * i == i + 20.  */
for (i = 0; i < NUM; i++)
{
src1.ll[i] = i * i * sign;
src2.ll[i] = (i + 20) * sign;
sign = -sign;
}
for (i = 0; i < NUM; i += 2)
dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
for (i = 0; i < NUM; i++)
{
/* Expected element: all ones when equal, zero otherwise.  */
is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
if (is_eq != dst.ll[i])
abort ();
}
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define msk0 0
#define msk1 1
#define msk2 2
#define msk3 3
#define msk4 4
#define msk5 5
#define msk6 6
#define msk7 7
#define msk8 8
#define msk9 9
#define msk10 10
#define msk11 11
#define msk12 12
#define msk13 13
#define msk14 14
#define msk15 15
/* Exercise _mm_extract_epi8 for each of the sixteen byte indices and
   compare the extracted value with the matching byte of the source vector;
   abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector, 32-bit and byte views of the same 16 bytes.  */
union
{
__m128i x;
int i[4];
char c[16];
} val1;
int res[16], masks[16];
int i;
/* Every byte of the test pattern is in the range 0x01..0x10.  */
val1.i[0] = 0x04030201;
val1.i[1] = 0x08070605;
val1.i[2] = 0x0C0B0A09;
val1.i[3] = 0x100F0E0D;
/* The index must be a compile-time constant, so each one is spelled
   out.  */
res[0] = _mm_extract_epi8 (val1.x, msk0);
res[1] = _mm_extract_epi8 (val1.x, msk1);
res[2] = _mm_extract_epi8 (val1.x, msk2);
res[3] = _mm_extract_epi8 (val1.x, msk3);
res[4] = _mm_extract_epi8 (val1.x, msk4);
res[5] = _mm_extract_epi8 (val1.x, msk5);
res[6] = _mm_extract_epi8 (val1.x, msk6);
res[7] = _mm_extract_epi8 (val1.x, msk7);
res[8] = _mm_extract_epi8 (val1.x, msk8);
res[9] = _mm_extract_epi8 (val1.x, msk9);
res[10] = _mm_extract_epi8 (val1.x, msk10);
res[11] = _mm_extract_epi8 (val1.x, msk11);
res[12] = _mm_extract_epi8 (val1.x, msk12);
res[13] = _mm_extract_epi8 (val1.x, msk13);
res[14] = _mm_extract_epi8 (val1.x, msk14);
res[15] = _mm_extract_epi8 (val1.x, msk15);
/* masks[i] records the index used for res[i].  */
masks[0] = msk0;
masks[1] = msk1;
masks[2] = msk2;
masks[3] = msk3;
masks[4] = msk4;
masks[5] = msk5;
masks[6] = msk6;
masks[7] = msk7;
masks[8] = msk8;
masks[9] = msk9;
masks[10] = msk10;
masks[11] = msk11;
masks[12] = msk12;
masks[13] = msk13;
masks[14] = msk14;
masks[15] = msk15;
for (i = 0; i < 16; i++)
if (res[i] != val1.c [masks[i]])
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define msk0 0
#define msk1 1
#define msk2 2
#define msk3 3
/* Exercise _mm_extract_epi32 for each of the four element indices and
   compare the extracted value with the matching element of the source
   vector; abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and 32-bit view of the same 16 bytes.  */
union
{
__m128i x;
int i[4];
} val1;
int res[4], masks[4];
int i;
val1.i[0] = 0x04030201;
val1.i[1] = 0x08070605;
val1.i[2] = 0x0C0B0A09;
val1.i[3] = 0x100F0E0D;
/* The index must be a compile-time constant, so each one is spelled
   out.  */
res[0] = _mm_extract_epi32 (val1.x, msk0);
res[1] = _mm_extract_epi32 (val1.x, msk1);
res[2] = _mm_extract_epi32 (val1.x, msk2);
res[3] = _mm_extract_epi32 (val1.x, msk3);
/* masks[i] records the index used for res[i].  */
masks[0] = msk0;
masks[1] = msk1;
masks[2] = msk2;
masks[3] = msk3;
for (i = 0; i < 4; i++)
if (res[i] != val1.i [masks[i]])
abort ();
}
/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define msk0 0
#define msk1 1
/* Exercise _mm_extract_epi64 for both element indices and compare the
   extracted value with the matching element of the source vector;
   abort () on any mismatch.  */
static void
sse4_1_test (void)
{
/* Vector view and 64-bit view of the same 16 bytes.  */
union
{
__m128i x;
long long ll[2];
} val1;
long long res[2];
int masks[2];
int i;
val1.ll[0] = 0x0807060504030201LL;
val1.ll[1] = 0x100F0E0D0C0B0A09LL;
res[0] = _mm_extract_epi64 (val1.x, msk0);
res[1] = _mm_extract_epi64 (val1.x, msk1);
/* masks[i] records the index used for res[i].  */
masks[0] = msk0;
masks[1] = msk1;
for (i = 0; i < 2; i++)
if (res[i] != val1.ll [masks[i]])
abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define msk0 0
#define msk1 1
#define msk2 2
#define msk3 3
#define msk4 4
#define msk5 5
#define msk6 6
#define msk7 7
/* Check that _mm_extract_epi16 hands back the 16-bit element named by
   each of the eight selector macros.  The vector is filled through its
   int view and read back through its short view.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      int i[4];
      short s[8];
    } vec;
  const int selectors[8] =
    { msk0, msk1, msk2, msk3, msk4, msk5, msk6, msk7 };
  int extracted[8];
  int lane;

  vec.i[0] = 0x04030201;
  vec.i[1] = 0x08070605;
  vec.i[2] = 0x0C0B0A09;
  vec.i[3] = 0x100F0E0D;

  /* One call per selector macro.  */
  extracted[0] = _mm_extract_epi16 (vec.x, msk0);
  extracted[1] = _mm_extract_epi16 (vec.x, msk1);
  extracted[2] = _mm_extract_epi16 (vec.x, msk2);
  extracted[3] = _mm_extract_epi16 (vec.x, msk3);
  extracted[4] = _mm_extract_epi16 (vec.x, msk4);
  extracted[5] = _mm_extract_epi16 (vec.x, msk5);
  extracted[6] = _mm_extract_epi16 (vec.x, msk6);
  extracted[7] = _mm_extract_epi16 (vec.x, msk7);

  for (lane = 0; lane < 8; lane++)
    {
      if (extracted[lane] != vec.s[selectors[lane]])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Check _mm_minpos_epu16 against a scalar search: for every group of
   eight unsigned 16-bit elements, both the minimum value and the
   position of its first occurrence must match.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM/8];
      unsigned short s[NUM];
    } src;
  unsigned short minVal[NUM/8];
  int minInd[NUM/8];
  unsigned short minValScalar, minIndScalar;
  int i, j, res;

  for (i = 0; i < NUM; i++)
    src.s[i] = i * i / (i + i / 3.14 + 1.0);

  for (i = 0, j = 0; i < NUM; i += 8, j++)
    {
      res = _mm_cvtsi128_si32 (_mm_minpos_epu16 (src.x [i/8]));
      /* The low word is the minimum value.  The word above it is the
         index of that value; with eight elements the index needs three
         bits, so mask with 0x7 (0x3 would drop indices 4..7).  */
      minVal[j] = res & 0xffff;
      minInd[j] = (res >> 16) & 0x7;
    }

  for (i = 0; i < NUM; i += 8)
    {
      /* Scalar reference: the strict '>' keeps the first occurrence.  */
      minValScalar = src.s[i];
      minIndScalar = 0;
      for (j = i + 1; j < i + 8; j++)
        if (minValScalar > src.s[j])
          {
            minValScalar = src.s[j];
            minIndScalar = j - i;
          }

      /* Fail when EITHER the value or the index is wrong; requiring both
         to mismatch would let a half-wrong result slip through.  */
      if (minValScalar != minVal[i/8] || minIndScalar != minInd[i/8])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define msk0 0x00
#define msk1 0x01
#define msk2 0x02
#define msk3 0x03
#define msk4 0x04
#define msk5 0x05
#define msk6 0x06
#define msk7 0x07
#define msk8 0x08
#define msk9 0x09
#define mskA 0x0A
#define mskB 0x0B
#define mskC 0x0C
#define mskD 0x0D
#define mskE 0x0E
#define mskF 0x0F
/* Exercise _mm_insert_epi8 in two ways: a fixed byte inserted at each of
   the sixteen positions, then a varying byte inserted at position 0.
   Each result is compared with a copy of the source vector patched by
   hand.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned int i[4];
      unsigned char c[16];
    } got [16], base, expect;
  int pos[16] =
    {
      msk0, msk1, msk2, msk3, msk4, msk5, msk6, msk7,
      msk8, msk9, mskA, mskB, mskC, mskD, mskE, mskF
    };
  unsigned char ins[4] = { 3, 4, 5, 6 };
  int n;

  base.i[0] = 0x35251505;
  base.i[1] = 0x75655545;
  base.i[2] = 0xB5A59585;
  base.i[3] = 0xF5E5D5C5;

  /* Check pinsrb imm8, r32, xmm.  */
  got[0].x = _mm_insert_epi8 (base.x, ins[0], msk0);
  got[1].x = _mm_insert_epi8 (base.x, ins[0], msk1);
  got[2].x = _mm_insert_epi8 (base.x, ins[0], msk2);
  got[3].x = _mm_insert_epi8 (base.x, ins[0], msk3);
  got[4].x = _mm_insert_epi8 (base.x, ins[0], msk4);
  got[5].x = _mm_insert_epi8 (base.x, ins[0], msk5);
  got[6].x = _mm_insert_epi8 (base.x, ins[0], msk6);
  got[7].x = _mm_insert_epi8 (base.x, ins[0], msk7);
  got[8].x = _mm_insert_epi8 (base.x, ins[0], msk8);
  got[9].x = _mm_insert_epi8 (base.x, ins[0], msk9);
  got[10].x = _mm_insert_epi8 (base.x, ins[0], mskA);
  got[11].x = _mm_insert_epi8 (base.x, ins[0], mskB);
  got[12].x = _mm_insert_epi8 (base.x, ins[0], mskC);
  got[13].x = _mm_insert_epi8 (base.x, ins[0], mskD);
  got[14].x = _mm_insert_epi8 (base.x, ins[0], mskE);
  got[15].x = _mm_insert_epi8 (base.x, ins[0], mskF);

  for (n = 0; n < 16; n++)
    {
      expect.x = base.x;
      expect.c[pos[n]] = ins[0];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }

  /* Check pinsrb imm8, m8, xmm.  */
  for (n = 0; n < 16; n++)
    {
      got[n].x = _mm_insert_epi8 (base.x, ins[n % 4], msk0);
      pos[n] = msk0;
    }

  for (n = 0; n < 16; n++)
    {
      expect.x = base.x;
      expect.c[pos[n]] = ins[n % 4];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define msk0 0x00
#define msk1 0x01
#define msk2 0x02
#define msk3 0x03
/* Exercise _mm_insert_epi32 in two ways: a fixed value inserted at each
   of the four positions, then a varying value inserted at position 0.
   Each result is compared with a copy of the source vector patched by
   hand.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned int i[4];
    } got [4], base, expect;
  static unsigned int ins[4] = { 3, 4, 5, 6 };
  int pos[4] = { msk0, msk1, msk2, msk3 };
  int n;

  for (n = 0; n < 4; n++)
    base.i[n] = 55;

  /* Check pinsrd imm8, r32, xmm.  */
  got[0].x = _mm_insert_epi32 (base.x, ins[0], msk0);
  got[1].x = _mm_insert_epi32 (base.x, ins[0], msk1);
  got[2].x = _mm_insert_epi32 (base.x, ins[0], msk2);
  got[3].x = _mm_insert_epi32 (base.x, ins[0], msk3);

  for (n = 0; n < 4; n++)
    {
      expect.x = base.x;
      expect.i[pos[n]] = ins[0];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }

  /* Check pinsrd imm8, m32, xmm.  */
  for (n = 0; n < 4; n++)
    {
      got[n].x = _mm_insert_epi32 (base.x, ins[n], msk0);
      pos[n] = msk0;
    }

  for (n = 0; n < 4; n++)
    {
      expect.x = base.x;
      expect.i[pos[n]] = ins[n];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }
}
/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <string.h>
#define msk0 0x00
#define msk1 0x01
/* Exercise _mm_insert_epi64 in two ways: a fixed value inserted at each
   of the two positions, then a varying value inserted at position 0.
   Each result is compared with a copy of the source vector patched by
   hand.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned long long ll[2];
    } got [4], base, expect;
  int pos[4];
  static unsigned long long ins[2] =
    { 0xAABBAABBAABBAABBLL, 0xCCDDCCDDCCDDCCDDLL };
  int n;

  base.ll[0] = 0x0807060504030201LL;
  base.ll[1] = 0x100F0E0D0C0B0A09LL;

  /* Check pinsrq imm8, r64, xmm.  */
  got[0].x = _mm_insert_epi64 (base.x, ins[0], msk0);
  pos[0] = msk0;
  got[1].x = _mm_insert_epi64 (base.x, ins[0], msk1);
  pos[1] = msk1;

  for (n = 0; n < 2; n++)
    {
      expect.x = base.x;
      expect.ll[pos[n]] = ins[0];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }

  /* Check pinsrq imm8, m64, xmm.  */
  for (n = 0; n < 2; n++)
    {
      got[n].x = _mm_insert_epi64 (base.x, ins[n], msk0);
      pos[n] = msk0;
    }

  for (n = 0; n < 2; n++)
    {
      expect.x = base.x;
      expect.ll[pos[n]] = ins[n];
      if (memcmp (&expect, &got[n], sizeof (expect)) != 0)
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 1024
/* Compare _mm_max_epi8 with a scalar per-element maximum computed on
   the same char data, whose generated values alternate in sign.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 16];
      char i[NUM];
    } out, lhs, rhs;
  int k, v, sign = 1;
  char expect;

  for (k = 0; k < NUM; k++, sign = -sign)
    {
      lhs.i[k] = k * k * sign;
      rhs.i[k] = (k + 20) * sign;
    }

  /* Sixteen chars per vector.  */
  for (v = 0; v < NUM / 16; v++)
    out.x[v] = _mm_max_epi8 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect <= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_max_epi32 with a scalar per-element maximum computed on
   the same int data, whose generated values alternate in sign.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
    } out, lhs, rhs;
  int k, v, sign = 1;
  int expect;

  for (k = 0; k < NUM; k++, sign = -sign)
    {
      lhs.i[k] = k * k * sign;
      rhs.i[k] = (k + 20) * sign;
    }

  /* Four ints per vector.  */
  for (v = 0; v < NUM / 4; v++)
    out.x[v] = _mm_max_epi32 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect <= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_max_epu32 with a scalar per-element unsigned maximum.
   Some second-operand elements get their top bit set so that a signed
   comparison would give a different answer.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
    } out, lhs, rhs;
  int k, v;
  unsigned int expect;

  for (k = 0; k < NUM; k++)
    {
      lhs.i[k] = k * k;
      rhs.i[k] = k + 20;
      /* Leave the first element of every group of four untouched.  */
      if ((k % 4) != 0)
        rhs.i[k] |= 0x80000000;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x[v] = _mm_max_epu32 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect <= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_max_epu16 with a scalar per-element unsigned maximum.
   Some second-operand elements get their top bit set so that a signed
   comparison would give a different answer.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 8];
      unsigned short i[NUM];
    } out, lhs, rhs;
  int k, v;
  unsigned short expect;

  for (k = 0; k < NUM; k++)
    {
      lhs.i[k] = k * k;
      rhs.i[k] = k + 20;
      /* Leave the first element of every group of eight untouched.  */
      if ((k % 8) != 0)
        rhs.i[k] |= 0x8000;
    }

  for (v = 0; v < NUM / 8; v++)
    out.x[v] = _mm_max_epu16 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect <= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 1024
/* Compare _mm_min_epi8 with a scalar per-element minimum computed on
   the same char data, whose generated values alternate in sign.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 16];
      char i[NUM];
    } out, lhs, rhs;
  int k, v, sign = 1;
  char expect;

  for (k = 0; k < NUM; k++, sign = -sign)
    {
      lhs.i[k] = k * k * sign;
      rhs.i[k] = (k + 20) * sign;
    }

  /* Sixteen chars per vector.  */
  for (v = 0; v < NUM / 16; v++)
    out.x[v] = _mm_min_epi8 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect >= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_min_epi32 with a scalar per-element minimum computed on
   the same int data, whose generated values alternate in sign.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
    } out, lhs, rhs;
  int k, v, sign = 1;
  int expect;

  for (k = 0; k < NUM; k++, sign = -sign)
    {
      lhs.i[k] = k * k * sign;
      rhs.i[k] = (k + 20) * sign;
    }

  /* Four ints per vector.  */
  for (v = 0; v < NUM / 4; v++)
    out.x[v] = _mm_min_epi32 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect >= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_min_epu32 with a scalar per-element unsigned minimum.
   Some second-operand elements get their top bit set so that a signed
   comparison would give a different answer.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
    } out, lhs, rhs;
  int k, v;
  unsigned int expect;

  for (k = 0; k < NUM; k++)
    {
      lhs.i[k] = k * k;
      rhs.i[k] = k + 20;
      /* Leave the first element of every group of four untouched.  */
      if ((k % 4) != 0)
        rhs.i[k] |= 0x80000000;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x[v] = _mm_min_epu32 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect >= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_min_epu16 with a scalar per-element unsigned minimum.
   Some second-operand elements get their top bit set so that a signed
   comparison would give a different answer.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 8];
      unsigned short i[NUM];
    } out, lhs, rhs;
  int k, v;
  unsigned short expect;

  for (k = 0; k < NUM; k++)
    {
      lhs.i[k] = k * k;
      rhs.i[k] = k + 20;
      /* Leave the first element of every group of eight untouched.  */
      if ((k % 8) != 0)
        rhs.i[k] |= 0x8000;
    }

  for (v = 0; v < NUM / 8; v++)
    out.x[v] = _mm_min_epu16 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k];
      if (expect >= rhs.i[k])
        expect = rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the four low chars of each vector with _mm_cvtepi8_epi32 and
   compare every resulting int with the char it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
      char c[NUM * 4];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in char (k % 4) of 16-char vector k / 4.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 4) + (k / 4) * 16;
      in.c[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x [v] = _mm_cvtepi8_epi32 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 16;
      if (in.c[at] != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low chars of each vector with _mm_cvtepi8_epi64 and
   compare every resulting long long with the char it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      long long ll[NUM];
      char c[NUM * 8];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in char (k % 2) of 16-char vector k / 2.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 2) + (k / 2) * 16;
      in.c[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepi8_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 16;
      if (in.c[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the eight low chars of each vector with _mm_cvtepi8_epi16 and
   compare every resulting short with the char it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 8];
      short s[NUM];
      char c[NUM * 2];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in char (k % 8) of 16-char vector k / 8.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 8) + (k / 8) * 16;
      in.c[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 8; v++)
    out.x [v] = _mm_cvtepi8_epi16 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 8) + (k / 8) * 16;
      if (in.c[at] != out.s[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low ints of each vector with _mm_cvtepi32_epi64 and
   compare every resulting long long with the int it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      long long ll[NUM];
      int i[NUM * 2];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in int (k % 2) of 4-int vector k / 2.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 2) + (k / 2) * 4;
      in.i[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepi32_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 4;
      if (in.i[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the four low shorts of each vector with _mm_cvtepi16_epi32 and
   compare every resulting int with the short it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
      short s[NUM * 2];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in short (k % 4) of 8-short vector k / 4.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 4) + (k / 4) * 8;
      in.s[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x [v] = _mm_cvtepi16_epi32 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 8;
      if (in.s[at] != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low shorts of each vector with _mm_cvtepi16_epi64 and
   compare every resulting long long with the short it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      long long ll[NUM];
      short s[NUM * 4];
    } out, in;
  int k, v, at, sign = 1;

  /* Element k is stored in short (k % 2) of 8-short vector k / 2.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    {
      at = (k % 2) + (k / 2) * 8;
      in.s[at] = k * k * sign;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepi16_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 8;
      if (in.s[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the four low unsigned chars of each vector with
   _mm_cvtepu8_epi32 and compare every resulting unsigned int with the
   byte it came from.  Some source bytes have their top bit set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
      unsigned char c[NUM * 4];
    } out, in;
  int k, v, at;

  /* Element k is stored in byte (k % 4) of 16-byte vector k / 4.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 16;
      in.c[at] = k * k;
      if ((k % 4) != 0)
        in.c[at] |= 0x80;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x [v] = _mm_cvtepu8_epi32 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 16;
      if (in.c[at] != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low unsigned chars of each vector with
   _mm_cvtepu8_epi64 and compare every resulting unsigned long long with
   the byte it came from.  Some source bytes have their top bit set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      unsigned long long ll[NUM];
      unsigned char c[NUM * 8];
    } out, in;
  int k, v, at;

  /* Element k is stored in byte (k % 2) of 16-byte vector k / 2.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 16;
      in.c[at] = k * k;
      if ((k % 2) != 0)
        in.c[at] |= 0x80;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepu8_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 16;
      if (in.c[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the eight low unsigned chars of each vector with
   _mm_cvtepu8_epi16 and compare every resulting unsigned short with the
   byte it came from.  Some source bytes have their top bit set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 8];
      unsigned short s[NUM];
      unsigned char c[NUM * 2];
    } out, in;
  int k, v, at;

  /* Element k is stored in byte (k % 8) of 16-byte vector k / 8.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 8) + (k / 8) * 16;
      in.c[at] = k * k;
      /* The top bit is set on every element whose index is not a
         multiple of four.  */
      if ((k % 4) != 0)
        in.c[at] |= 0x80;
    }

  for (v = 0; v < NUM / 8; v++)
    out.x [v] = _mm_cvtepu8_epi16 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 8) + (k / 8) * 16;
      if (in.c[at] != out.s[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low unsigned ints of each vector with
   _mm_cvtepu32_epi64 and compare every resulting unsigned long long
   with the int it came from.  Some source ints have their top bit
   set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      unsigned long long ll[NUM];
      unsigned int i[NUM * 2];
    } out, in;
  int k, v, at;

  /* Element k is stored in int (k % 2) of 4-int vector k / 2.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 4;
      in.i[at] = k * k;
      if ((k % 2) != 0)
        in.i[at] |= 0x80000000;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepu32_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 4;
      if (in.i[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the four low unsigned shorts of each vector with
   _mm_cvtepu16_epi32 and compare every resulting unsigned int with the
   short it came from.  Some source shorts have their top bit set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      unsigned int i[NUM];
      unsigned short s[NUM * 2];
    } out, in;
  int k, v, at;

  /* Element k is stored in short (k % 4) of 8-short vector k / 4.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 8;
      in.s[at] = k * k;
      if ((k % 4) != 0)
        in.s[at] |= 0x8000;
    }

  for (v = 0; v < NUM / 4; v++)
    out.x [v] = _mm_cvtepu16_epi32 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 4) + (k / 4) * 8;
      if (in.s[at] != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 128
/* Widen the two low unsigned shorts of each vector with
   _mm_cvtepu16_epi64 and compare every resulting unsigned long long
   with the short it came from.  Some source shorts have their top bit
   set.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      unsigned long long ll[NUM];
      unsigned short s[NUM * 4];
    } out, in;
  int k, v, at;

  /* Element k is stored in short (k % 2) of 8-short vector k / 2.  */
  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 8;
      in.s[at] = k * k;
      if ((k % 2) != 0)
        in.s[at] |= 0x8000;
    }

  for (v = 0; v < NUM / 2; v++)
    out.x [v] = _mm_cvtepu16_epi64 (in.x [v]);

  for (k = 0; k < NUM; k++)
    {
      at = (k % 2) + (k / 2) * 8;
      if (in.s[at] != out.ll[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Check _mm_mul_epi32: each 64-bit result must equal the full signed
   product of the even-indexed ints of the two source vectors.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 2];
      long long ll[NUM];
    } dst;
  union
    {
      __m128i x[NUM / 2];
      int i[NUM * 2];
    } src1, src2;
  int i, sign = 1;
  long long value;

  /* The sources hold NUM * 2 ints and the check below reads every even
     index up to 2 * NUM - 2, so the whole array must be initialised;
     stopping at NUM would leave the upper half of the compared inputs
     indeterminate.  */
  for (i = 0; i < NUM * 2; i += 2)
    {
      src1.i[i] = i * i * sign;
      src2.i[i] = (i + 20) * sign;
      /* The odd ints take no part in the expected products; give them
         defined values that differ from their even neighbours so the
         vector loads read no indeterminate storage.  */
      src1.i[i + 1] = ~src1.i[i];
      src2.i[i + 1] = ~src2.i[i];
      sign = -sign;
    }

  for (i = 0; i < NUM; i += 2)
    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);

  for (i = 0; i < NUM; i++)
    {
      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
      if (value != dst.ll[i])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#define NUM 64
/* Compare _mm_mullo_epi32 with a scalar int multiplication of the same
   operands, whose generated values alternate in sign.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
    } out, lhs, rhs;
  int k, v, sign = 1;
  int expect;

  for (k = 0; k < NUM; k++, sign = -sign)
    {
      lhs.i[k] = k * k * sign;
      rhs.i[k] = (k + 20) * sign;
    }

  /* Four ints per vector.  */
  for (v = 0; v < NUM / 4; v++)
    out.x[v] = _mm_mullo_epi32 (lhs.x[v], rhs.x[v]);

  for (k = 0; k < NUM; k++)
    {
      expect = lhs.i[k] * rhs.i[k];
      if (expect != out.i[k])
        abort ();
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
/* Scalar reference for the testz check: return 1 when M and V share no
   set bit in any of their sixteen bytes, 0 otherwise.  */
static int
make_ptestz (__m128i m, __m128i v)
{
  union
    {
      __m128i x;
      unsigned char c[16];
    } lhs, rhs;
  int byte;

  lhs.x = m;
  rhs.x = v;

  /* A single overlapping bit settles the answer.  */
  for (byte = 0; byte < 16; byte++)
    {
      if ((lhs.c[byte] & rhs.c[byte]) != 0)
        return 0;
    }

  return 1;
}
/* Scalar reference for the testc check: return 1 when every bit set in
   V is also set in M (V AND NOT M is zero in all sixteen bytes),
   0 otherwise.  */
static int
make_ptestc (__m128i m, __m128i v)
{
  union
    {
      __m128i x;
      unsigned char c[16];
    } lhs, rhs;
  int byte;

  lhs.x = m;
  rhs.x = v;

  /* A single bit of V outside M settles the answer.  */
  for (byte = 0; byte < 16; byte++)
    {
      if ((rhs.c[byte] & ~lhs.c[byte]) != 0)
        return 0;
    }

  return 1;
}
/* Run _mm_testz_si128 and _mm_testc_si128 over every ordered pair of
   four bit patterns and compare each result with the scalar references
   make_ptestz and make_ptestc.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned int i[4];
    } val[4];
  const unsigned int pattern[4][4] =
    {
      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
      { 0, 0, 0, 0 },
      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
    };
  int res[32];
  int a, b, slot;

  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      val[a].i[b] = pattern[a][b];

  /* Results are stored pairwise: testz at the even slot, testc at the
     odd slot that follows it.  */
  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      {
        slot = 2 * (4 * a + b);
        res[slot] = _mm_testz_si128 (val[b].x, val[a].x);
        res[slot + 1] = _mm_testc_si128 (val[b].x, val[a].x);
      }

  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      {
        slot = 2 * (4 * a + b);
        if (res[slot] != make_ptestz (val[b].x, val[a].x))
          abort ();
        if (res[slot + 1] != make_ptestc (val[b].x, val[a].x))
          abort ();
      }

  /* Repeat one pair outside the loops and compare with the stored
     results.  */
  if (res[2] != _mm_testz_si128 (val[1].x, val[0].x))
    abort ();
  if (res[3] != _mm_testc_si128 (val[1].x, val[0].x))
    abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
/* Scalar reference for the testnzc check: return 1 when V has at least
   one bit in common with M and also at least one bit outside M,
   0 otherwise.  All sixteen bytes are examined.  */
static int
make_ptestnzc (__m128i m, __m128i v)
{
  union
    {
      __m128i x;
      unsigned char c[16];
    } lhs, rhs;
  int byte;
  int shares_bit = 0;
  int has_extra_bit = 0;

  lhs.x = m;
  rhs.x = v;

  for (byte = 0; byte < 16; byte++)
    {
      shares_bit |= (lhs.c[byte] & rhs.c[byte]) != 0;
      has_extra_bit |= (~lhs.c[byte] & rhs.c[byte]) != 0;
    }

  return shares_bit && has_extra_bit;
}
/* Run _mm_testnzc_si128 over every ordered pair of four bit patterns
   (twice per pair) and compare each result with the scalar reference
   make_ptestnzc.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned int i[4];
    } val[4];
  const unsigned int pattern[4][4] =
    {
      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
      { 0, 0, 0, 0 },
      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
    };
  int res[32];
  int a, b, slot;

  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      val[a].i[b] = pattern[a][b];

  /* Two result slots per pair, both filled by the same intrinsic.  */
  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      {
        slot = 2 * (4 * a + b);
        res[slot] = _mm_testnzc_si128 (val[b].x, val[a].x);
        res[slot + 1] = _mm_testnzc_si128 (val[b].x, val[a].x);
      }

  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      {
        slot = 2 * (4 * a + b);
        if (res[slot] != make_ptestnzc (val[b].x, val[a].x))
          abort ();
        if (res[slot + 1] != make_ptestnzc (val[b].x, val[a].x))
          abort ();
      }

  /* Repeat one pair outside the loops and compare with the stored
     results.  */
  if (res[2] != _mm_testnzc_si128 (val[1].x, val[0].x))
    abort ();
  if (res[3] != _mm_testnzc_si128 (val[1].x, val[0].x))
    abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
/* Check _mm_test_all_zeros, _mm_test_all_ones and
   _mm_test_mix_ones_zeros on four bit patterns against hand-written
   expected results.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x;
      unsigned int i[4];
    } val[4];
  const unsigned int pattern[4][4] =
    {
      { 0x11111111, 0x00000000, 0x00000000, 0x11111111 },
      { 0x00000000, 0x11111111, 0x11111111, 0x00000000 },
      { 0, 0, 0, 0 },
      { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
    };
  /* Expected answers, indexed like PATTERN.  */
  const int correct_zeros[4] = { 0, 0, 1, 0 };
  const int correct_ones[4] = { 0, 0, 0, 1 };
  const int correct_mixed[4] = { 1, 1, 0, 0 };
  int zeros[4];
  int ones[4];
  int mixed[4];
  int a, b;
  __m128i all_set;

  for (a = 0; a < 4; a++)
    for (b = 0; b < 4; b++)
      val[a].i[b] = pattern[a][b];

  for (a = 0; a < 4; a++)
    zeros[a] = _mm_test_all_zeros (val[a].x, val[a].x);

  for (a = 0; a < 4; a++)
    ones[a] = _mm_test_all_ones (val[a].x);

  /* Comparing a vector with itself yields the second operand used for
     the mixed test.  */
  all_set = _mm_cmpeq_epi32 (val[0].x, val[0].x);
  for (a = 0; a < 4; a++)
    mixed[a] = _mm_test_mix_ones_zeros (val[a].x, all_set);

  for (a = 0; a < 4; a++)
    {
      if (zeros[a] != correct_zeros[a])
        abort ();
      if (ones[a] != correct_ones[a])
        abort ();
      if (mixed[a] != correct_mixed[a])
        abort ();
    }
}
#include <smmintrin.h>
#include <math.h>
#define NUM 64
/* Fill SRC[0 .. NUM-1] with sign-alternating test values derived from
   rand ().  The scale factor is re-derived at every index divisible by
   six and re-seeded from rand () at the midpoint.  */
static void
init_round (FP_T *src)
{
  int idx, sign = 1;
  FP_T scale = rand ();

  for (idx = 0; idx < NUM; idx++, sign = -sign)
    {
      src[idx] = (idx + 1)* scale * M_PI * sign;

      if (idx == (NUM / 2))
        scale = rand ();
      else if ((idx % 6) == 0)
        {
          /* First half: grow the scale by the value just stored.
             Second half: take a reciprocal instead.  */
          if (idx < (NUM / 2))
            scale = scale * src[idx];
          else
            scale = 1 / (scale * (idx + 1) * src[idx] * M_PI *sign);
        }
    }
}
/* Reference rounding helper: round F to an integral value with the x87
   frndint instruction and return the result.

   TYPE selects how the FPU control word is set up for the rounding:
   when bit 2 (value 4) is set the saved control word is reused
   unchanged; otherwise the low two bits of TYPE are placed in
   control-word bits 10-11 and control-word bits 0-5 are set.
   NOTE(review): bits 10-11 are presumably the x87 rounding-control
   field and bits 0-5 the exception masks -- confirm against the Intel
   SDM.

   NOTE(review): the value pushed by the first asm is consumed by the
   later frndint/fstp asm; the operand constraints do not express that
   dependency, so the statements must stay in this order.  */
static FP_T
do_round (FP_T f, int type)
{
short saved_cw, new_cw, clr_mask;
FP_T ret;
if ((type & 4))
{
/* Keep every control-word bit and OR in nothing.  */
type = 0;
clr_mask = 0xFFFF;
}
else
{
/* Clear bits 0-5 and 10-11, then install the requested bits.  */
type = 0x003F | ((type & 3) << 10);
clr_mask = ~0x0C3F;
}
/* Push F onto the x87 stack.  */
__asm__ ("fld" ASM_SUFFIX " %0" : : "m" (*&f));
/* Save the current control word so it can be restored afterwards.  */
__asm__ ("fstcw %0" : "=m" (*&saved_cw));
new_cw = saved_cw & clr_mask;
new_cw |= type;
__asm__ ("fldcw %0" : : "m" (*&new_cw));
/* Round the stack top and pop it into RET.  */
__asm__ ("frndint\n"
"fstp" ASM_SUFFIX " %0\n" : "=m" (*&ret));
/* Restore the caller's control word.  */
__asm__ ("fldcw %0" : : "m" (*&saved_cw));
return ret;
}
/* Generic rounding test driver.  The including file supplies VEC_T,
   FP_T, ROUND_INTRIN, ROUND_MODE, CHECK_ROUND_MODE, LOOP_INCREMENT and
   CHECK_LOOP_INCREMENT.  The intrinsic is applied to every vector and
   the elements selected by CHECK_LOOP_INCREMENT are compared with
   do_round; finally the _MM_FROUND_* constants are checked against
   their expected numeric values.  */
static void
sse4_1_test (void)
{
  union
    {
      VEC_T x[NUM / LOOP_INCREMENT];
      FP_T f[NUM];
    } out, in;
  FP_T expect;
  int k;

  init_round (in.f);

  for (k = 0; k < NUM / LOOP_INCREMENT; k++)
    out.x[k] = ROUND_INTRIN (in.x[k], ROUND_MODE);

  k = 0;
  while (k < NUM)
    {
      expect = do_round (in.f[k], CHECK_ROUND_MODE);
      if (expect != out.f[k])
        abort ();
      k += CHECK_LOOP_INCREMENT;
    }

  /* Every rounding-mode constant must have exactly this value.  */
  if (!(_MM_FROUND_TO_NEAREST_INT == 0x00
        && _MM_FROUND_TO_NEG_INF == 0x01
        && _MM_FROUND_TO_POS_INF == 0x02
        && _MM_FROUND_TO_ZERO == 0x03
        && _MM_FROUND_CUR_DIRECTION == 0x04
        && _MM_FROUND_RAISE_EXC == 0x00
        && _MM_FROUND_NO_EXC == 0x08
        && _MM_FROUND_NINT == 0x00
        && _MM_FROUND_FLOOR == 0x01
        && _MM_FROUND_CEIL == 0x02
        && _MM_FROUND_TRUNC == 0x03
        && _MM_FROUND_RINT == 0x04
        && _MM_FROUND_NEARBYINT == 0x0C))
    abort ();
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN(x, mode) _mm_ceil_pd(x)
#define ROUND_MODE _MM_FROUND_CEIL
#define CHECK_ROUND_MODE 0x02
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN _mm_round_pd
#define ROUND_MODE _MM_FROUND_NINT
#define CHECK_ROUND_MODE 0x00
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN(x, mode) _mm_floor_pd(x)
#define ROUND_MODE _MM_FROUND_FLOOR
#define CHECK_ROUND_MODE 0x01
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128
#define FP_T float
#define ASM_SUFFIX "s"
#define ROUND_INTRIN(x, mode) _mm_ceil_ps(x)
#define ROUND_MODE _MM_FROUND_CEIL
#define CHECK_ROUND_MODE 0x02
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128
#define FP_T float
#define ASM_SUFFIX "s"
#define ROUND_INTRIN _mm_round_ps
#define ROUND_MODE _MM_FROUND_NINT
#define CHECK_ROUND_MODE 0x00
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128
#define FP_T float
#define ASM_SUFFIX "s"
#define ROUND_INTRIN(x, mode) _mm_floor_ps(x)
#define ROUND_MODE _MM_FROUND_FLOOR
#define CHECK_ROUND_MODE 0x01
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 1
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN(x, mode) _mm_ceil_sd(x, x)
#define ROUND_MODE _MM_FROUND_CEIL
#define CHECK_ROUND_MODE 0x02
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 2
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN(x, mode) _mm_round_sd(x, x, mode)
#define ROUND_MODE _MM_FROUND_NINT
#define CHECK_ROUND_MODE 0x00
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 2
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128d
#define FP_T double
#define ASM_SUFFIX "l"
#define ROUND_INTRIN(x, mode) _mm_floor_sd(x, x)
#define ROUND_MODE _MM_FROUND_FLOOR
#define CHECK_ROUND_MODE 0x01
#define LOOP_INCREMENT 2
#define CHECK_LOOP_INCREMENT 2
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <math.h>
#include <string.h>
#define NUM 64
/* Fill SRC[0 .. NUM-1] with sign-alternating doubles derived from
   rand ().  The scale factor is re-derived at every index divisible by
   six and re-seeded from rand () at the midpoint.  */
static void
init_round (double *src)
{
  int idx, sign = 1;
  double scale = rand ();

  for (idx = 0; idx < NUM; idx++, sign = -sign)
    {
      src[idx] = (idx + 1)* scale * M_PI * sign;

      if (idx == (NUM / 2))
        scale = rand ();
      else if ((idx % 6) == 0)
        {
          /* First half: grow the scale by the value just stored.
             Second half: take a reciprocal instead.  */
          if (idx < (NUM / 2))
            scale = scale * src[idx];
          else
            scale = 1 / (scale * (idx + 1) * src[idx] * M_PI *sign);
        }
    }
}
/* Reference rounding helper: round the double F to an integral value
   with the x87 frndint instruction and return the result.

   TYPE selects how the FPU control word is set up for the rounding:
   when bit 2 (value 4) is set the saved control word is reused
   unchanged; otherwise the low two bits of TYPE are placed in
   control-word bits 10-11 and control-word bits 0-5 are set.
   NOTE(review): bits 10-11 are presumably the x87 rounding-control
   field and bits 0-5 the exception masks -- confirm against the Intel
   SDM.

   NOTE(review): the value pushed by the first asm is consumed by the
   later frndint/fstpl asm; the operand constraints do not express that
   dependency, so the statements must stay in this order.  */
static double
do_round (double f, int type)
{
short saved_cw, new_cw, clr_mask;
double ret;
if ((type & 4))
{
/* Keep every control-word bit and OR in nothing.  */
type = 0;
clr_mask = 0xFFFF;
}
else
{
/* Clear bits 0-5 and 10-11, then install the requested bits.  */
type = 0x003F | ((type & 3) << 10);
clr_mask = ~0x0C3F;
}
/* Push F onto the x87 stack.  */
__asm__ ("fldl %0" : : "m" (*&f));
/* Save the current control word so it can be restored afterwards.  */
__asm__ ("fstcw %0" : "=m" (*&saved_cw));
new_cw = saved_cw & clr_mask;
new_cw |= type;
__asm__ ("fldcw %0" : : "m" (*&new_cw));
/* Round the stack top and pop it into RET.  */
__asm__ ("frndint\n"
"fstpl %0\n" : "=m" (*&ret));
/* Restore the caller's control word.  */
__asm__ ("fldcw %0" : : "m" (*&saved_cw));
return ret;
}
/* Check _mm_round_sd with _MM_FROUND_TRUNC: the low double of each
   result must match do_round in mode 0x03, and the high double must
   still hold the zero taken from the first operand.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128d x[NUM / 2];
      double d[NUM];
    } out, in;
  double expect;
  int k;

  init_round (in.d);

  /* The destination doubles as the first operand, so clear it first.  */
  memset (&out, 0, NUM * sizeof(double));

  for (k = 0; k < NUM / 2 ; k++)
    out.x[k] = _mm_round_sd (out.x[k], in.x[k], _MM_FROUND_TRUNC);

  k = 0;
  while (k < NUM)
    {
      if (out.d[k + 1] != 0.0)
        abort ();

      expect = do_round (in.d[k], 0x03);
      if (expect != out.d[k])
        abort ();

      k += 2;
    }
}
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#define VEC_T __m128
#define FP_T float
#define ASM_SUFFIX "s"
#define ROUND_INTRIN(x, mode) _mm_ceil_ss(x, x)
#define ROUND_MODE _MM_FROUND_CEIL
#define CHECK_ROUND_MODE 0x02
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 4
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
/* Parameter macros for the shared test body in sse4_1-round.h, which is
   included last so that it sees these definitions.  That header is not
   visible here; notes on how it uses each macro are assumptions to
   confirm against it.  This set exercises _mm_round_ss with
   _MM_FROUND_NINT.  */
/* Vector type under test.  */
#define VEC_T __m128
/* Scalar element type matching VEC_T.  */
#define FP_T float
/* Presumably the x87 load/store mnemonic suffix for FP_T ("s" = single)
   -- TODO confirm in sse4_1-round.h.  */
#define ASM_SUFFIX "s"
/* Intrinsic under test.  x appears twice in the expansion, so it must
   not have side effects.  */
#define ROUND_INTRIN(x, mode) _mm_round_ss(x, x, mode)
/* Rounding mode handed to ROUND_INTRIN.  */
#define ROUND_MODE _MM_FROUND_NINT
/* Presumably the x87 rounding-control value for the reference result
   (0x00 = round to nearest) -- TODO confirm in sse4_1-round.h.  */
#define CHECK_ROUND_MODE 0x00
/* Presumably the element stride of the compute and check loops
   (4 floats per __m128) -- TODO confirm in sse4_1-round.h.  */
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 4
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
/* Parameter macros for the shared test body in sse4_1-round.h, which is
   included last so that it sees these definitions.  That header is not
   visible here; notes on how it uses each macro are assumptions to
   confirm against it.  This set exercises _mm_floor_ss.  */
/* Vector type under test.  */
#define VEC_T __m128
/* Scalar element type matching VEC_T.  */
#define FP_T float
/* Presumably the x87 load/store mnemonic suffix for FP_T ("s" = single)
   -- TODO confirm in sse4_1-round.h.  */
#define ASM_SUFFIX "s"
/* Intrinsic under test.  The mode argument is ignored, and x appears
   twice in the expansion, so it must not have side effects.  */
#define ROUND_INTRIN(x, mode) _mm_floor_ss(x, x)
/* Rounding mode handed to ROUND_INTRIN (unused by this expansion).  */
#define ROUND_MODE _MM_FROUND_FLOOR
/* Presumably the x87 rounding-control value for the reference result
   (0x01 = round down) -- TODO confirm in sse4_1-round.h.  */
#define CHECK_ROUND_MODE 0x01
/* Presumably the element stride of the compute and check loops
   (4 floats per __m128) -- TODO confirm in sse4_1-round.h.  */
#define LOOP_INCREMENT 4
#define CHECK_LOOP_INCREMENT 4
#include "sse4_1-round.h"
/* { dg-do run { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -msse4.1" } */
#include "sse4_1-check.h"
#include <smmintrin.h>
#include <math.h>
#include <string.h>
/* Number of float elements that init_round fills and sse4_1_test checks;
   it also sizes both members of the union used in sse4_1_test.  */
#define NUM 64
/* Fill SRC[0 .. NUM-1] with test inputs of alternating sign.

   Element IDX is (IDX + 1) * scale * M_PI, negated for odd IDX.  The
   running scale factor starts as a rand () value and is updated as the
   array is filled:
     - at IDX == NUM / 2 it is replaced by a fresh rand () value;
     - at every other IDX that is a multiple of 6 it is multiplied by the
       element just written (first half), or replaced by the reciprocal
       of scale * (IDX + 1) * element * M_PI * sign (second half).  */
static void
init_round (float *src)
{
  int idx;
  int sgn = 1;
  float scale = rand ();

  /* The sign flips after every element, so it lives in the loop step.  */
  for (idx = 0; idx < NUM; idx++, sgn = -sgn)
    {
      src[idx] = (idx + 1) * scale * M_PI * sgn;

      if (idx == NUM / 2)
        scale = rand ();
      else if (idx % 6 == 0)
        {
          if (idx < NUM / 2)
            scale *= src[idx];
          else
            scale = 1 / (scale * (idx + 1) * src[idx] * M_PI * sgn);
        }
    }
}
/* Round F to an integral value with the x87 FRNDINT instruction.  The
   result is the reference value that the SSE4.1 rounding intrinsic is
   compared against.

   TYPE selects the rounding behaviour:
     - if bit 2 (value 4) is set, the x87 control word is left as it is,
       so the rounding mode currently in effect is used;
     - otherwise the low two bits of TYPE are written to the control
       word's rounding-control field (bits 10-11; x87 encoding: 0 nearest,
       1 down, 2 up, 3 toward zero) and the six exception-mask bits
       (0x003F) are set.

   Returns the rounded value.  The caller's control word is restored
   before returning.  */
static float
do_round (float f, int type)
{
  /* The control word is a 16-bit pattern; unsigned avoids the
     implementation-defined conversion of 0xFFFF to a signed short.  */
  unsigned short saved_cw, new_cw, clr_mask, set_bits;
  float ret;

  if (type & 4)
    {
      set_bits = 0;
      clr_mask = 0xFFFF;
    }
  else
    {
      set_bits = (unsigned short) (0x003F | ((type & 3) << 10));
      clr_mask = (unsigned short) ~0x0C3F;
    }

  __asm__ __volatile__ ("fstcw %0" : "=m" (saved_cw));
  new_cw = (unsigned short) ((saved_cw & clr_mask) | set_bits);

  /* Switch mode, round and restore in ONE asm statement: the compiler is
     free to move separate asm statements apart, and must be told through
     the "t" (st(0)) constraint that the x87 stack is being used.  */
  __asm__ __volatile__ ("fldcw %2\n\t"
                        "frndint\n\t"
                        "fldcw %3"
                        : "=t" (ret)
                        : "0" (f), "m" (new_cw), "m" (saved_cw));
  return ret;
}
/* Check _mm_round_ss against the x87 reference in two passes, first with
   _MM_FROUND_RINT and then with _MM_FROUND_NEARBYINT.

   The destination starts zeroed.  After each pass, for every source
   vector the lowest float must equal do_round of the lowest source float
   and the three upper floats must still be zero; any mismatch calls
   abort ().  Both reference types (0x04 and 0x0c) have bit 2 set, so
   do_round leaves the x87 control word untouched for them.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128 x[NUM / 4];
      float f[NUM];
    } dst, src;
  float expected;
  int vec, base, lane;

  init_round (src.f);
  memset (&dst, 0, NUM * sizeof (float));

  /* Pass 1: _MM_FROUND_RINT.  */
  for (vec = 0; vec < NUM / 4; vec++)
    dst.x[vec] = _mm_round_ss (dst.x[vec], src.x[vec], _MM_FROUND_RINT);

  for (base = 0; base < NUM; base += 4)
    {
      /* Lanes 1..3 come from the (zeroed) first operand.  */
      for (lane = 1; lane <= 3; lane++)
        if (dst.f[base + lane] != 0.0)
          abort ();

      expected = do_round (src.f[base], 0x04);
      if (expected != dst.f[base])
        abort ();
    }

  /* Pass 2: _MM_FROUND_NEARBYINT, reusing the destination from pass 1.  */
  for (vec = 0; vec < NUM / 4; vec++)
    dst.x[vec] = _mm_round_ss (dst.x[vec], src.x[vec], _MM_FROUND_NEARBYINT);

  for (base = 0; base < NUM; base += 4)
    {
      for (lane = 1; lane <= 3; lane++)
        if (dst.f[base + lane] != 0.0)
          abort ();

      expected = do_round (src.f[base], 0x0c);
      if (expected != dst.f[base])
        abort ();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment