Commit 972fcc76 by Stuart Hastings

mmintrin.h: Mark vector intrinsics always_inline.

2005-06-29  Stuart Hastings  <stuart@apple.com>

        * gcc/config/i386/mmintrin.h: Mark vector intrinsics always_inline.
        * gcc/config/i386/emmintrin.h: Likewise.
        * gcc/config/i386/pmmintrin.h: Likewise.
        * gcc/config/i386/xmmintrin.h: Likewise.

From-SVN: r101425
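
The change is mechanical but consequential: every static inline intrinsic wrapper in these headers gains __attribute__((__always_inline__)), so each intrinsic is guaranteed to collapse into its underlying __builtin_ia32_* call even when the inliner is otherwise off (for example at -O0), instead of being emitted as a real out-of-line call. A minimal sketch of the effect from the caller's side (illustrative user code, not part of the patch; assumes an SSE2-capable target, e.g. -msse2):

    #include <emmintrin.h>

    /* With always_inline on the wrappers, the three intrinsics below
       reduce to their underlying builtins even at -O0; without the
       attribute, GCC may emit real calls to the static wrappers.  */
    static double
    sum2 (const double *a, const double *b)
    {
      __m128d va = _mm_loadu_pd (a);          /* unaligned loads */
      __m128d vb = _mm_loadu_pd (b);
      double out[2];
      _mm_storeu_pd (out, _mm_add_pd (va, vb));
      return out[0] + out[1];
    }
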
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -48,89 +48,89 @@ typedef __v2df __m128d;
   (((fp1) << 1) | (fp0))
 
 /* Create a vector with element 0 as F and the rest zero.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_set_sd (double __F)
 {
   return __extension__ (__m128d){ __F, 0 };
 }
 
 /* Create a vector with both elements equal to F.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_set1_pd (double __F)
 {
   return __extension__ (__m128d){ __F, __F };
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_set_pd1 (double __F)
 {
   return _mm_set1_pd (__F);
 }
 
 /* Create a vector with the lower value X and upper value W.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_set_pd (double __W, double __X)
 {
   return __extension__ (__m128d){ __X, __W };
 }
 
 /* Create a vector with the lower value W and upper value X.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_setr_pd (double __W, double __X)
 {
   return __extension__ (__m128d){ __W, __X };
 }
 
 /* Create a vector of zeros.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_setzero_pd (void)
 {
   return __extension__ (__m128d){ 0.0, 0.0 };
 }
 
 /* Sets the low DPFP value of A from the low value of B.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_move_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
 }
 
 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_load_pd (double const *__P)
 {
   return *(__m128d *)__P;
 }
 
 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_loadu_pd (double const *__P)
 {
   return __builtin_ia32_loadupd (__P);
 }
 
 /* Create a vector with all two elements equal to *P.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_load1_pd (double const *__P)
 {
   return _mm_set1_pd (*__P);
 }
 
 /* Create a vector with element 0 as *P and the rest zero.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_load_sd (double const *__P)
 {
   return _mm_set_sd (*__P);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_load_pd1 (double const *__P)
 {
   return _mm_load1_pd (__P);
 }
 
 /* Load two DPFP values in reverse order.  The address must be aligned.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_loadr_pd (double const *__P)
 {
   __m128d __tmp = _mm_load_pd (__P);
@@ -138,34 +138,34 @@ _mm_loadr_pd (double const *__P)
 }
 
 /* Store two DPFP values.  The address must be 16-byte aligned.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_store_pd (double *__P, __m128d __A)
 {
   *(__m128d *)__P = __A;
 }
 
 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_store_sd (double *__P, __m128d __A)
 {
   *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
 }
 
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
   *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
@@ -173,240 +173,240 @@ _mm_storeh_pd (double *__P, __m128d __A)
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
 
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_store_pd1 (double *__P, __m128d __A)
 {
   _mm_store1_pd (__P, __A);
 }
 
 /* Store two DPFP values in reverse order.  The address must be aligned.  */
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storer_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
-static __inline long long
+static __inline long long __attribute__((__always_inline__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
 }
 #endif
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
 /* Return pair {sqrt (A[0]), B[1]}.  */
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_min_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_max_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_max_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_and_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_andnot_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_or_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_xor_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpeq_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmplt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmple_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpgt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpge_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpneq_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnle_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpngt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnge_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpord_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpunord_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpeq_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmplt_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmple_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpgt_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
@@ -416,7 +416,7 @@ _mm_cmpgt_sd (__m128d __A, __m128d __B)
                                          __A));
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpge_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
@@ -426,25 +426,25 @@ _mm_cmpge_sd (__m128d __A, __m128d __B)
                                          __A));
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpneq_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnle_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpngt_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
@@ -454,7 +454,7 @@ _mm_cmpngt_sd (__m128d __A, __m128d __B)
                                          __A));
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpnge_sd (__m128d __A, __m128d __B)
 {
   return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
@@ -464,85 +464,85 @@ _mm_cmpnge_sd (__m128d __A, __m128d __B)
                                          __A));
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpord_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cmpunord_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comieq_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comilt_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comile_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comigt_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comige_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_comineq_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomieq_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomilt_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomile_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomigt_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomige_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_ucomineq_sd (__m128d __A, __m128d __B)
 {
   return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
@@ -550,25 +550,25 @@ _mm_ucomineq_sd (__m128d __A, __m128d __B)
 /* Create a vector of Qi, where i is the element number.  */
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set_epi64x (long long __q1, long long __q0)
 {
   return __extension__ (__m128i)(__v2di){ __q0, __q1 };
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set_epi64 (__m64 __q1, __m64 __q0)
 {
   return _mm_set_epi64x ((long long)__q1, (long long)__q0);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
 {
   return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
                short __q3, short __q2, short __q1, short __q0)
 {
@@ -576,7 +576,7 @@ _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
     __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
               char __q11, char __q10, char __q09, char __q08,
              char __q07, char __q06, char __q05, char __q04,
@@ -590,31 +590,31 @@ _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
 /* Set all of the elements of the vector to A.  */
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set1_epi64x (long long __A)
 {
   return _mm_set_epi64x (__A, __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set1_epi64 (__m64 __A)
 {
   return _mm_set_epi64 (__A, __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set1_epi32 (int __A)
 {
   return _mm_set_epi32 (__A, __A, __A, __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set1_epi16 (short __A)
 {
   return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_set1_epi8 (char __A)
 {
   return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
@@ -624,26 +624,26 @@ _mm_set1_epi8 (char __A)
 /* Create a vector of Qi, where i is the element number.
    The parameter order is reversed from the _mm_set_epi* functions.  */
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
 {
   return _mm_set_epi64 (__q1, __q0);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
 {
   return _mm_set_epi32 (__q3, __q2, __q1, __q0);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
                 short __q4, short __q5, short __q6, short __q7)
 {
   return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
                char __q04, char __q05, char __q06, char __q07,
               char __q08, char __q09, char __q10, char __q11,
@@ -655,182 +655,182 @@ _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
 /* Create a vector with element 0 as *P and the rest zero.  */
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_load_si128 (__m128i const *__P)
 {
   return *__P;
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_loadu_si128 (__m128i const *__P)
 {
   return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_loadl_epi64 (__m128i const *__P)
 {
   return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
 }
 
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_store_si128 (__m128i *__P, __m128i __B)
 {
   *__P = __B;
 }
 
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
-static __inline void
+static __inline void __attribute__((__always_inline__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
   *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
 }
 
-static __inline __m64
+static __inline __m64 __attribute__((__always_inline__))
 _mm_movepi64_pi64 (__m128i __B)
 {
   return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_move_epi64 (__m128i __A)
 {
   return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
 }
 
 /* Create a vector of zeros.  */
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_setzero_si128 (void)
 {
   return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtepi32_pd (__m128i __A)
 {
   return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
 }
 
-static __inline __m128
+static __inline __m128 __attribute__((__always_inline__))
 _mm_cvtepi32_ps (__m128i __A)
 {
   return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cvtpd_epi32 (__m128d __A)
 {
   return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
 }
 
-static __inline __m64
+static __inline __m64 __attribute__((__always_inline__))
 _mm_cvtpd_pi32 (__m128d __A)
 {
   return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
 }
 
-static __inline __m128
+static __inline __m128 __attribute__((__always_inline__))
 _mm_cvtpd_ps (__m128d __A)
 {
   return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cvttpd_epi32 (__m128d __A)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
 }
 
-static __inline __m64
+static __inline __m64 __attribute__((__always_inline__))
 _mm_cvttpd_pi32 (__m128d __A)
 {
   return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtpi32_pd (__m64 __A)
 {
   return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cvtps_epi32 (__m128 __A)
 {
   return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cvttps_epi32 (__m128 __A)
 {
   return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtps_pd (__m128 __A)
 {
   return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_cvtsd_si32 (__m128d __A)
 {
   return __builtin_ia32_cvtsd2si ((__v2df) __A);
 }
 
 #ifdef __x86_64__
-static __inline long long
+static __inline long long __attribute__((__always_inline__))
 _mm_cvtsd_si64x (__m128d __A)
 {
   return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
 }
 #endif
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_cvttsd_si32 (__m128d __A)
 {
   return __builtin_ia32_cvttsd2si ((__v2df) __A);
 }
 
 #ifdef __x86_64__
-static __inline long long
+static __inline long long __attribute__((__always_inline__))
 _mm_cvttsd_si64x (__m128d __A)
 {
   return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
 }
 #endif
 
-static __inline __m128
+static __inline __m128 __attribute__((__always_inline__))
 _mm_cvtsd_ss (__m128 __A, __m128d __B)
 {
   return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtsi32_sd (__m128d __A, int __B)
 {
   return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
 }
 
 #ifdef __x86_64__
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtsi64x_sd (__m128d __A, long long __B)
 {
   return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
 }
 #endif
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_cvtss_sd (__m128d __A, __m128 __B)
 {
   return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
@@ -838,253 +838,253 @@ _mm_cvtss_sd (__m128d __A, __m128 __B)
 #define _mm_shuffle_pd(__A, __B, __C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (__C)))
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_unpackhi_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_unpacklo_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
   return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
 }
 
-static __inline __m128d
+static __inline __m128d __attribute__((__always_inline__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
   return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
 }
 
-static __inline int
+static __inline int __attribute__((__always_inline__))
 _mm_movemask_pd (__m128d __A)
 {
   return __builtin_ia32_movmskpd ((__v2df)__A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_packs_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_packs_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_packus_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_add_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_add_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_add_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_adds_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_adds_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_adds_epu8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_adds_epu16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sub_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sub_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sub_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sub_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_subs_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_subs_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_subs_epu8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_subs_epu16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_madd_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_mullo_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m64
+static __inline __m64 __attribute__((__always_inline__))
 _mm_mul_su32 (__m64 __A, __m64 __B)
 {
   return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srai_epi16 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srai_epi32 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
@@ -1109,145 +1109,145 @@ _mm_srli_si128 (__m128i __A, const int __B)
   ((__m128i)__builtin_ia32_pslldqi128 (__A, (__B) * 8))
 #endif
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
   return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sll_epi16 (__m128i __A, __m128i __B)
 {
   return _mm_slli_epi16 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sll_epi32 (__m128i __A, __m128i __B)
 {
   return _mm_slli_epi32 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sll_epi64 (__m128i __A, __m128i __B)
 {
   return _mm_slli_epi64 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sra_epi16 (__m128i __A, __m128i __B)
 {
   return _mm_srai_epi16 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_sra_epi32 (__m128i __A, __m128i __B)
 {
   return _mm_srai_epi32 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srl_epi16 (__m128i __A, __m128i __B)
 {
   return _mm_srli_epi16 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srl_epi32 (__m128i __A, __m128i __B)
 {
   return _mm_srli_epi32 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_srl_epi64 (__m128i __A, __m128i __B)
 {
   return _mm_srli_epi64 (__A, _mm_cvtsi128_si32 (__B));
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_and_si128 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_andnot_si128 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_or_si128 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_xor_si128 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
-static __inline __m128i
+static __inline __m128i __attribute__((__always_inline__))
 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
...@@ -1272,37 +1272,37 @@ _mm_insert_epi16 (__m128i const __A, int const __D, int const __N) ...@@ -1272,37 +1272,37 @@ _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
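/* [Editor's sketch -- not part of the patch.]  The comparison intrinsics
   above return all-ones or all-zeros per element, so combined with the
   logical intrinsics they give a branchless select; SSE2 has no signed
   32-bit max instruction, so this is the usual idiom.  */
#if 0
#include <emmintrin.h>
#include <stdio.h>
static __m128i
max_epi32 (__m128i a, __m128i b)
{
  __m128i gt = _mm_cmpgt_epi32 (a, b);        /* ~0 where a > b */
  return _mm_or_si128 (_mm_and_si128 (gt, a),
                       _mm_andnot_si128 (gt, b));
}
int
main (void)
{
  int r[4];
  _mm_storeu_si128 ((__m128i *) r,
                    max_epi32 (_mm_set_epi32 (1, 7, -3, 4),
                               _mm_set_epi32 (2, 5, -1, 4)));
  printf ("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  /* 4 -1 7 2 */
  return 0;
}
#endif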
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
#endif
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}
static __inline int __attribute__((__always_inline__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}
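/* [Editor's example -- not part of the patch.]  _mm_movemask_epi8 packs
   the sign bit of each byte, so with _mm_cmpeq_epi8 it yields a 16-bit
   bitmap of matching byte positions -- the core of SIMD memchr/strchr
   loops.  */
#if 0
#include <emmintrin.h>
#include <stdio.h>
int
main (void)
{
  const char buf[16] = "find the comma,x";
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) buf);
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, _mm_set1_epi8 (',')));
  if (mask)
    printf ("first ',' at offset %d\n", __builtin_ctz (mask));  /* 14 */
  return 0;
}
#endif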
static __inline __m128i __attribute__((__always_inline__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
@@ -1312,74 +1312,74 @@ _mm_mulhi_epu16 (__m128i __A, __m128i __B)
#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __B))
#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B))
static __inline void __attribute__((__always_inline__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}
static __inline __m128i __attribute__((__always_inline__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}
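/* [Editor's example -- not part of the patch.]  _mm_sad_epu8 sums the
   absolute byte differences of each 8-byte half into the low 16 bits of
   the corresponding 64-bit lane -- the workhorse of motion-estimation
   kernels.  */
#if 0
#include <emmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m128i s = _mm_sad_epu8 (_mm_set1_epi8 (9), _mm_set1_epi8 (5));
  printf ("%d\n", _mm_cvtsi128_si32 (s));  /* 8 * |9 - 5| = 32 */
  return 0;
}
#endif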
static __inline void __attribute__((__always_inline__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}
static __inline void __attribute__((__always_inline__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}
static __inline void __attribute__((__always_inline__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}
static __inline void __attribute__((__always_inline__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}
static __inline void __attribute__((__always_inline__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}
static __inline void __attribute__((__always_inline__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}
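/* [Editor's sketch -- not part of the patch.]  The movnti/movntdq/movntpd
   stores above are non-temporal: they bypass the cache and are weakly
   ordered, so a producer typically streams a buffer and then issues a
   fence before publishing it.  A minimal cache-bypassing copy, assuming
   a 16-byte-aligned destination:  */
#if 0
#include <emmintrin.h>
#include <stddef.h>
static void
stream_copy16 (__m128i *dst, const __m128i *src, size_t nvecs)
{
  size_t i;
  for (i = 0; i < nvecs; i++)
    _mm_stream_si128 (dst + i, _mm_loadu_si128 (src + i));
  _mm_mfence ();  /* order the streamed stores before any flag write */
}
#endif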
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}
#ifdef __x86_64__
static __inline __m128i __attribute__((__always_inline__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif
@@ -1388,37 +1388,37 @@ _mm_cvtsi64x_si128 (long long __A)
/* Casts between various SP, DP, INT vector types. Note that these do no
   conversion of values, they just change the type. */
static __inline __m128 __attribute__((__always_inline__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}
static __inline __m128i __attribute__((__always_inline__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}
static __inline __m128d __attribute__((__always_inline__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}
static __inline __m128i __attribute__((__always_inline__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}
static __inline __m128 __attribute__((__always_inline__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}
static __inline __m128d __attribute__((__always_inline__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
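/* [Editor's example -- not part of the patch.]  Because these casts only
   reinterpret bits, integer masks can be applied to floating-point data;
   e.g. clearing the sign bits gives a two-lane fabs.  All-ones comes from
   comparing a register with itself, shifted right once for 0x7fff...  */
#if 0
#include <emmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m128i ones = _mm_cmpeq_epi32 (_mm_setzero_si128 (), _mm_setzero_si128 ());
  __m128d mask = _mm_castsi128_pd (_mm_srli_epi64 (ones, 1));
  double r[2];
  _mm_storeu_pd (r, _mm_and_pd (_mm_set_pd (-2.5, 3.0), mask));
  printf ("%g %g\n", r[0], r[1]);  /* 3 2.5 */
  return 0;
}
#endif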
gcc/config/i386/mmintrin.h
@@ -42,26 +42,26 @@ typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef char __v8qi __attribute__ ((__vector_size__ (8)));
/* Empty the multimedia state. */
static __inline void __attribute__((__always_inline__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}
static __inline void __attribute__((__always_inline__))
_m_empty (void)
{
  _mm_empty ();
}
/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}
static __inline __m64 __attribute__((__always_inline__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}
@@ -69,14 +69,14 @@ _m_from_int (int __i)
#ifdef __x86_64__
/* Convert I to a __m64 object. */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}
/* Convert I to a __m64 object. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
@@ -84,13 +84,13 @@ _mm_set_pi64x (long long __i)
#endif
/* Convert the lower 32 bits of the __m64 object into an integer. */
static __inline int __attribute__((__always_inline__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}
static __inline int __attribute__((__always_inline__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
@@ -98,7 +98,7 @@ _m_to_int (__m64 __i)
#ifdef __x86_64__
/* Convert the __m64 object into a 64-bit integer. */
static __inline long long __attribute__((__always_inline__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif
@@ -108,13 +108,13 @@ _mm_cvtsi64_si64x (__m64 __i)
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
@@ -123,13 +123,13 @@ _m_packsswb (__m64 __m1, __m64 __m2)
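/* [Editor's example -- not part of the patch.]  A quick check of the
   signed saturation: words outside [-128, 127] clamp to the bounds.
   _mm_empty restores the FP state after MMX use.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m64 a = _mm_set_pi16 (300, -300, 127, -128);
  union { __m64 v; signed char b[8]; } u;
  u.v = _mm_packs_pi16 (a, a);
  _mm_empty ();
  printf ("%d %d %d %d\n", u.b[0], u.b[1], u.b[2], u.b[3]);  /* -128 127 -128 127 */
  return 0;
}
#endif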
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}
@@ -138,13 +138,13 @@ _m_packssdw (__m64 __m1, __m64 __m2)
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
static __inline __m64 __attribute__((__always_inline__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
@@ -152,13 +152,13 @@ _m_packuswb (__m64 __m1, __m64 __m2)
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
@@ -166,13 +166,13 @@ _m_punpckhbw (__m64 __m1, __m64 __m2)
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
@@ -180,13 +180,13 @@ _m_punpckhwd (__m64 __m1, __m64 __m2)
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
@@ -194,13 +194,13 @@ _m_punpckhdq (__m64 __m1, __m64 __m2)
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
@@ -208,13 +208,13 @@ _m_punpcklbw (__m64 __m1, __m64 __m2)
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}
@@ -222,59 +222,59 @@ _m_punpcklwd (__m64 __m1, __m64 __m2)
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
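/* [Editor's example -- not part of the patch.]  Interleaving with a zero
   vector widens unsigned bytes to 16-bit lanes: the low half of the
   result reads back, little-endian, as zero-extended words.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  union { __m64 v; short w[4]; } u;
  u.v = _mm_unpacklo_pi8 (_mm_setr_pi8 (200, 100, 50, 25, 0, 0, 0, 0),
                          _mm_setzero_si64 ());
  _mm_empty ();
  printf ("%d %d %d %d\n", u.w[0], u.w[1], u.w[2], u.w[3]);  /* 200 100 50 25 */
  return 0;
}
#endif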
/* Add the 8-bit values in M1 to the 8-bit values in M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}
/* Add the 32-bit values in M1 to the 32-bit values in M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
/* Add the 64-bit values in M1 to the 64-bit values in M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
}
@@ -282,13 +282,13 @@ _mm_add_si64 (__m64 __m1, __m64 __m2)
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
@@ -296,13 +296,13 @@ _m_paddsb (__m64 __m1, __m64 __m2)
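/* [Editor's example -- not part of the patch.]  Wrapping versus
   saturating byte addition: 100 + 100 wraps to -56 but saturates
   to 127.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m64 a = _mm_set1_pi8 (100);
  union { __m64 v; signed char b[8]; } w, s;
  w.v = _mm_add_pi8 (a, a);   /* wraps modulo 256 */
  s.v = _mm_adds_pi8 (a, a);  /* clamps to 127    */
  _mm_empty ();
  printf ("wrap=%d sat=%d\n", w.b[0], s.b[0]);  /* wrap=-56 sat=127 */
  return 0;
}
#endif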
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
@@ -310,13 +310,13 @@ _m_paddsw (__m64 __m1, __m64 __m2)
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}
@@ -324,59 +324,59 @@ _m_paddusb (__m64 __m1, __m64 __m2)
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}
/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}
/* Subtract the 64-bit value in M2 from the 64-bit value in M1. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
}
@@ -384,13 +384,13 @@ _mm_sub_si64 (__m64 __m1, __m64 __m2)
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}
@@ -398,13 +398,13 @@ _m_psubsb (__m64 __m1, __m64 __m2)
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}
@@ -412,13 +412,13 @@ _m_psubsw (__m64 __m1, __m64 __m2)
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}
@@ -426,13 +426,13 @@ _m_psubusb (__m64 __m1, __m64 __m2)
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
static __inline __m64 __attribute__((__always_inline__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
@@ -441,13 +441,13 @@ _m_psubusw (__m64 __m1, __m64 __m2)
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
static __inline __m64 __attribute__((__always_inline__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
@@ -455,13 +455,13 @@ _m_pmaddwd (__m64 __m1, __m64 __m2)
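/* [Editor's example -- not part of the patch.]  _mm_madd_pi16 as a
   four-term dot product; the two 32-bit partial sums are finished in
   scalar code.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  union { __m64 v; int d[2]; } r;
  r.v = _mm_madd_pi16 (_mm_set_pi16 (4, 3, 2, 1),
                       _mm_set_pi16 (8, 7, 6, 5));  /* { 17, 53 } */
  _mm_empty ();
  printf ("%d\n", r.d[0] + r.d[1]);  /* 1*5 + 2*6 + 3*7 + 4*8 = 70 */
  return 0;
}
#endif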
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
static __inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
@@ -469,226 +469,226 @@ _m_pmulhw (__m64 __m1, __m64 __m2)
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
static __inline __m64 __attribute__((__always_inline__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
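/* [Editor's example -- not part of the patch.]  Combining _mm_mullo_pi16
   and _mm_mulhi_pi16 with the unpacks above reconstructs the full 32-bit
   products of the low two 16-bit lanes.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m64 a = _mm_set_pi16 (0, 0, -2, 1000);
  __m64 b = _mm_set_pi16 (0, 0, 3, 1000);
  union { __m64 v; int d[2]; } r;
  r.v = _mm_unpacklo_pi16 (_mm_mullo_pi16 (a, b),
                           _mm_mulhi_pi16 (a, b));
  _mm_empty ();
  printf ("%d %d\n", r.d[0], r.d[1]);  /* 1000000 -6 */
  return 0;
}
#endif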
/* Shift four 16-bit values in M left by COUNT. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}
/* Shift two 32-bit values in M left by COUNT. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}
/* Shift the 64-bit value in M left by COUNT. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
static __inline __m64 __attribute__((__always_inline__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
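/* [Editor's example -- not part of the patch.]  The sra forms propagate
   the sign bit while the srl forms shift in zeros, which only matters
   for negative lanes.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  __m64 x = _mm_set_pi32 (-8, 8);
  union { __m64 v; int d[2]; } a, l;
  a.v = _mm_srai_pi32 (x, 1);  /* { 4, -4 }         */
  l.v = _mm_srli_pi32 (x, 1);  /* { 4, 0x7ffffffc } */
  _mm_empty ();
  printf ("%d %d %u\n", a.d[0], a.d[1], (unsigned) l.d[1]);
  return 0;
}
#endif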
/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
static __inline __m64 __attribute__((__always_inline__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
}
static __inline __m64 __attribute__((__always_inline__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}
/* Bit-wise AND the 64-bit values in M1 and M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}
@@ -696,39 +696,39 @@ _m_pand (__m64 __m1, __m64 __m2)
/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}
/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}
/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
static __inline __m64 __attribute__((__always_inline__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
@@ -736,25 +736,25 @@ _m_pxor (__m64 __m1, __m64 __m2)
/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}
@@ -762,25 +762,25 @@ _m_pcmpgtb (__m64 __m1, __m64 __m2)
/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}
@@ -788,53 +788,53 @@ _m_pcmpgtw (__m64 __m1, __m64 __m2)
/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}
static __inline __m64 __attribute__((__always_inline__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
/* Creates a 64-bit zero. */
static __inline __m64 __attribute__((__always_inline__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}
/* Creates a vector of two 32-bit values; I0 is least significant. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}
/* Creates a vector of four 16-bit values; W0 is least significant. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}
/* Creates a vector of eight 8-bit values; B0 is least significant. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
@@ -843,19 +843,19 @@ _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
}
/* Similar, but with the arguments in reverse order. */
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}
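/* [Editor's example -- not part of the patch.]  The _mm_set_* builders
   take arguments most-significant first, the _mm_setr_* variants in
   memory order, so these two calls build the same vector.  */
#if 0
#include <mmintrin.h>
#include <stdio.h>
int
main (void)
{
  union { __m64 v; int d[2]; } a, b;
  a.v = _mm_set_pi32 (2, 1);   /* d[0] = 1, d[1] = 2 */
  b.v = _mm_setr_pi32 (1, 2);  /* same layout        */
  _mm_empty ();
  printf ("%d %d / %d %d\n", a.d[0], a.d[1], b.d[0], b.d[1]);
  return 0;
}
#endif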
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}
static __inline __m64 __attribute__((__always_inline__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
@@ -863,21 +863,21 @@ _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
}
/* Creates a vector of two 32-bit values, both elements containing I. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}
/* Creates a vector of four 16-bit values, all elements containing W. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}
/* Creates a vector of eight 8-bit values, all elements containing B. */
static __inline __m64 __attribute__((__always_inline__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}
gcc/config/i386/pmmintrin.h
@@ -44,80 +44,80 @@
#define _MM_GET_DENORMALS_ZERO_MODE() \
  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
static __inline __m128 __attribute__((__always_inline__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y);
}
static __inline __m128 __attribute__((__always_inline__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y);
}
static __inline __m128 __attribute__((__always_inline__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y);
}
static __inline __m128 static __inline __m128 __attribute__((__always_inline__))
_mm_movehdup_ps (__m128 __X) _mm_movehdup_ps (__m128 __X)
{ {
return (__m128) __builtin_ia32_movshdup ((__v4sf)__X); return (__m128) __builtin_ia32_movshdup ((__v4sf)__X);
} }
static __inline __m128 static __inline __m128 __attribute__((__always_inline__))
_mm_moveldup_ps (__m128 __X) _mm_moveldup_ps (__m128 __X)
{ {
return (__m128) __builtin_ia32_movsldup ((__v4sf)__X); return (__m128) __builtin_ia32_movsldup ((__v4sf)__X);
} }
static __inline __m128d static __inline __m128d __attribute__((__always_inline__))
_mm_addsub_pd (__m128d __X, __m128d __Y) _mm_addsub_pd (__m128d __X, __m128d __Y)
{ {
return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y); return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y);
} }
static __inline __m128d static __inline __m128d __attribute__((__always_inline__))
_mm_hadd_pd (__m128d __X, __m128d __Y) _mm_hadd_pd (__m128d __X, __m128d __Y)
{ {
return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y); return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y);
} }
static __inline __m128d static __inline __m128d __attribute__((__always_inline__))
_mm_hsub_pd (__m128d __X, __m128d __Y) _mm_hsub_pd (__m128d __X, __m128d __Y)
{ {
return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y); return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y);
} }
static __inline __m128d static __inline __m128d __attribute__((__always_inline__))
_mm_loaddup_pd (double const *__P) _mm_loaddup_pd (double const *__P)
{ {
return _mm_load1_pd (__P); return _mm_load1_pd (__P);
} }
static __inline __m128d static __inline __m128d __attribute__((__always_inline__))
_mm_movedup_pd (__m128d __X) _mm_movedup_pd (__m128d __X)
{ {
return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
} }
static __inline __m128i static __inline __m128i __attribute__((__always_inline__))
_mm_lddqu_si128 (__m128i const *__P) _mm_lddqu_si128 (__m128i const *__P)
{ {
return (__m128i) __builtin_ia32_lddqu ((char const *)__P); return (__m128i) __builtin_ia32_lddqu ((char const *)__P);
} }
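The horizontal adds make single-register reductions cheap. A usage sketch (not part of the header; assumes compilation with -msse3):

#include <pmmintrin.h>

/* Sum all four floats of __x: hadd(x,x) yields {x0+x1, x2+x3, ...};
   a second hadd collapses that to the full sum in element 0.  */
static float
sum4 (__m128 __x)
{
  float __r;
  __m128 __t = _mm_hadd_ps (__x, __x);
  __t = _mm_hadd_ps (__t, __t);
  _mm_store_ss (&__r, __t);
  return __r;
}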
#if 0
static __inline void __attribute__((__always_inline__))
_mm_monitor (void const * __P, unsigned int __E, unsigned int __H)
{
  __builtin_ia32_monitor (__P, __E, __H);
}

static __inline void __attribute__((__always_inline__))
_mm_mwait (unsigned int __E, unsigned int __H)
{
  __builtin_ia32_mwait (__E, __H);
}
#endif
gcc/config/i386/xmmintrin.h
@@ -87,7 +87,7 @@ enum _mm_hint
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
@@ -97,55 +97,55 @@ _mm_setzero_ps (void)
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
@@ -153,55 +153,55 @@ _mm_max_ss (__m128 __A, __m128 __B)
}
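A note on the _ss forms: only element 0 is computed, elements 1-3 pass through from A, so they are the natural way to do scalar math in SSE registers. A usage sketch (not part of the header):

#include <xmmintrin.h>

/* Hardware single-precision square root of one float.  */
static float
sqrt_one (float __x)
{
  float __r;
  _mm_store_ss (&__r, _mm_sqrt_ss (_mm_set_ss (__x)));
  return __r;
}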
/* Perform the respective operation on the four SPFP values in A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
@@ -209,25 +209,25 @@ _mm_max_ps (__m128 __A, __m128 __B)
}
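_mm_rcp_ps and _mm_rsqrt_ps are fast approximations (roughly 12 bits of precision), usually paired with one Newton-Raphson step. A sketch (not part of the header):

#include <xmmintrin.h>

/* Approximate 1/x, refined once: r' = r * (2 - x*r) roughly doubles
   the number of correct bits.  */
static __m128
fast_recip (__m128 __x)
{
  __m128 __r = _mm_rcp_ps (__x);
  return _mm_mul_ps (__r, _mm_sub_ps (_mm_set1_ps (2.0f),
                                      _mm_mul_ps (__x, __r)));
}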
/* Perform logical bit-wise operations on 128-bit values.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
@@ -237,25 +237,25 @@ _mm_xor_ps (__m128 __A, __m128 __B)
}
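Note that _mm_andnot_ps computes ~A & B (the first operand is complemented). The classic use is sign-bit manipulation; a sketch (not part of the header):

#include <xmmintrin.h>

/* |x| for four floats at once: clear each sign bit.  -0.0f is the
   0x80000000 bit pattern in every lane.  */
static __m128
abs_ps (__m128 __x)
{
  return _mm_andnot_ps (_mm_set1_ps (-0.0f), __x);
}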
/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
@@ -265,7 +265,7 @@ _mm_cmpgt_ss (__m128 __A, __m128 __B)
					__A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
@@ -275,25 +275,25 @@ _mm_cmpge_ss (__m128 __A, __m128 __B)
					__A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
@@ -303,7 +303,7 @@ _mm_cmpngt_ss (__m128 __A, __m128 __B)
					__A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
@@ -313,13 +313,13 @@ _mm_cmpnge_ss (__m128 __A, __m128 __B)
					__A));
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}
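These compares return a bit mask, not a 0/1 value: lane 0 becomes all ones or all zeros, and lanes 1-3 come from A. To branch on one, route the mask through _mm_movemask_ps (defined later in this header). A sketch (not part of the header):

#include <xmmintrin.h>

/* 1 if the low floats satisfy a < b, else 0.  */
static int
lt_low (__m128 __a, __m128 __b)
{
  return _mm_movemask_ps (_mm_cmplt_ss (__a, __b)) & 1;
}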
@@ -329,73 +329,73 @@ _mm_cmpunord_ss (__m128 __A, __m128 __B)
/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
@@ -404,73 +404,73 @@ _mm_cmpunord_ps (__m128 __A, __m128 __B)
}
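The packed compare masks combine with the logical operations above into branch-free selects. A sketch (ignoring NaN corner cases, not part of the header):

#include <xmmintrin.h>

/* Per-element (a < b ? a : b), built only from compare/and/andnot/or;
   equivalent in spirit to _mm_min_ps.  */
static __m128
select_min (__m128 __a, __m128 __b)
{
  __m128 __m = _mm_cmplt_ps (__a, __b);       /* all-ones where a < b */
  return _mm_or_ps (_mm_and_ps (__m, __a),
                    _mm_andnot_ps (__m, __b));
}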
/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */
static __inline int __attribute__((__always_inline__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

static __inline int __attribute__((__always_inline__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
@@ -478,13 +478,13 @@ _mm_ucomineq_ss (__m128 __A, __m128 __B)
}
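Unlike the mask-producing compares, these return a plain int, so they are the natural form for branching. The comi/ucomi split mirrors the underlying comiss/ucomiss instructions: the comi forms raise the invalid-operation exception even for quiet NaNs, the ucomi forms only for signaling NaNs. A sketch (not part of the header):

#include <xmmintrin.h>

/* Scalar max via a direct branch on the comparison result.  */
static float
max_scalar (float __x, float __y)
{
  return _mm_comigt_ss (_mm_set_ss (__x), _mm_set_ss (__y)) ? __x : __y;
}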
/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
static __inline int __attribute__((__always_inline__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
@@ -493,7 +493,7 @@ _mm_cvt_ss2si (__m128 __A)
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 64-bit integer according to the current
   rounding mode.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
@@ -502,26 +502,26 @@ _mm_cvtss_si64x (__m128 __A)
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
static __inline int __attribute__((__always_inline__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

static __inline int __attribute__((__always_inline__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
@@ -529,7 +529,7 @@ _mm_cvtt_ss2si (__m128 __A)
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 64-bit integer.  */
static __inline long long __attribute__((__always_inline__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
@@ -538,26 +538,26 @@ _mm_cvttss_si64x (__m128 __A)
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

static __inline __m64 __attribute__((__always_inline__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}
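The plain conversions honor the MXCSR rounding mode (round-to-nearest by default), while the tt forms always truncate toward zero, matching C cast semantics. A sketch (not part of the header):

#include <xmmintrin.h>

/* With default rounding, 2.75f converts to 3 rounded, 2 truncated.  */
static void
round_vs_trunc (int *__rounded, int *__truncated)
{
  __m128 __v = _mm_set_ss (2.75f);
  *__rounded = _mm_cvtss_si32 (__v);     /* 3 */
  *__truncated = _mm_cvttss_si32 (__v);  /* 2 */
}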
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
@@ -565,7 +565,7 @@ _mm_cvt_si2ss (__m128 __A, int __B)
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
@@ -574,20 +574,20 @@ _mm_cvtsi64x_ss (__m128 __A, long long __B)
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
@@ -613,7 +613,7 @@ _mm_cvtpi16_ps (__m64 __A)
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
@@ -633,7 +633,7 @@ _mm_cvtpu16_ps (__m64 __A)
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;
@@ -650,7 +650,7 @@ _mm_cvtpi8_ps (__m64 __A)
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
@@ -658,7 +658,7 @@ _mm_cvtpu8_ps(__m64 __A)
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
@@ -668,7 +668,7 @@ _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
@@ -679,7 +679,7 @@ _mm_cvtps_pi16(__m128 __A)
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
@@ -688,7 +688,7 @@ _mm_cvtps_pi8(__m128 __A)
}
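The widening/narrowing pairs are handy for pixel math. A sketch (not part of the header; note _mm_cvtps_pi8 narrows with signed saturation, so this only behaves for results below 128):

#include <xmmintrin.h>

/* Scale the low four 8-bit channels of __pixels by __gain
   (__pixels and __gain are hypothetical inputs).  */
static __m64
scale_pixels (__m64 __pixels, float __gain)
{
  __m128 __f = _mm_cvtpu8_ps (__pixels);
  __f = _mm_mul_ps (__f, _mm_set1_ps (__gain));
  return _mm_cvtps_pi8 (__f);
}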
/* Selects four specific SPFP values from A and B based on MASK.  */
#if 0
static __inline __m128 __attribute__((__always_inline__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
@@ -700,14 +700,14 @@ _mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
}
#endif

/* Selects and interleaves the upper two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
@@ -715,28 +715,28 @@ _mm_unpacklo_ps (__m128 __A, __m128 __B)
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
@@ -744,146 +744,146 @@ _mm_movelh_ps (__m128 __A, __m128 __B)
}
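Unpack plus movelh/movehl are exactly the ingredients of a 4x4 transpose, in the spirit of the _MM_TRANSPOSE4_PS macro. A sketch (not part of the header):

#include <xmmintrin.h>

/* Transpose the 4x4 matrix whose rows are *__r0..*__r3, in place.  */
static void
transpose4 (__m128 *__r0, __m128 *__r1, __m128 *__r2, __m128 *__r3)
{
  __m128 __t0 = _mm_unpacklo_ps (*__r0, *__r1);  /* a0 b0 a1 b1 */
  __m128 __t1 = _mm_unpacklo_ps (*__r2, *__r3);  /* c0 d0 c1 d1 */
  __m128 __t2 = _mm_unpackhi_ps (*__r0, *__r1);  /* a2 b2 a3 b3 */
  __m128 __t3 = _mm_unpackhi_ps (*__r2, *__r3);  /* c2 d2 c3 d3 */
  *__r0 = _mm_movelh_ps (__t0, __t1);            /* a0 b0 c0 d0 */
  *__r1 = _mm_movehl_ps (__t1, __t0);            /* a1 b1 c1 d1 */
  *__r2 = _mm_movelh_ps (__t2, __t3);            /* a2 b2 c2 d2 */
  *__r3 = _mm_movehl_ps (__t3, __t2);            /* a3 b3 c3 d3 */
}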
/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
static __inline void __attribute__((__always_inline__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int __attribute__((__always_inline__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
static __inline void __attribute__((__always_inline__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

static __inline void __attribute__((__always_inline__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
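The _MM_SET_* helpers are read-modify-write wrappers around MXCSR: clear the field with its mask, then OR in the new value. Typical use (a sketch, not part of the header):

#include <xmmintrin.h>

/* Truncate in subsequent conversions and flush denormal results to
   zero; both constants are defined earlier in this header.  */
static void
set_fast_fp_modes (void)
{
  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
  _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
}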
/* Create a vector with element 0 as F and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
@@ -891,42 +891,42 @@ _mm_loadr_ps (float const *__P)
}

/* Create the vector [Z Y X W].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
static __inline __m128 __attribute__((__always_inline__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
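As with the MMX set/setr pairs above, _mm_set_ps lists elements from high to low and _mm_setr_ps in memory order. A sketch (not part of the header):

#include <xmmintrin.h>

/* Both calls build the vector {1, 2, 3, 4} (element 0 first).  */
static void
same_vector (__m128 *__out)
{
  __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  __m128 __b = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);
  *__out = _mm_and_ps (__a, __b);   /* identical bits: a no-op combine */
}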
/* Stores the lower SPFP value.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
static __inline void __attribute__((__always_inline__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
@@ -934,14 +934,14 @@ _mm_store1_ps (float *__P, __m128 __A)
  _mm_storeu_ps (__P, __tmp);
}

static __inline void __attribute__((__always_inline__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
@@ -950,7 +950,7 @@ _mm_storer_ps (float *__P, __m128 __A)
}

/* Sets the low SPFP value of A from the low value of B.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

@@ -995,65 +995,65 @@ _m_pinsrw (__m64 const __A, int const __D, int const __N)
#endif
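The aligned forms fault on misaligned addresses, so they pair with 16-byte-aligned buffers; _mm_storeu_ps trades that requirement for speed. A loop sketch (not part of the header; __p is assumed 16-byte aligned and __n a multiple of 4):

#include <xmmintrin.h>

/* Scale __n floats at __p by __s, four at a time.  */
static void
scale_buffer (float *__p, int __n, float __s)
{
  int __i;
  __m128 __k = _mm_set1_ps (__s);
  for (__i = 0; __i < __n; __i += 4)
    _mm_store_ps (__p + __i, _mm_mul_ps (_mm_load_ps (__p + __i), __k));
}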
/* Compute the element-wise maximum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
static __inline int __attribute__((__always_inline__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

static __inline int __attribute__((__always_inline__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
@@ -1061,13 +1061,13 @@ _m_pmovmskb (__m64 __A)
}
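min/max compose into a saturating clamp without branches. A sketch (not part of the header):

#include <xmmintrin.h>

/* Clamp eight unsigned bytes into [__lo, __hi] element-wise.  */
static __m64
clamp_pu8 (__m64 __x, __m64 __lo, __m64 __hi)
{
  return _mm_min_pu8 (_mm_max_pu8 (__x, __lo), __hi);
}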
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
@@ -1076,13 +1076,13 @@ _m_pmulhuw (__m64 __A, __m64 __B)
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#if 0
static __inline __m64 __attribute__((__always_inline__))
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
@@ -1096,39 +1096,39 @@ _m_pshufw (__m64 __A, int __N)
}
#endif
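Taking the high half of the 32-bit product is a multiply by a 16-bit fixed-point fraction. A sketch (not part of the header):

#include <xmmintrin.h>

/* (x * 0x8000) >> 16 == x >> 1: halve four unsigned 16-bit values.
   Any fraction n/65536 works by substituting n for 0x8000.  */
static __m64
halve_u16 (__m64 __x)
{
  return _mm_mulhi_pu16 (__x, _mm_set1_pi16 ((short) 0x8000));
}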
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
static __inline void __attribute__((__always_inline__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

static __inline void __attribute__((__always_inline__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
@@ -1137,13 +1137,13 @@ _m_pavgw (__m64 __A, __m64 __B)
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
static __inline __m64 __attribute__((__always_inline__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

static __inline __m64 __attribute__((__always_inline__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
@@ -1152,7 +1152,7 @@ _m_psadbw (__m64 __A, __m64 __B)
}
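psadbw collapses eight byte differences into one sum, which is why it is the workhorse of video block matching. A sketch (not part of the header):

#include <xmmintrin.h>

/* Sum of |a_i - b_i| over eight bytes; the result fits in the low
   16-bit word, the rest is cleared.  */
static int
block_cost (__m64 __a, __m64 __b)
{
  union { __m64 __v; unsigned short __s[4]; } __r;
  __r.__v = _mm_sad_pu8 (__a, __b);
  _mm_empty ();
  return __r.__s[0];
}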
/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#if 0
static __inline void __attribute__((__always_inline__))
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
@@ -1163,14 +1163,14 @@ _mm_prefetch (void *__P, enum _mm_hint __I)
}
#endif

/* Stores the data in A to the address P without polluting the caches.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
static __inline void __attribute__((__always_inline__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
@@ -1178,7 +1178,7 @@ _mm_stream_ps (float *__P, __m128 __A)
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
static __inline void __attribute__((__always_inline__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
@@ -1187,7 +1187,7 @@ _mm_sfence (void)
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
static __inline void __attribute__((__always_inline__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}
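Non-temporal stores bypass the cache, so pairing _mm_stream_* with _mm_sfence before another agent consumes the data is the standard pattern. A sketch (not part of the header; __p assumed 16-byte aligned, __n a multiple of 4):

#include <xmmintrin.h>

/* Fill a large buffer without evicting useful cache lines.  */
static void
stream_fill (float *__p, int __n, __m128 __v)
{
  int __i;
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_ps (__p + __i, __v);
  _mm_sfence ();              /* make the streamed data globally visible */
}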