Commit cbf2e4d4 by Harsha Jagasia

config.gcc (i[34567]86-*-*): Include fma4intrin.h.

2009-09-29  Harsha Jagasia  <harsha.jagasia@amd.com>

	* config.gcc (i[34567]86-*-*): Include fma4intrin.h.
	(x86_64-*-*): Ditto.	

	* config/i386/fma4intrin.h: New file, provide common x86 compiler
	intrinsics for FMA4.
	* config/i386/cpuid.h (bit_FMA4): Define FMA4 bit.
	* config/i386/x86intrin.h: Fix typo: test __SSE4A__, not __SSE4a__.
	Add FMA4 check and fma4intrin.h.
	* config/i386/i386-c.c (ix86_target_macros_internal): Check
	ISA_FLAG for FMA4.
	* config/i386/i386.h (TARGET_FMA4): New macro for FMA4.
	* config/i386/i386.md (UNSPEC_FMA4_INTRINSIC): Add new UNSPEC
	constant for FMA4 support.
	(UNSPEC_FMA4_FMADDSUB): Ditto.
	(UNSPEC_FMA4_FMSUBADD): Ditto.
	* config/i386/i386.opt (-mfma4): New switch for FMA4 support.
	* config/i386/i386-protos.h (ix86_fma4_valid_op_p): Add
	declaration.
	(ix86_expand_fma4_multiple_memory): Ditto.
	* config/i386/i386.c (OPTION_MASK_ISA_FMA4_SET): New.
	(OPTION_MASK_ISA_FMA4_UNSET): New.	
	(OPTION_MASK_ISA_SSE4A_UNSET): Change definition to
	depend on FMA4.
	(OPTION_MASK_ISA_AVX_UNSET): Change definition to
	depend on FMA4.
	(ix86_handle_option): Handle -mfma4.
	(isa_opts): Handle -mfma4.
	(enum pta_flags): Add PTA_FMA4.
	(override_options): Add FMA4 support.	
	(IX86_BUILTIN_VFMADDSS): New for FMA4 intrinsic.
	(IX86_BUILTIN_VFMADDSD): Ditto.
	(IX86_BUILTIN_VFMADDPS): Ditto.
	(IX86_BUILTIN_VFMADDPD): Ditto.
	(IX86_BUILTIN_VFMSUBSS): Ditto.
	(IX86_BUILTIN_VFMSUBSD): Ditto.
	(IX86_BUILTIN_VFMSUBPS): Ditto.
	(IX86_BUILTIN_VFMSUBPD): Ditto.
	(IX86_BUILTIN_VFMADDSUBPS): Ditto.
	(IX86_BUILTIN_VFMADDSUBPD): Ditto.
	(IX86_BUILTIN_VFMSUBADDPS): Ditto.
	(IX86_BUILTIN_VFMSUBADDPD): Ditto.
	(IX86_BUILTIN_VFNMADDSS): Ditto.
	(IX86_BUILTIN_VFNMADDSD): Ditto.
	(IX86_BUILTIN_VFNMADDPS): Ditto.
	(IX86_BUILTIN_VFNMADDPD): Ditto.
	(IX86_BUILTIN_VFNMSUBSS): Ditto.
	(IX86_BUILTIN_VFNMSUBSD): Ditto.
	(IX86_BUILTIN_VFNMSUBPS): Ditto.
	(IX86_BUILTIN_VFNMSUBPD): Ditto.
	(IX86_BUILTIN_VFMADDPS256): Ditto.
	(IX86_BUILTIN_VFMADDPD256): Ditto.
	(IX86_BUILTIN_VFMSUBPS256): Ditto.
	(IX86_BUILTIN_VFMSUBPD256): Ditto.
	(IX86_BUILTIN_VFMADDSUBPS256): Ditto.
	(IX86_BUILTIN_VFMADDSUBPD256): Ditto.
	(IX86_BUILTIN_VFMSUBADDPS256): Ditto.
	(IX86_BUILTIN_VFMSUBADDPD256): Ditto.
	(IX86_BUILTIN_VFNMADDPS256): Ditto.
	(IX86_BUILTIN_VFNMADDPD256): Ditto.
	(IX86_BUILTIN_VFNMSUBPS256): Ditto.
	(IX86_BUILTIN_VFNMSUBPD256): Ditto.
	(enum multi_arg_type): New enum for describing the various FMA4
	intrinsic argument types.
	(bdesc_multi_arg): New table for FMA4 intrinsics.
	(ix86_init_mmx_sse_builtins): Add FMA4 intrinsic support.
	(ix86_expand_multi_arg_builtin): New function for creating FMA4
	intrinsics.
	(ix86_expand_builtin): Add FMA4 intrinsic support.
	(ix86_fma4_valid_op_p): New function to validate FMA4 3 and 4
	operand instructions.
	(ix86_expand_fma4_multiple_memory): New function to split the
	second memory reference from FMA4 instructions.
	* config/i386/sse.md (ssemodesuffixf4): New mode attribute for FMA4.
	(ssemodesuffixf2s): Ditto.
	(fma4_fmadd<mode>4): Add FMA4 floating point multiply/add
	instructions.
	(fma4_fmsub<mode>4): Ditto.
	(fma4_fnmadd<mode>4): Ditto.
	(fma4_fnmsub<mode>4): Ditto.
	(fma4_vmfmadd<mode>4): Ditto.
	(fma4_vmfmsub<mode>4): Ditto.
	(fma4_vmfnmadd<mode>4): Ditto.
	(fma4_vmfnmsub<mode>4): Ditto.
	(fma4_fmadd<mode>4256): Ditto.
	(fma4_fmsub<mode>4256): Ditto.
	(fma4_fnmadd<mode>4256): Ditto.
	(fma4_fnmsub<mode>4256): Ditto.
	(fma4_fmaddsubv8sf4): Ditto.
	(fma4_fmaddsubv4sf4): Ditto.
	(fma4_fmaddsubv4df4): Ditto.
	(fma4_fmaddsubv2df4): Ditto.
	(fma4_fmsubaddv8sf4): Ditto.
	(fma4_fmsubaddv4sf4): Ditto.
	(fma4_fmsubaddv4df4): Ditto.
	(fma4_fmsubaddv2df4): Ditto.
	(fma4i_fmadd<mode>4): Add FMA4 floating point multiply/add
	instructions for intrinsics.
	(fma4i_fmsub<mode>4): Ditto.
	(fma4i_fnmadd<mode>4): Ditto.
	(fma4i_fnmsub<mode>4): Ditto.
	(fma4i_vmfmadd<mode>4): Ditto.
	(fma4i_vmfmsub<mode>4): Ditto.
	(fma4i_vmfnmadd<mode>4): Ditto.
	(fma4i_vmfnmsub<mode>4): Ditto.
	(fma4i_fmadd<mode>4256): Ditto.
	(fma4i_fmsub<mode>4256): Ditto.
	(fma4i_fnmadd<mode>4256): Ditto.
	(fma4i_fnmsub<mode>4256): Ditto.
	(fma4i_fmaddsubv8sf4): Ditto.
	(fma4i_fmaddsubv4sf4): Ditto.
	(fma4i_fmaddsubv4df4): Ditto.
	(fma4i_fmaddsubv2df4): Ditto.
	(fma4i_fmsubaddv8sf4): Ditto.
	(fma4i_fmsubaddv4sf4): Ditto.
	(fma4i_fmsubaddv4df4): Ditto.
	(fma4i_fmsubaddv2df4): Ditto.

	* doc/invoke.texi (-mfma4): Add documentation.
	* doc/extend.texi (x86 intrinsics): Add FMA4 intrinsics.

	* gcc.target/i386/fma4-check.h
	* gcc.target/i386/fma4-fma.c
	* gcc.target/i386/fma4-maccXX.c
	* gcc.target/i386/fma4-msubXX.c
	* gcc.target/i386/fma4-nmaccXX.c
	* gcc.target/i386/fma4-nmsubXX.c
	* gcc.target/i386/fma4-vector.c
	* gcc.target/i386/fma4-256-maccXX.c
	* gcc.target/i386/fma4-256-msubXX.c
	* gcc.target/i386/fma4-256-nmaccXX.c
	* gcc.target/i386/fma4-256-nmsubXX.c
	* gcc.target/i386/fma4-256-vector.c
	* gcc.target/i386/funcspec-2.c: New file.
	* gcc.target/i386/funcspec-4.c: Test error conditions
	related to FMA4.
	* gcc.target/i386/funcspec-5.c
	* gcc.target/i386/funcspec-6.c
	* gcc.target/i386/funcspec-8.c: Add FMA4.
	* gcc.target/i386/funcspec-9.c: New file.
	* gcc.target/i386/i386.exp: Add check_effective_target_fma4.
	* gcc.target/i386/isa-10.c
	* gcc.target/i386/isa-11.c
	* gcc.target/i386/isa-12.c
	* gcc.target/i386/isa-13.c
	* gcc.target/i386/isa-2.c
	* gcc.target/i386/isa-3.c
	* gcc.target/i386/isa-4.c
	* gcc.target/i386/isa-7.c
	* gcc.target/i386/isa-8.c
	* gcc.target/i386/isa-9.c: New file.
	* gcc.target/i386/isa-14.c
	* gcc.target/i386/isa-1.c
	* gcc.target/i386/isa-5.c
	* gcc.target/i386/isa-6.c: Add FMA4.
	* gcc.target/i386/sse-12.c
	* gcc.target/i386/sse-13.c
	* gcc.target/i386/sse-14.c
	* gcc.target/i386/sse-22.c: New file.
	* g++.dg/other/i386-2.C
	* g++.dg/other/i386-3.C
	* g++.dg/other/i386-5.C
	* g++.dg/other/i386-6.C: Add -mfma4 in dg-options.

From-SVN: r152311
gcc/config.gcc
@@ -286,8 +286,9 @@ i[34567]86-*-*)
 	cxx_target_objs="i386-c.o"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
-		       nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h
-		       x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h"
+		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
+		       immintrin.h x86intrin.h avxintrin.h
+		       ia32intrin.h cross-stdarg.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -295,8 +296,9 @@ x86_64-*-*)
 	cxx_target_objs="i386-c.o"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
-		       nmmintrin.h bmmintrin.h wmmintrin.h immintrin.h
-		       x86intrin.h avxintrin.h ia32intrin.h cross-stdarg.h"
+		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
+		       immintrin.h x86intrin.h avxintrin.h
+		       ia32intrin.h cross-stdarg.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
gcc/config/i386/cpuid.h
@@ -48,6 +48,7 @@
 /* %ecx */
 #define bit_LAHF_LM	(1 << 0)
 #define bit_SSE4a	(1 << 6)
+#define bit_FMA4	(1 << 16)
 
 /* %edx */
 #define bit_LM	(1 << 29)
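The new bit_FMA4 constant is meant to be tested against the %ecx result of CPUID extended leaf 0x80000001, exactly as the fma4-check.h harness further down in this commit does. A minimal standalone sketch (assuming GCC's cpuid.h is on the include path):

/* Runtime FMA4 detection, mirroring fma4-check.h below.  */
#include <stdio.h>
#include "cpuid.h"

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* FMA4 is reported in %ecx, bit 16, of extended leaf 0x80000001.  */
  if (__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx)
      && (ecx & bit_FMA4))
    puts ("FMA4 available");
  else
    puts ("FMA4 not available");
  return 0;
}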
gcc/config/i386/fma4intrin.h (new file)

/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifndef _X86INTRIN_H_INCLUDED
# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _FMA4INTRIN_H_INCLUDED
#define _FMA4INTRIN_H_INCLUDED
#ifndef __FMA4__
# error "FMA4 instruction set not enabled"
#else
/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */
#include <ammintrin.h>
/* Internal data types for implementing the intrinsics. */
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __m256 __attribute__ ((__vector_size__ (32),
__may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
__may_alias__));
/* 128b Floating point multiply/add type instructions. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfnmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfnmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfnmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfnmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
{
return (__m128) __builtin_ia32_vfmsubaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d) __builtin_ia32_vfmsubaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/* 256b Floating point multiply/add type instructions. */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
{
return (__m256) __builtin_ia32_vfmsubaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d) __builtin_ia32_vfmsubaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
#endif
#endif
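A minimal usage sketch for the new header (assumes an FMA4-capable toolchain and hardware; compile with -mfma4 so that __FMA4__ is defined and <x86intrin.h> pulls in fma4intrin.h):

/* Build: gcc -O2 -mfma4 macc.c  (runs only on FMA4 hardware).  */
#include <stdio.h>
#include <x86intrin.h>

int
main (void)
{
  __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
  __m128 b = _mm_set_ps (8.0f, 7.0f, 6.0f, 5.0f);
  __m128 c = _mm_set1_ps (0.5f);

  /* d[i] = a[i] * b[i] + c[i], a single vfmaddps instruction.  */
  __m128 d = _mm_macc_ps (a, b, c);

  float out[4];
  _mm_storeu_ps (out, d);
  printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}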
gcc/config/i386/i386-c.c
@@ -230,6 +230,8 @@ ix86_target_macros_internal (int isa_flag,
     def_or_undef (parse_in, "__FMA__");
   if (isa_flag & OPTION_MASK_ISA_SSE4A)
     def_or_undef (parse_in, "__SSE4A__");
+  if (isa_flag & OPTION_MASK_ISA_FMA4)
+    def_or_undef (parse_in, "__FMA4__");
   if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE))
     def_or_undef (parse_in, "__SSE_MATH__");
   if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2))
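With the new __FMA4__ macro, user code can guard FMA4-only paths at preprocessing time, the same way x86intrin.h guards its include below. A small sketch (madd_ps is a hypothetical helper, not part of this commit):

#include <x86intrin.h>

/* Uses the fused form only when compiled with -mfma4.  */
static __inline __m128
madd_ps (__m128 a, __m128 b, __m128 c)
{
#ifdef __FMA4__
  return _mm_macc_ps (a, b, c);             /* one vfmaddps, product unrounded */
#else
  return _mm_add_ps (_mm_mul_ps (a, b), c); /* mulps + addps, rounded twice */
#endif
}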
gcc/config/i386/i386-protos.h
@@ -214,6 +214,9 @@ extern void ix86_expand_vector_set (bool, rtx, rtx, int);
 extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
 extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
+extern bool ix86_fma4_valid_op_p (rtx [], rtx, int, bool, int, bool);
+extern void ix86_expand_fma4_multiple_memory (rtx [], int, enum machine_mode);
+
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
 extern void ix86_register_pragmas (void);
gcc/config/i386/i386.h
@@ -54,6 +54,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_AVX	OPTION_ISA_AVX
 #define TARGET_FMA	OPTION_ISA_FMA
 #define TARGET_SSE4A	OPTION_ISA_SSE4A
+#define TARGET_FMA4	OPTION_ISA_FMA4
 #define TARGET_ROUND	OPTION_ISA_ROUND
 #define TARGET_ABM	OPTION_ISA_ABM
 #define TARGET_POPCNT	OPTION_ISA_POPCNT
@@ -65,8 +66,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define TARGET_CMPXCHG16B	OPTION_ISA_CX16
 
-/* SSE4.1 define round instructions */
-#define OPTION_MASK_ISA_ROUND	(OPTION_MASK_ISA_SSE4_1)
+/* SSE4.1 defines round instructions */
+#define OPTION_MASK_ISA_ROUND	OPTION_MASK_ISA_SSE4_1
 #define OPTION_ISA_ROUND	((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0)
 
 #include "config/vxworks-dummy.h"
@@ -1351,6 +1352,10 @@ enum reg_class
   (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \
		   || (MODE) == V8SFmode || (MODE) == V4DFmode))
 
+#define FMA4_VEC_FLOAT_MODE_P(MODE) \
+  (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \
+		   || (MODE) == V8SFmode || (MODE) == V4DFmode))
+
 #define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP)))
 #define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG)
gcc/config/i386/i386.md
@@ -195,6 +195,10 @@
    (UNSPEC_PCMPESTR		144)
    (UNSPEC_PCMPISTR		145)
 
+   ; For FMA4 support
+   (UNSPEC_FMA4_INTRINSIC	150)
+   (UNSPEC_FMA4_FMADDSUB	151)
+   (UNSPEC_FMA4_FMSUBADD	152)
 
    ; For AES support
    (UNSPEC_AESENC		159)
    (UNSPEC_AESENCLAST		160)
gcc/config/i386/i386.opt
@@ -310,6 +310,10 @@ msse4a
 Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) VarExists Save
 Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
 
+mfma4
+Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) VarExists Save
+Support FMA4 built-in functions and code generation
+
 mabm
 Target Report Mask(ISA_ABM) Var(ix86_isa_flags) VarExists Save
 Support code generation of Advanced Bit Manipulation (ABM) instructions.
gcc/config/i386/x86intrin.h
@@ -46,7 +46,7 @@
 #include <tmmintrin.h>
 #endif
 
-#ifdef __SSE4a__
+#ifdef __SSE4A__
 #include <ammintrin.h>
 #endif
 
@@ -54,6 +54,10 @@
 #include <smmintrin.h>
 #endif
 
+#ifdef __FMA4__
+#include <fma4intrin.h>
+#endif
+
 #if defined (__AES__) || defined (__PCLMUL__)
 #include <wmmintrin.h>
 #endif
gcc/doc/extend.texi
@@ -3168,6 +3168,11 @@ Enable/disable the generation of the sse4.2 instructions.
 @cindex @code{target("sse4a")} attribute
 Enable/disable the generation of the SSE4A instructions.
 
+@item fma4
+@itemx no-fma4
+@cindex @code{target("fma4")} attribute
+Enable/disable the generation of the FMA4 instructions.
+
 @item ssse3
 @itemx no-ssse3
 @cindex @code{target("ssse3")} attribute
@@ -8888,6 +8893,46 @@ v2di __builtin_ia32_insertq (v2di, v2di)
 v2di __builtin_ia32_insertqi (v2di, v2di, const unsigned int, const unsigned int)
 @end smallexample
 
+The following built-in functions are available when @option{-mfma4} is used.
+All of them generate the machine instruction that is part of the name.
+
+@smallexample
+v2df __builtin_ia32_fmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmaddsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmaddss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fnmsubsd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fnmsubss (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmaddsubpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmaddsubps (v4sf, v4sf, v4sf)
+v2df __builtin_ia32_fmsubaddpd (v2df, v2df, v2df)
+v4sf __builtin_ia32_fmsubaddps (v4sf, v4sf, v4sf)
+v4df __builtin_ia32_fmaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmaddps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fnmaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fnmaddps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fnmsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fnmsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmaddsubpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmaddsubps256 (v8sf, v8sf, v8sf)
+v4df __builtin_ia32_fmsubaddpd256 (v4df, v4df, v4df)
+v8sf __builtin_ia32_fmsubaddps256 (v8sf, v8sf, v8sf)
+@end smallexample
+
 The following built-in functions are available when @option{-m3dnow} is used.
 All of them generate the machine instruction that is part of the name.
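These builtins are what the fma4intrin.h wrappers above expand to, and they can also be called directly on GCC vector types. A sketch (madd is a hypothetical name; compile with -mfma4):

/* Direct builtin use; normally you would call _mm_macc_ps instead.  */
typedef float v4sf __attribute__ ((__vector_size__ (16)));

v4sf
madd (v4sf a, v4sf b, v4sf c)
{
  return __builtin_ia32_vfmaddps (a, b, c);  /* emits vfmaddps */
}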
gcc/doc/invoke.texi
@@ -592,7 +592,7 @@ Objective-C and Objective-C++ Dialects}.
 -mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol
 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
 -maes -mpclmul @gol
--msse4a -m3dnow -mpopcnt -mabm @gol
+-msse4a -m3dnow -mpopcnt -mabm -mfma4 @gol
 -mthreads -mno-align-stringops -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
 -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
@@ -11727,6 +11727,8 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @itemx -mno-pclmul
 @itemx -msse4a
 @itemx -mno-sse4a
+@itemx -mfma4
+@itemx -mno-fma4
 @itemx -m3dnow
 @itemx -mno-3dnow
 @itemx -mpopcnt
@@ -11740,7 +11742,7 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}.
 @opindex m3dnow
 @opindex mno-3dnow
 These switches enable or disable the use of instructions in the MMX,
-SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, ABM or
+SSE, SSE2, SSE3, SSSE3, SSE4.1, AVX, AES, PCLMUL, SSE4A, FMA4, ABM or
 3DNow!@: extended instruction sets.
 These extensions are also available as built-in functions: see
 @ref{X86 Built-in Functions}, for details of the functions enabled and
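As a quick check of the new switch: with -O2 -mfma4, a plain scalar multiply-add is contracted into vfmaddss, which is exactly what the fma4-fma.c test below scans for. A minimal sketch:

/* gcc -O2 -mfma4 -S madd.c  then look for vfmaddss in madd.s.  */
float
madd (float a, float b, float c)
{
  return (a * b) + c;
}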
gcc/testsuite/gcc.target/i386 sse tests (sse-12.c, sse-13.c, sse-14.c, sse-22.c, per the ChangeLog):

-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
-   usable with -O -pedantic-errors.  */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+   mm_malloc.h are usable with -O -pedantic-errors.  */
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
 
 #include <x86intrin.h>

-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
-   usable with -O -fkeep-inline-functions.  */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+   mm_malloc.h are usable with -O -fkeep-inline-functions.  */
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
 
 #include <x86intrin.h>

-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
-   usable with -O -fkeep-inline-functions.  */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+   mm_malloc.h are usable with -O -fkeep-inline-functions.  */
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
 
 #include <x86intrin.h>

-/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
-   usable with -O -pedantic-errors.  */
+/* Test that {,x,e,p,t,s,w,a,i}mmintrin.h, fma4intrin.h, mm3dnow.h and
+   mm_malloc.h are usable with -O -pedantic-errors.  */
 /* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -m3dnow -mavx -msse4a -mfma4 -maes -mpclmul" } */
 
 #include <x86intrin.h>
gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O2 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m256 x[NUM];
float f[NUM * 8];
__m256d y[NUM];
double d[NUM * 4];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_maccps ()
{
int i;
for (i = 0; i < NUM * 8; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_maccpd ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_maccps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 8; i = i + 8)
for (j = 0; j < 8; j++)
{
res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_maccpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_maccps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm256_macc_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_maccps ())
abort ();
init_maccpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm256_macc_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_maccpd ())
abort ();
}
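The note in these tests about the unrounded intermediate product is the defining property of fused multiply-add. A scalar illustration using C99 fma() (not part of this commit, shown only for contrast; link with -lm on some systems):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double a = 1.0 + 0x1.0p-27;
  double b = 1.0 - 0x1.0p-27;

  /* Unfused: a * b rounds to exactly 1.0, so subtracting 1 gives 0.  */
  printf ("%a\n", a * b - 1.0);        /* 0x0p+0 */
  /* Fused: the exact product 1 - 2^-54 reaches the subtraction unrounded.  */
  printf ("%a\n", fma (a, b, -1.0));   /* -0x1p-54 */
  return 0;
}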
gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O2 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m256 x[NUM];
float f[NUM * 8];
__m256d y[NUM];
double d[NUM * 4];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_msubps ()
{
int i;
for (i = 0; i < NUM * 8; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_msubpd ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_msubps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 8; i = i + 8)
for (j = 0; j < 8; j++)
{
res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_msubpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_msubps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm256_msub_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_msubps ())
abort ();
init_msubpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm256_msub_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_msubpd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O2 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m256 x[NUM];
float f[NUM * 8];
__m256d y[NUM];
double d[NUM * 4];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_nmaccps ()
{
int i;
for (i = 0; i < NUM * 8; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_nmaccpd ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_nmaccps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 8; i = i + 8)
for (j = 0; j < 8; j++)
{
res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmaccpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_nmaccps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm256_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmaccps ())
abort ();
init_nmaccpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm256_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmaccpd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O2 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m256 x[NUM];
float f[NUM * 8];
__m256d y[NUM];
double d[NUM * 4];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_nmsubps ()
{
int i;
for (i = 0; i < NUM * 8; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_nmsubpd ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_nmsubps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 8; i = i + 8)
for (j = 0; j < 8; j++)
{
res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmsubpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_nmsubps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm256_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmsubps ())
abort ();
init_nmsubpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm256_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmsubpd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-256-vector.c (new file)

/* Test that the compiler properly optimizes vector floating point multiply
   and add instructions into vfmaddps on FMA4 systems.  */
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
extern void exit (int);
typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32), __may_alias__));
#define SIZE 10240
union {
__m256 f_align;
__m256d d_align;
float f[SIZE];
double d[SIZE];
} a, b, c, d;
void
flt_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
}
void
dbl_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
}
void
flt_mul_sub (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
}
void
dbl_mul_sub (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
}
void
flt_neg_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
}
void
dbl_neg_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
}
int main ()
{
flt_mul_add ();
flt_mul_sub ();
flt_neg_mul_add ();
dbl_mul_add ();
dbl_mul_sub ();
dbl_neg_mul_add ();
exit (0);
}
/* { dg-final { scan-assembler "vfmaddps" } } */
/* { dg-final { scan-assembler "vfmaddpd" } } */
/* { dg-final { scan-assembler "vfmsubps" } } */
/* { dg-final { scan-assembler "vfmsubpd" } } */
/* { dg-final { scan-assembler "vfnmaddps" } } */
/* { dg-final { scan-assembler "vfnmaddpd" } } */
gcc/testsuite/gcc.target/i386/fma4-check.h (new file)

#include <stdlib.h>
#include "cpuid.h"
static void fma4_test (void);
int
main ()
{
unsigned int eax, ebx, ecx, edx;
if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx))
return 0;
/* Run FMA4 test only if host has FMA4 support. */
if (ecx & bit_FMA4)
fma4_test ();
exit (0);
}
gcc/testsuite/gcc.target/i386/fma4-fma.c (new file)

/* Test that the compiler properly optimizes floating point multiply
and add instructions into vfmaddss, vfmsubss, vfnmaddss,
vfnmsubss on FMA4 systems. */
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
/* { dg-options "-O2 -mfma4" } */
extern void exit (int);
float
flt_mul_add (float a, float b, float c)
{
return (a * b) + c;
}
double
dbl_mul_add (double a, double b, double c)
{
return (a * b) + c;
}
float
flt_mul_sub (float a, float b, float c)
{
return (a * b) - c;
}
double
dbl_mul_sub (double a, double b, double c)
{
return (a * b) - c;
}
float
flt_neg_mul_add (float a, float b, float c)
{
return (-(a * b)) + c;
}
double
dbl_neg_mul_add (double a, double b, double c)
{
return (-(a * b)) + c;
}
float
flt_neg_mul_sub (float a, float b, float c)
{
return (-(a * b)) - c;
}
double
dbl_neg_mul_sub (double a, double b, double c)
{
return (-(a * b)) - c;
}
float f[10] = { 2, 3, 4 };
double d[10] = { 2, 3, 4 };
int main ()
{
f[3] = flt_mul_add (f[0], f[1], f[2]);
f[4] = flt_mul_sub (f[0], f[1], f[2]);
f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
d[3] = dbl_mul_add (d[0], d[1], d[2]);
d[4] = dbl_mul_sub (d[0], d[1], d[2]);
d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
exit (0);
}
/* { dg-final { scan-assembler "vfmaddss" } } */
/* { dg-final { scan-assembler "vfmaddsd" } } */
/* { dg-final { scan-assembler "vfmsubss" } } */
/* { dg-final { scan-assembler "vfmsubsd" } } */
/* { dg-final { scan-assembler "vfnmaddss" } } */
/* { dg-final { scan-assembler "vfnmaddsd" } } */
/* { dg-final { scan-assembler "vfnmsubss" } } */
/* { dg-final { scan-assembler "vfnmsubsd" } } */
gcc/testsuite/gcc.target/i386/fma4-maccXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O0 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m128 x[NUM];
float f[NUM * 4];
__m128d y[NUM];
double d[NUM * 2];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_maccps ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_maccpd ()
{
int i;
for (i = 0; i < NUM * 2; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_maccps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_maccpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
for (j = 0; j < 2; j++)
{
res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static int
check_maccss ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
{
res.f[i] = (src1.f[i] * src2.f[i]) + src3.f[i];
if (dst.f[i] != res.f[i])
check_fails++;
}
return check_fails;
}
static int
check_maccsd ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
{
res.d[i] = (src1.d[i] * src2.d[i]) + src3.d[i];
if (dst.d[i] != res.d[i])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_maccps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_macc_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_maccps ())
abort ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_macc_ss (src1.x[i], src2.x[i], src3.x[i]);
if (check_maccss ())
abort ();
init_maccpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_macc_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_maccpd ())
abort ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_macc_sd (src1.y[i], src2.y[i], src3.y[i]);
if (check_maccsd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-msubXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O0 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m128 x[NUM];
float f[NUM * 4];
__m128d y[NUM];
double d[NUM * 2];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_msubps ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_msubpd ()
{
int i;
for (i = 0; i < NUM * 2; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_msubps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_msubpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
for (j = 0; j < 2; j++)
{
res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static int
check_msubss ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
{
res.f[i] = (src1.f[i] * src2.f[i]) - src3.f[i];
if (dst.f[i] != res.f[i])
check_fails++;
}
return check_fails;
}
static int
check_msubsd ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
{
res.d[i] = (src1.d[i] * src2.d[i]) - src3.d[i];
if (dst.d[i] != res.d[i])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_msubps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_msub_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_msubps ())
abort ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_msub_ss (src1.x[i], src2.x[i], src3.x[i]);
if (check_msubss ())
abort ();
init_msubpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_msub_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_msubpd ())
abort ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_msub_sd (src1.y[i], src2.y[i], src3.y[i]);
if (check_msubsd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O0 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m128 x[NUM];
float f[NUM * 4];
__m128d y[NUM];
double d[NUM * 2];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_nmaccps ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_nmaccpd ()
{
int i;
for (i = 0; i < NUM * 2; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_nmaccps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmaccpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
for (j = 0; j < 2; j++)
{
res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmaccss ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
{
res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i];
if (dst.f[i] != res.f[i])
check_fails++;
}
return check_fails;
}
static int
check_nmaccsd ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
{
res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i];
if (dst.d[i] != res.d[i])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_nmaccps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmaccps ())
abort ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmaccss ())
abort ();
init_nmaccpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmaccpd ())
abort ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmaccsd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target fma4 } */
/* { dg-options "-O0 -mfma4" } */
#include "fma4-check.h"
#include <x86intrin.h>
#include <string.h>
#define NUM 20
union
{
__m128 x[NUM];
float f[NUM * 4];
__m128d y[NUM];
double d[NUM * 2];
} dst, res, src1, src2, src3;
/* Note that in macc*, msub*, nmacc* and nmsub* instructions, the intermediate
   product is not rounded, only the addition is rounded. */
static void
init_nmsubps ()
{
int i;
for (i = 0; i < NUM * 4; i++)
{
src1.f[i] = i;
src2.f[i] = i + 10;
src3.f[i] = i + 20;
}
}
static void
init_nmsubpd ()
{
int i;
for (i = 0; i < NUM * 2; i++)
{
src1.d[i] = i;
src2.d[i] = i + 10;
src3.d[i] = i + 20;
}
}
static int
check_nmsubps ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
for (j = 0; j < 4; j++)
{
res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
if (dst.f[i + j] != res.f[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmsubpd ()
{
int i, j, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
for (j = 0; j < 2; j++)
{
res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
if (dst.d[i + j] != res.d[i + j])
check_fails++;
}
return check_fails;
}
static int
check_nmsubss ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 4; i = i + 4)
{
res.f[i] = - (src1.f[i] * src2.f[i]) - src3.f[i];
if (dst.f[i] != res.f[i])
check_fails++;
}
return check_fails;
}
static int
check_nmsubsd ()
{
int i, check_fails = 0;
for (i = 0; i < NUM * 2; i = i + 2)
{
res.d[i] = - (src1.d[i] * src2.d[i]) - src3.d[i];
if (dst.d[i] != res.d[i])
check_fails++;
}
return check_fails;
}
static void
fma4_test (void)
{
int i;
init_nmsubps ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmsubps ())
abort ();
for (i = 0; i < NUM; i++)
dst.x[i] = _mm_nmsub_ss (src1.x[i], src2.x[i], src3.x[i]);
if (check_nmsubss ())
abort ();
init_nmsubpd ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmsubpd ())
abort ();
for (i = 0; i < NUM; i++)
dst.y[i] = _mm_nmsub_sd (src1.y[i], src2.y[i], src3.y[i]);
if (check_nmsubsd ())
abort ();
}
gcc/testsuite/gcc.target/i386/fma4-vector.c (new file)

/* Test that the compiler properly optimizes vector floating point multiply
   and add instructions into vfmaddps on FMA4 systems.  */
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
extern void exit (int);
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
#define SIZE 10240
union {
__m128 f_align;
__m128d d_align;
float f[SIZE];
double d[SIZE];
} a, b, c, d;
void
flt_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
}
void
dbl_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
}
void
flt_mul_sub (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
}
void
dbl_mul_sub (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
}
void
flt_neg_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
}
void
dbl_neg_mul_add (void)
{
int i;
for (i = 0; i < SIZE; i++)
a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
}
int main ()
{
flt_mul_add ();
flt_mul_sub ();
flt_neg_mul_add ();
dbl_mul_add ();
dbl_mul_sub ();
dbl_neg_mul_add ();
exit (0);
}
/* { dg-final { scan-assembler "vfmaddps" } } */
/* { dg-final { scan-assembler "vfmaddpd" } } */
/* { dg-final { scan-assembler "vfmsubps" } } */
/* { dg-final { scan-assembler "vfmsubpd" } } */
/* { dg-final { scan-assembler "vfnmaddps" } } */
/* { dg-final { scan-assembler "vfnmaddpd" } } */
gcc/testsuite/gcc.target/i386/funcspec-2.c (new file)

/* Test whether we can generate FMA4 code using target specific options.  */
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
/* { dg-options "-O2 -march=k8" } */
extern void exit (int);
#define FMA4_ATTR __attribute__((__target__("fma4")))
extern float flt_mul_add (float a, float b, float c) FMA4_ATTR;
extern float flt_mul_sub (float a, float b, float c) FMA4_ATTR;
extern float flt_neg_mul_add (float a, float b, float c) FMA4_ATTR;
extern float flt_neg_mul_sub (float a, float b, float c) FMA4_ATTR;
extern double dbl_mul_add (double a, double b, double c) FMA4_ATTR;
extern double dbl_mul_sub (double a, double b, double c) FMA4_ATTR;
extern double dbl_neg_mul_add (double a, double b, double c) FMA4_ATTR;
extern double dbl_neg_mul_sub (double a, double b, double c) FMA4_ATTR;
float
flt_mul_add (float a, float b, float c)
{
  return (a * b) + c;
}

double
dbl_mul_add (double a, double b, double c)
{
  return (a * b) + c;
}

float
flt_mul_sub (float a, float b, float c)
{
  return (a * b) - c;
}

double
dbl_mul_sub (double a, double b, double c)
{
  return (a * b) - c;
}

float
flt_neg_mul_add (float a, float b, float c)
{
  return (-(a * b)) + c;
}

double
dbl_neg_mul_add (double a, double b, double c)
{
  return (-(a * b)) + c;
}

float
flt_neg_mul_sub (float a, float b, float c)
{
  return (-(a * b)) - c;
}

double
dbl_neg_mul_sub (double a, double b, double c)
{
  return (-(a * b)) - c;
}

float f[10] = { 2, 3, 4 };
double d[10] = { 2, 3, 4 };

int
main ()
{
  f[3] = flt_mul_add (f[0], f[1], f[2]);
  f[4] = flt_mul_sub (f[0], f[1], f[2]);
  f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
  f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
  d[3] = dbl_mul_add (d[0], d[1], d[2]);
  d[4] = dbl_mul_sub (d[0], d[1], d[2]);
  d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
  d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
  exit (0);
}
/* { dg-final { scan-assembler "vfmaddss" } } */
/* { dg-final { scan-assembler "vfmaddsd" } } */
/* { dg-final { scan-assembler "vfmsubss" } } */
/* { dg-final { scan-assembler "vfmsubsd" } } */
/* { dg-final { scan-assembler "vfnmaddss" } } */
/* { dg-final { scan-assembler "vfnmaddsd" } } */
/* { dg-final { scan-assembler "vfnmsubss" } } */
/* { dg-final { scan-assembler "vfnmsubsd" } } */
/* { dg-final { scan-assembler "call\t(.*)flt_mul_add" } } */
/* { dg-final { scan-assembler "call\t(.*)flt_mul_sub" } } */
/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_add" } } */
/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_sub" } } */
/* { dg-final { scan-assembler "call\t(.*)dbl_mul_add" } } */
/* { dg-final { scan-assembler "call\t(.*)dbl_mul_sub" } } */
/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_add" } } */
/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_sub" } } */
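/* Note: main is compiled for the plain -march=k8 ISA while the
   arithmetic helpers carry __target__("fma4"), so the compiler cannot
   inline them into main.  The call scans above verify that the helpers
   stay out of line, while the vfmadd/vfmsub/vfnmadd/vfnmsub scans
   verify that their bodies were compiled with FMA4 instructions.  */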
/* Test some error conditions with function specific options.  */
/* { dg-do compile } */

/* "fma400" is not a valid target option.  */
extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */

/* Multiple arch switches.  */
extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */
...
@@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4")));
extern void test_sse4_1 (void) __attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void) __attribute__((__target__("sse4.2")));
extern void test_sse4a (void) __attribute__((__target__("sse4a")));
extern void test_fma4 (void) __attribute__((__target__("fma4")));
extern void test_ssse3 (void) __attribute__((__target__("ssse3")));
extern void test_no_abm (void) __attribute__((__target__("no-abm")));
@@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4")));
extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1")));
extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2")));
extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a")));
extern void test_no_fma4 (void) __attribute__((__target__("no-fma4")));
extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3")));
extern void test_arch_i386 (void) __attribute__((__target__("arch=i386")));
...
@@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4")));
extern void test_sse4_1 (void) __attribute__((__target__("sse4.1")));
extern void test_sse4_2 (void) __attribute__((__target__("sse4.2")));
extern void test_sse4a (void) __attribute__((__target__("sse4a")));
extern void test_fma4 (void) __attribute__((__target__("fma4")));
extern void test_ssse3 (void) __attribute__((__target__("ssse3")));
extern void test_no_abm (void) __attribute__((__target__("no-abm")));
@@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4")));
extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1")));
extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2")));
extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a")));
extern void test_no_fma4 (void) __attribute__((__target__("no-fma4")));
extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3")));
extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona")));
...
@@ -104,6 +104,25 @@ generic_insertq (__m128i a, __m128i b)
  return __builtin_ia32_insertq (a, b); /* { dg-error "needs isa option" } */
}
#ifdef __FMA4__
#error "-mfma4 should not be set for this test"
#endif
__m128d fma4_fmaddpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("fma4")));
__m128d generic_fmaddpd (__m128d a, __m128d b, __m128d c);

__m128d
fma4_fmaddpd (__m128d a, __m128d b, __m128d c)
{
  return __builtin_ia32_vfmaddpd (a, b, c);
}

__m128d
generic_fmaddpd (__m128d a, __m128d b, __m128d c)
{
  return __builtin_ia32_vfmaddpd (a, b, c); /* { dg-error "needs isa option" } */
}
#ifdef __AES__
#error "-maes should not be set for this test"
#endif
...
/* Test whether we can generate FMA4 code using #pragma GCC target.  */
/* { dg-do compile } */
/* { dg-options "-O2 -march=k8 -mfpmath=sse -msse2" } */
extern void exit (int);
#ifdef __FMA4__
#warning "__FMA4__ should not be defined before #pragma GCC target."
#endif
#pragma GCC push_options
#pragma GCC target ("fma4")
#ifndef __FMA4__
#warning "__FMA4__ should have been defined after #pragma GCC target."
#endif
float
flt_mul_add (float a, float b, float c)
{
  return (a * b) + c;
}

#pragma GCC pop_options
#ifdef __FMA4__
#warning "__FMA4__ should not be defined after #pragma GCC pop_options."
#endif

double
dbl_mul_add (double a, double b, double c)
{
  return (a * b) + c;
}
/* { dg-final { scan-assembler "vfmaddss" } } */
/* { dg-final { scan-assembler "addsd" } } */
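/* Note: only flt_mul_add is compiled inside the push_options/target
   ("fma4") region, so only it can emit vfmaddss; dbl_mul_add follows
   pop_options and must fall back to plain SSE2 code, which the addsd
   scan above confirms.  */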
@@ -120,6 +120,20 @@ proc check_effective_target_sse4a { } {
    } "-O2 -msse4a" ]
}
# Return 1 if fma4 instructions can be compiled.
proc check_effective_target_fma4 { } {
    return [check_no_compiler_messages fma4 object {
	typedef float __m128 __attribute__ ((__vector_size__ (16)));
	typedef float __v4sf __attribute__ ((__vector_size__ (16)));
	__m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
	{
	    return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A,
						     (__v4sf)__B,
						     (__v4sf)__C);
	}
    } "-O2 -mfma4" ]
}
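# A testcase that must only run where FMA4 support can be compiled can
# now gate on this keyword, e.g. (usage sketch):
#   /* { dg-require-effective-target fma4 } */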
# If a testcase doesn't have special options, use these.
global DEFAULT_CFLAGS
if ![info exists DEFAULT_CFLAGS] then {
...
@@ -27,5 +27,11 @@ main ()
#if defined __SSE4A__
abort ();
#endif
#if defined __AVX__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -mfma4 -mno-sse4" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -mfma4 -mno-ssse3" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -mfma4 -mno-sse3" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if defined __SSE3__
abort ();
#endif
#if defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -mfma4 -mno-sse2" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if defined __SSE2__
abort ();
#endif
#if defined __SSE3__
abort ();
#endif
#if defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -msse4a -mfma4 -mno-sse" } */
extern void abort (void);
@@ -27,5 +27,8 @@ main ()
#if defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -msse4 -mfma4" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if !defined __SSE4_1__
abort ();
#endif
#if !defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if !defined __AVX__
abort ();
#endif
#if !defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=x86-64 -msse4 -mfma4 -msse4a" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if !defined __SSE4_1__
abort ();
#endif
#if !defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if !defined __AVX__
abort ();
#endif
#if !defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=core2 -mfma4 -mno-sse4" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if defined __AVX__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
@@ -27,5 +27,11 @@ main ()
#if !defined __SSE4A__
abort ();
#endif
#if defined __AVX__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
@@ -28,5 +28,11 @@ main ()
#if !defined __SSE4A__
abort ();
#endif
#if defined __AVX__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if defined __AVX__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4a" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if !defined __SSSE3__
abort ();
#endif
#if !defined __SSE4_1__
abort ();
#endif
#if !defined __SSE4_2__
abort ();
#endif
#if defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* { dg-do run } */
/* { dg-options "-march=amdfam10 -mno-fma4" } */
extern void abort (void);
int
main ()
{
#if !defined __SSE__
abort ();
#endif
#if !defined __SSE2__
abort ();
#endif
#if !defined __SSE3__
abort ();
#endif
#if defined __SSSE3__
abort ();
#endif
#if defined __SSE4_1__
abort ();
#endif
#if defined __SSE4_2__
abort ();
#endif
#if !defined __SSE4A__
abort ();
#endif
#if defined __FMA4__
abort ();
#endif
return 0;
}
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are
usable with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -mavx -mfma4 -maes -mpclmul" } */
#include <x86intrin.h>
int dummy;
/* { dg-do compile } */
/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -maes -mpclmul" } */
#include <mm_malloc.h>
/* Test that the intrinsics compile with optimization. All of them are
defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h and mm3dnow.h
that reference the proper builtin functions. Defining away "extern" and
"__inline" results in all of them being compiled as proper functions. */
#define extern
#define __inline
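/* Illustration (the header line below is hypothetical, not quoted from
   any real header): with the two defines above, a declaration such as

     extern __inline __m128 _mm_macc_ps (__m128 A, __m128 B, __m128 C)

   preprocesses into the plain external definition

     __m128 _mm_macc_ps (__m128 A, __m128 B, __m128 C)

   so every intrinsic wrapper is emitted as a real out-of-line function
   rather than being inlined away.  */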
/* Following intrinsics require immediate arguments. */
/* ammintrin.h */
#define __builtin_ia32_extrqi(X, I, L) __builtin_ia32_extrqi(X, 1, 1)
#define __builtin_ia32_insertqi(X, Y, I, L) __builtin_ia32_insertqi(X, Y, 1, 1)
/* immintrin.h */
#define __builtin_ia32_blendpd256(X, Y, M) __builtin_ia32_blendpd256(X, Y, 1)
#define __builtin_ia32_blendps256(X, Y, M) __builtin_ia32_blendps256(X, Y, 1)
#define __builtin_ia32_dpps256(X, Y, M) __builtin_ia32_dpps256(X, Y, 1)
#define __builtin_ia32_shufpd256(X, Y, M) __builtin_ia32_shufpd256(X, Y, 1)
#define __builtin_ia32_shufps256(X, Y, M) __builtin_ia32_shufps256(X, Y, 1)
#define __builtin_ia32_cmpsd(X, Y, O) __builtin_ia32_cmpsd(X, Y, 1)
#define __builtin_ia32_cmpss(X, Y, O) __builtin_ia32_cmpss(X, Y, 1)
#define __builtin_ia32_cmppd(X, Y, O) __builtin_ia32_cmppd(X, Y, 1)
#define __builtin_ia32_cmpps(X, Y, O) __builtin_ia32_cmpps(X, Y, 1)
#define __builtin_ia32_cmppd256(X, Y, O) __builtin_ia32_cmppd256(X, Y, 1)
#define __builtin_ia32_cmpps256(X, Y, O) __builtin_ia32_cmpps256(X, Y, 1)
#define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1)
#define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1)
#define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1)
#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1)
#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1)
#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1)
#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1)
#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1)
#define __builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1)
#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1)
#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1)
#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1)
#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1)
#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1)
#define __builtin_ia32_vinsertf128_pd256(X, Y, C) __builtin_ia32_vinsertf128_pd256(X, Y, 1)
#define __builtin_ia32_vinsertf128_ps256(X, Y, C) __builtin_ia32_vinsertf128_ps256(X, Y, 1)
#define __builtin_ia32_vinsertf128_si256(X, Y, C) __builtin_ia32_vinsertf128_si256(X, Y, 1)
#define __builtin_ia32_roundpd256(V, M) __builtin_ia32_roundpd256(V, 1)
#define __builtin_ia32_roundps256(V, M) __builtin_ia32_roundps256(V, 1)
/* wmmintrin.h */
#define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1)
#define __builtin_ia32_pclmulqdq128(X, Y, I) __builtin_ia32_pclmulqdq128(X, Y, 1)
/* smmintrin.h */
#define __builtin_ia32_roundpd(V, M) __builtin_ia32_roundpd(V, 1)
#define __builtin_ia32_roundsd(D, V, M) __builtin_ia32_roundsd(D, V, 1)
#define __builtin_ia32_roundps(V, M) __builtin_ia32_roundps(V, 1)
#define __builtin_ia32_roundss(D, V, M) __builtin_ia32_roundss(D, V, 1)
#define __builtin_ia32_pblendw128(X, Y, M) __builtin_ia32_pblendw128 (X, Y, 1)
#define __builtin_ia32_blendps(X, Y, M) __builtin_ia32_blendps(X, Y, 1)
#define __builtin_ia32_blendpd(X, Y, M) __builtin_ia32_blendpd(X, Y, 1)
#define __builtin_ia32_dpps(X, Y, M) __builtin_ia32_dpps(X, Y, 1)
#define __builtin_ia32_dppd(X, Y, M) __builtin_ia32_dppd(X, Y, 1)
#define __builtin_ia32_insertps128(D, S, N) __builtin_ia32_insertps128(D, S, 1)
#define __builtin_ia32_vec_ext_v4sf(X, N) __builtin_ia32_vec_ext_v4sf(X, 1)
#define __builtin_ia32_vec_set_v16qi(D, S, N) __builtin_ia32_vec_set_v16qi(D, S, 1)
#define __builtin_ia32_vec_set_v4si(D, S, N) __builtin_ia32_vec_set_v4si(D, S, 1)
#define __builtin_ia32_vec_set_v2di(D, S, N) __builtin_ia32_vec_set_v2di(D, S, 1)
#define __builtin_ia32_vec_ext_v16qi(X, N) __builtin_ia32_vec_ext_v16qi(X, 1)
#define __builtin_ia32_vec_ext_v4si(X, N) __builtin_ia32_vec_ext_v4si(X, 1)
#define __builtin_ia32_vec_ext_v2di(X, N) __builtin_ia32_vec_ext_v2di(X, 1)
#define __builtin_ia32_mpsadbw128(X, Y, M) __builtin_ia32_mpsadbw128(X, Y, 1)
#define __builtin_ia32_pcmpistrm128(X, Y, M) \
__builtin_ia32_pcmpistrm128(X, Y, 1)
#define __builtin_ia32_pcmpistri128(X, Y, M) \
__builtin_ia32_pcmpistri128(X, Y, 1)
#define __builtin_ia32_pcmpestrm128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestrm128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpestri128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestri128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpistria128(X, Y, M) \
__builtin_ia32_pcmpistria128(X, Y, 1)
#define __builtin_ia32_pcmpistric128(X, Y, M) \
__builtin_ia32_pcmpistric128(X, Y, 1)
#define __builtin_ia32_pcmpistrio128(X, Y, M) \
__builtin_ia32_pcmpistrio128(X, Y, 1)
#define __builtin_ia32_pcmpistris128(X, Y, M) \
__builtin_ia32_pcmpistris128(X, Y, 1)
#define __builtin_ia32_pcmpistriz128(X, Y, M) \
__builtin_ia32_pcmpistriz128(X, Y, 1)
#define __builtin_ia32_pcmpestria128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestria128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpestric128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestric128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpestrio128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestrio128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpestris128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestris128(X, LX, Y, LY, 1)
#define __builtin_ia32_pcmpestriz128(X, LX, Y, LY, M) \
__builtin_ia32_pcmpestriz128(X, LX, Y, LY, 1)
/* tmmintrin.h */
#define __builtin_ia32_palignr128(X, Y, N) __builtin_ia32_palignr128(X, Y, 8)
#define __builtin_ia32_palignr(X, Y, N) __builtin_ia32_palignr(X, Y, 8)
/* emmintrin.h */
#define __builtin_ia32_psrldqi128(A, B) __builtin_ia32_psrldqi128(A, 8)
#define __builtin_ia32_pslldqi128(A, B) __builtin_ia32_pslldqi128(A, 8)
#define __builtin_ia32_pshufhw(A, N) __builtin_ia32_pshufhw(A, 0)
#define __builtin_ia32_pshuflw(A, N) __builtin_ia32_pshuflw(A, 0)
#define __builtin_ia32_pshufd(A, N) __builtin_ia32_pshufd(A, 0)
#define __builtin_ia32_vec_set_v8hi(A, D, N) \
__builtin_ia32_vec_set_v8hi(A, D, 0)
#define __builtin_ia32_vec_ext_v8hi(A, N) __builtin_ia32_vec_ext_v8hi(A, 0)
#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0)
/* xmmintrin.h */
#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, A, _MM_HINT_NTA)
#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0)
#define __builtin_ia32_vec_set_v4hi(A, D, N) \
__builtin_ia32_vec_set_v4hi(A, D, 0)
#define __builtin_ia32_vec_ext_v4hi(A, N) __builtin_ia32_vec_ext_v4hi(A, 0)
#define __builtin_ia32_shufps(A, B, N) __builtin_ia32_shufps(A, B, 0)
#include <x86intrin.h>
/* { dg-do compile } */
/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */
#include <mm_malloc.h>
/* Test that the intrinsics compile without optimization. All of them are
defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h
that reference the proper builtin functions. Defining away "extern" and
"__inline" results in all of them being compiled as proper functions. */
#define extern
#define __inline
#include <x86intrin.h>
#define _CONCAT(x,y) x ## y
#define test_1(func, type, op1_type, imm) \
type _CONCAT(_,func) (op1_type A, int const I) \
{ return func (A, imm); }
#define test_1x(func, type, op1_type, imm1, imm2) \
type _CONCAT(_,func) (op1_type A, int const I, int const L) \
{ return func (A, imm1, imm2); }
#define test_2(func, type, op1_type, op2_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \
{ return func (A, B, imm); }
#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \
type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
{ return func (A, B, imm1, imm2); }
#define test_3(func, type, op1_type, op2_type, op3_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, \
op3_type C, int const I) \
{ return func (A, B, C, imm); }
#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, \
op3_type C, op4_type D, int const I) \
{ return func (A, B, C, D, imm); }
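/* For example, test_2 (_mm_dp_ps, __m128, __m128, __m128, 1) expands to

     __m128 __mm_dp_ps (__m128 A, __m128 B, int const I)
     { return _mm_dp_ps (A, B, 1); }

   so each immediate-taking intrinsic gets a compilable wrapper whose
   immediate argument is a literal constant, as the builtins require.  */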
/* Following intrinsics require immediate arguments. They
are defined as macros for non-optimized compilations. */
/* ammintrin.h */
test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1)
test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1)
/* immintrin.h */
test_2 (_mm256_blend_pd, __m256d, __m256d, __m256d, 1)
test_2 (_mm256_blend_ps, __m256, __m256, __m256, 1)
test_2 (_mm256_dp_ps, __m256, __m256, __m256, 1)
test_2 (_mm256_shuffle_pd, __m256d, __m256d, __m256d, 1)
test_2 (_mm256_shuffle_ps, __m256, __m256, __m256, 1)
test_2 (_mm_cmp_sd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_cmp_ss, __m128, __m128, __m128, 1)
test_2 (_mm_cmp_pd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_cmp_ps, __m128, __m128, __m128, 1)
test_2 (_mm256_cmp_pd, __m256d, __m256d, __m256d, 1)
test_2 (_mm256_cmp_ps, __m256, __m256, __m256, 1)
test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1)
test_1 (_mm256_extractf128_ps, __m128, __m256, 1)
test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1)
test_1 (_mm256_extract_epi8, int, __m256i, 20)
test_1 (_mm256_extract_epi16, int, __m256i, 13)
test_1 (_mm256_extract_epi32, int, __m256i, 6)
#ifdef __x86_64__
test_1 (_mm256_extract_epi64, long long, __m256i, 2)
#endif
test_1 (_mm_permute_pd, __m128d, __m128d, 1)
test_1 (_mm256_permute_pd, __m256d, __m256d, 1)
test_1 (_mm_permute_ps, __m128, __m128, 1)
test_1 (_mm256_permute_ps, __m256, __m256, 1)
test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1)
test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1)
test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1)
test_2 (_mm256_insertf128_pd, __m256d, __m256d, __m128d, 1)
test_2 (_mm256_insertf128_ps, __m256, __m256, __m128, 1)
test_2 (_mm256_insertf128_si256, __m256i, __m256i, __m128i, 1)
test_2 (_mm256_insert_epi8, __m256i, __m256i, int, 30)
test_2 (_mm256_insert_epi16, __m256i, __m256i, int, 7)
test_2 (_mm256_insert_epi32, __m256i, __m256i, int, 3)
#ifdef __x86_64__
test_2 (_mm256_insert_epi64, __m256i, __m256i, long long, 1)
#endif
test_1 (_mm256_round_pd, __m256d, __m256d, 1)
test_1 (_mm256_round_ps, __m256, __m256, 1)
/* wmmintrin.h */
test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1)
/* smmintrin.h */
test_1 (_mm_round_pd, __m128d, __m128d, 1)
test_1 (_mm_round_ps, __m128, __m128, 1)
test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_round_ss, __m128, __m128, __m128, 1)
test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1)
test_2 (_mm_blend_ps, __m128, __m128, __m128, 1)
test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_dp_ps, __m128, __m128, __m128, 1)
test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_insert_ps, __m128, __m128, __m128, 1)
test_1 (_mm_extract_ps, int, __m128, 1)
test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1)
test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1)
#ifdef __x86_64__
test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1)
#endif
test_1 (_mm_extract_epi8, int, __m128i, 1)
test_1 (_mm_extract_epi32, int, __m128i, 1)
#ifdef __x86_64__
test_1 (_mm_extract_epi64, long long, __m128i, 1)
#endif
test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1)
test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1)
test_2 (_mm_cmpistri, int, __m128i, __m128i, 1)
test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1)
test_2 (_mm_cmpistra, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistro, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1)
test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
/* tmmintrin.h */
test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1)
test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1)
/* emmintrin.h */
test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1)
test_1 (_mm_srli_si128, __m128i, __m128i, 1)
test_1 (_mm_slli_si128, __m128i, __m128i, 1)
test_1 (_mm_extract_epi16, int, __m128i, 1)
test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1)
test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1)
test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1)
test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1)
/* xmmintrin.h */
test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1)
test_1 (_mm_extract_pi16, int, __m64, 1)
test_1 (_m_pextrw, int, __m64, 1)
test_2 (_mm_insert_pi16, __m64, __m64, int, 1)
test_2 (_m_pinsrw, __m64, __m64, int, 1)
test_1 (_mm_shuffle_pi16, __m64, __m64, 1)
test_1 (_m_pshufw, __m64, __m64, 1)
test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA)
/* Same as sse-14, except converted to use #pragma GCC target.  */
/* { dg-do compile } */
/* { dg-options "-O0 -Werror-implicit-function-declaration" } */
#include <mm_malloc.h>
/* Test that the intrinsics compile without optimization. All of them are
defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h
that reference the proper builtin functions. Defining away "extern" and
"__inline" results in all of them being compiled as proper functions. */
#define extern
#define __inline
#define _CONCAT(x,y) x ## y
#define test_1(func, type, op1_type, imm) \
type _CONCAT(_,func) (op1_type A, int const I) \
{ return func (A, imm); }
#define test_1x(func, type, op1_type, imm1, imm2) \
type _CONCAT(_,func) (op1_type A, int const I, int const L) \
{ return func (A, imm1, imm2); }
#define test_2(func, type, op1_type, op2_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \
{ return func (A, B, imm); }
#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \
type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \
{ return func (A, B, imm1, imm2); }
#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \
type _CONCAT(_,func) (op1_type A, op2_type B, \
op3_type C, op4_type D, int const I) \
{ return func (A, B, C, D, imm); }
#ifndef DIFFERENT_PRAGMAS
#pragma GCC target ("mmx,3dnow,sse,sse2,sse3,ssse3,sse4.1,sse4.2,sse4a,aes,pclmul")
#endif
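/* When DIFFERENT_PRAGMAS is defined (by a wrapper testcase that
   includes this file), each header below is instead covered by its own
   minimal target pragma, verifying that each header compiles with only
   the ISA it documents.  */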
/* Following intrinsics require immediate arguments. They
are defined as macros for non-optimized compilations. */
/* mmintrin.h (MMX). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("mmx")
#endif
#include <mmintrin.h>
/* mm3dnow.h (3DNOW). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("3dnow")
#endif
#include <mm3dnow.h>
/* xmmintrin.h (SSE). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("sse")
#endif
#include <xmmintrin.h>
test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1)
test_1 (_mm_extract_pi16, int, __m64, 1)
test_1 (_m_pextrw, int, __m64, 1)
test_2 (_mm_insert_pi16, __m64, __m64, int, 1)
test_2 (_m_pinsrw, __m64, __m64, int, 1)
test_1 (_mm_shuffle_pi16, __m64, __m64, 1)
test_1 (_m_pshufw, __m64, __m64, 1)
test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA)
/* emmintrin.h (SSE2). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("sse2")
#endif
#include <emmintrin.h>
test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1)
test_1 (_mm_srli_si128, __m128i, __m128i, 1)
test_1 (_mm_slli_si128, __m128i, __m128i, 1)
test_1 (_mm_extract_epi16, int, __m128i, 1)
test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1)
test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1)
test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1)
test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1)
/* pmmintrin.h (SSE3). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("sse3")
#endif
#include <pmmintrin.h>
/* tmmintrin.h (SSSE3). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("ssse3")
#endif
#include <tmmintrin.h>
test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1)
test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1)
/* ammintrin.h (SSE4A). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("sse4a")
#endif
#include <ammintrin.h>
test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1)
test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1)
/* smmintrin.h (SSE4.1). */
/* nmmintrin.h (SSE4.2). */
/* Note, nmmintrin.h includes smmintrin.h, and smmintrin.h checks the
   SSE4.1 feature test macro before defining anything.  So just set the
   target to SSE4.2, which also covers SSE4.1.  */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("sse4.2")
#endif
#include <nmmintrin.h>
test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1)
test_2 (_mm_blend_ps, __m128, __m128, __m128, 1)
test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_dp_ps, __m128, __m128, __m128, 1)
test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_insert_ps, __m128, __m128, __m128, 1)
test_1 (_mm_extract_ps, int, __m128, 1)
test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1)
test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1)
#ifdef __x86_64__
test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1)
#endif
test_1 (_mm_extract_epi8, int, __m128i, 1)
test_1 (_mm_extract_epi32, int, __m128i, 1)
#ifdef __x86_64__
test_1 (_mm_extract_epi64, long long, __m128i, 1)
#endif
test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1)
test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1)
test_2 (_mm_cmpistri, int, __m128i, __m128i, 1)
test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1)
test_2 (_mm_cmpistra, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistro, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1)
test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1)
test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1)
test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1)
/* wmmintrin.h (AES/PCLMUL). */
#ifdef DIFFERENT_PRAGMAS
#pragma GCC target ("aes,pclmul")
#endif
#include <wmmintrin.h>
test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1)
test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1)
/* smmintrin.h (SSE4.1). */
test_1 (_mm_round_pd, __m128d, __m128d, 1)
test_1 (_mm_round_ps, __m128, __m128, 1)
test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1)
test_2 (_mm_round_ss, __m128, __m128, __m128, 1)