Commit 1d5cf7fc by Thomas Koenig

re PR libfortran/78379 (Processor-specific versions for matmul)

2017-05-25  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR libfortran/78379
	* Makefile.am: Add generated/matmulavx128_*.c files.
	Handle them for compiling and setting the right flags.
	* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
	* configure.ac: Call them.
	* Makefile.in: Regenerated.
	* config.h.in: Regenerated.
	* configure: Regenerated.
	* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
	versions which use FMA3 or FMA4.
	* m4/matmulavx128.m4: New file.
	* generated/matmul_c10.c: Regenerated.
	* generated/matmul_c16.c: Regenerated.
	* generated/matmul_c4.c: Regenerated.
	* generated/matmul_c8.c: Regenerated.
	* generated/matmul_i1.c: Regenerated.
	* generated/matmul_i16.c: Regenerated.
	* generated/matmul_i2.c: Regenerated.
	* generated/matmul_i4.c: Regenerated.
	* generated/matmul_i8.c: Regenerated.
	* generated/matmul_r10.c: Regenerated.
	* generated/matmul_r16.c: Regenerated.
	* generated/matmul_r4.c: Regenerated.
	* generated/matmul_r8.c: Regenerated.
	* generated/matmulavx128_c10.c: New file.
	* generated/matmulavx128_c16.c: New file.
	* generated/matmulavx128_c4.c: New file.
	* generated/matmulavx128_c8.c: New file.
	* generated/matmulavx128_i1.c: New file.
	* generated/matmulavx128_i16.c: New file.
	* generated/matmulavx128_i2.c: New file.
	* generated/matmulavx128_i4.c: New file.
	* generated/matmulavx128_i8.c: New file.
	* generated/matmulavx128_r10.c: New file.
	* generated/matmulavx128_r16.c: New file.
	* generated/matmulavx128_r4.c: New file.
	* generated/matmulavx128_r8.c: New file.

From-SVN: r248472
parent 87e1e603
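
The core of the change is the runtime dispatch added to the generated matmul wrappers: on AMD processors that expose AVX together with FMA3 or FMA4, the library now selects 128-bit AVX kernels instead of the 256-bit variants tuned for Intel chips. As a rough orientation before the diffs, here is a minimal C sketch of that selection logic using GCC's x86 CPU builtins; the kernel stubs and function names are hypothetical, and the library itself inspects __cpu_model directly rather than calling these builtins, as the generated files below show.

/* Illustrative sketch only (x86, GCC builtins); not the libgfortran code.
   Picks a matmul kernel based on the host CPU, mirroring the new
   VENDOR_AMD branch that prefers 128-bit AVX with FMA3/FMA4.  */
#include <stdio.h>

typedef void (*matmul_fn_t) (void);

static void matmul_vanilla (void)     { puts ("vanilla kernel"); }
static void matmul_avx128_fma3 (void) { puts ("avx128 + fma3 kernel"); }
static void matmul_avx128_fma4 (void) { puts ("avx128 + fma4 kernel"); }

static matmul_fn_t
pick_matmul (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_is ("amd") && __builtin_cpu_supports ("avx"))
    {
      if (__builtin_cpu_supports ("fma"))
        return matmul_avx128_fma3;
      if (__builtin_cpu_supports ("fma4"))
        return matmul_avx128_fma4;
    }
  return matmul_vanilla;
}

int
main (void)
{
  matmul_fn_t fn = pick_matmul ();
  fn ();
  return 0;
}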
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
Jerry DeLisle <jvdelisle@gcc.gnu.org>
......@@ -14,7 +54,7 @@
(st_endfile): Likewise.
(st_rewind): Likewise.
(st_flush): Likewise.
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727
......
......@@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c= \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c= \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
......@@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
......@@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
intrinsics/f2c_specifics.F90
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
if HAVE_AVX128
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
endif
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
......@@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
......
......@@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA3
dnl
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA3, 1,
[Define if FMA3 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA4
dnl
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA4, 1,
[Define if FMA4 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for -mprefer-avx128
dnl This also defines an automake conditional.
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
void foo()
{
}]], [[]])],
AC_DEFINE(HAVE_AVX128, 1,
[Define if -mprefer-avx128 is supported.])
AM_CONDITIONAL([HAVE_AVX128],true),
[])
CFLAGS="$ac_save_CFLAGS"
])
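
The three macros above follow the pattern of the existing AVX checks: compile a tiny probe with the candidate flags and define the corresponding HAVE_* macro only if that succeeds; LIBGFOR_CHECK_AVX128 additionally sets an automake conditional so the Makefile can pass -mprefer-avx128. A standalone equivalent of the probes, for reproducing the checks by hand (the file name and compile commands are illustrative, not part of the patch):

/* probe_fma.c -- hand-run equivalent of the LIBGFOR_CHECK_FMA3 / FMA4
   probes above (hypothetical file; configure writes its own conftest).

     FMA3 check:   gcc -O2 -mfma  -mno-fma4 -c probe_fma.c
     FMA4 check:   gcc -O2 -mfma4 -mno-fma  -c probe_fma.c
     AVX128 check: gcc -O2 -mavx -mprefer-avx128 -c probe_fma.c

   Only a successful compile is tested; on success configure defines
   HAVE_FMA3, HAVE_FMA4 or HAVE_AVX128 respectively.  */

float
flt_mul_add (float a, float b, float c)
{
  /* With -mfma or -mfma4 this becomes a fused multiply-add instruction.  */
  return __builtin_fmaf (a, b, c);
}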
......@@ -81,6 +81,9 @@
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
/* Define if -mprefer-avx128 is supported. */
#undef HAVE_AVX128
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
......@@ -375,6 +378,12 @@
/* Define to 1 if you have the `floorl' function. */
#undef HAVE_FLOORL
/* Define if FMA3 instructions can be compiled. */
#undef HAVE_FMA3
/* Define if FMA4 instructions can be compiled. */
#undef HAVE_FMA4
/* Define to 1 if you have the `fmod' function. */
#undef HAVE_FMOD
......
......@@ -606,6 +606,8 @@ am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
get_gcc_base_ver
HAVE_AVX128_FALSE
HAVE_AVX128_TRUE
IEEE_FLAGS
IEEE_SUPPORT
IEEE_SUPPORT_FALSE
......@@ -12421,7 +12423,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12424 "configure"
#line 12426 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
......@@ -12527,7 +12529,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12530 "configure"
#line 12532 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
......@@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA3 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA4 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check if AVX128 works
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
void foo()
{
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX128 1" >>confdefs.h
if true; then
HAVE_AVX128_TRUE=
HAVE_AVX128_FALSE='#'
else
HAVE_AVX128_TRUE='#'
HAVE_AVX128_FALSE=
fi
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Determine what GCC version number to use in filesystem paths.
get_gcc_base_ver="cat"
......@@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0
......
......@@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
# Check whether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
# Check for FMA3 extensions
LIBGFOR_CHECK_FMA3
# Check for FMA4 extensions
LIBGFOR_CHECK_FMA4
# Check if AVX128 works
LIBGFOR_CHECK_AVX128
# Determine what GCC version number to use in filesystem paths.
GCC_BASE_VER
......
......@@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
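
The dispatch block above, which recurs for every type kind below, resolves matmul_fn once and then publishes it with a relaxed atomic store; later calls load the cached pointer the same way, so CPU detection is not repeated on every MATMUL. A minimal sketch of that cache-on-first-call pattern, with hypothetical kernel names:

/* Sketch of the cached function-pointer dispatch used by the matmul
   wrappers; names are illustrative, not the generated code.  */
#include <stdio.h>

typedef void (*kernel_t) (void);

static void kernel_generic (void) { puts ("generic"); }
static void kernel_fast (void)    { puts ("fast"); }

static kernel_t kernel_p;   /* NULL until the first call resolves it.  */

static void
run_kernel (void)
{
  kernel_t fn = __atomic_load_n (&kernel_p, __ATOMIC_RELAXED);
  if (fn == NULL)
    {
      /* First call (or a benign race): pick a kernel and cache it.
         Relaxed ordering suffices because every thread that races here
         computes the same answer.  */
      fn = kernel_fast;   /* stand-in for the CPU-feature checks */
      __atomic_store_n (&kernel_p, fn, __ATOMIC_RELAXED);
    }
  fn ();
}

int
main (void)
{
  run_kernel ();
  run_kernel ();   /* second call uses the cached pointer */
  return 0;
}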
......@@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i1_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i1_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i2_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i2_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i2_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i2_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
......@@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
`/* Implementation of the MATMUL intrinsic
Copyright (C) 2002-2017 Free Software Foundation, Inc.
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
This file is part of the GNU Fortran runtime library (libgfortran).
Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>'
include(iparm.m4)dnl
/* These are the specific versions of matmul with -mprefer-avx128. */
`#if defined (HAVE_'rtype_name`)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const 'rtype_name` *, const 'rtype_name` *,
const int *, const 'rtype_name` *, const int *,
const 'rtype_name` *, 'rtype_name` *, const int *,
int, int);
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#endif
'
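
The avx128_fma3/avx128_fma4 variants generated from this template rely on GCC's target function attribute: the file is compiled with ordinary baseline flags (plus -mprefer-avx128 where available), and only the attributed functions are allowed to use AVX and FMA instructions. A small self-contained illustration of that mechanism, not taken from the patch:

/* Minimal illustration (hypothetical example) of the target attribute used
   by the avx128 variants: only fma_sum is compiled with AVX/FMA enabled;
   the rest of the file keeps the baseline ISA.  */
#include <stdio.h>

__attribute__((target("avx,fma")))
static double
fma_sum (const double *a, const double *b, const double *c, int n)
{
  double s = 0.0;
  /* At -O2 the compiler may vectorize this loop with AVX and contract
     a[i]*b[i] + c[i] into FMA instructions.  */
  for (int i = 0; i < n; i++)
    s += a[i] * b[i] + c[i];
  return s;
}

int
main (void)
{
  double a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {1, 1, 1, 1};

  __builtin_cpu_init ();
  /* Guard the call at run time, as the matmul wrappers do, so the program
     stays safe on CPUs without AVX/FMA.  */
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
    printf ("%g\n", fma_sum (a, b, c, 4));
  else
    puts ("CPU lacks AVX/FMA; a real dispatcher would pick a vanilla kernel");
  return 0;
}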