Commit 1d5cf7fc by Thomas Koenig

re PR libfortran/78379 (Processor-specific versions for matmul)

2017-05-25  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR libfortran/78379
	* Makefile.am: Add generated/matmulavx128_*.c files.
	Handle them for compiling and setting the right flags.
	* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
	* configure.ac: Call them.
	* Makefile.in: Regenerated.
	* config.h.in: Regenerated.
	* configure: Regenerated.
	* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
	versions which use FMA3 or FMA4.
	* m4/matmulavx128.m4: New file.
	* generated/matmul_c10.c: Regenerated.
	* generated/matmul_c16.c: Regenerated.
	* generated/matmul_c4.c: Regenerated.
	* generated/matmul_c8.c: Regenerated.
	* generated/matmul_i1.c: Regenerated.
	* generated/matmul_i16.c: Regenerated.
	* generated/matmul_i2.c: Regenerated.
	* generated/matmul_i4.c: Regenerated.
	* generated/matmul_i8.c: Regenerated.
	* generated/matmul_r10.c: Regenerated.
	* generated/matmul_r16.c: Regenerated.
	* generated/matmul_r4.c: Regenerated.
	* generated/matmul_r8.c: Regenerated.
	* generated/matmulavx128_c10.c: New file.
	* generated/matmulavx128_c16.c: New file.
	* generated/matmulavx128_c4.c: New file.
	* generated/matmulavx128_c8.c: New file.
	* generated/matmulavx128_i1.c: New file.
	* generated/matmulavx128_i16.c: New file.
	* generated/matmulavx128_i2.c: New file.
	* generated/matmulavx128_i4.c: New file.
	* generated/matmulavx128_i8.c: New file.
	* generated/matmulavx128_r10.c: New file.
	* generated/matmulavx128_r16.c: New file.
	* generated/matmulavx128_r4.c: New file.
	* generated/matmulavx128_r8.c: New file.

From-SVN: r248472
parent 87e1e603
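
The core of the change is the runtime dispatch added to the generated matmul wrappers: on AMD processors that expose AVX together with FMA3 or FMA4, the library now selects 128-bit AVX kernels instead of the 256-bit variants tuned for Intel chips. As a rough orientation before the diffs, here is a minimal C sketch of that selection logic using GCC's x86 CPU builtins; the kernel stubs and function names are hypothetical, and the library itself inspects __cpu_model directly rather than calling these builtins, as the generated files below show.

/* Illustrative sketch only (x86, GCC builtins); not the libgfortran code.
   Picks a matmul kernel based on the host CPU, mirroring the new
   VENDOR_AMD branch that prefers 128-bit AVX with FMA3/FMA4.  */
#include <stdio.h>

typedef void (*matmul_fn_t) (void);

static void matmul_vanilla (void)     { puts ("vanilla kernel"); }
static void matmul_avx128_fma3 (void) { puts ("avx128 + fma3 kernel"); }
static void matmul_avx128_fma4 (void) { puts ("avx128 + fma4 kernel"); }

static matmul_fn_t
pick_matmul (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_is ("amd") && __builtin_cpu_supports ("avx"))
    {
      if (__builtin_cpu_supports ("fma"))
        return matmul_avx128_fma3;
      if (__builtin_cpu_supports ("fma4"))
        return matmul_avx128_fma4;
    }
  return matmul_vanilla;
}

int
main (void)
{
  matmul_fn_t fn = pick_matmul ();
  fn ();
  return 0;
}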
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
Jerry DeLisle <jvdelisle@gcc.gnu.org>
......@@ -14,7 +54,7 @@
(st_endfile): Likewise.
(st_rewind): Likewise.
(st_flush): Likewise.
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727
......
......@@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c= \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c= \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
......@@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
......@@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
intrinsics/f2c_specifics.F90
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
if HAVE_AVX128
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
endif
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
......@@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
......
......@@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA3
dnl
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA3, 1,
[Define if FMA3 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA4
dnl
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA4, 1,
[Define if FMA4 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for -mprefer-avx128
dnl This also defines an automake conditional.
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
void foo()
{
}]], [[]])],
AC_DEFINE(HAVE_AVX128, 1,
[Define if -mprefer-avx128 is supported.])
AM_CONDITIONAL([HAVE_AVX128],true),
[])
CFLAGS="$ac_save_CFLAGS"
])
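
The three macros above follow the pattern of the existing AVX checks: compile a tiny probe with the candidate flags and define the corresponding HAVE_* macro only if that succeeds; LIBGFOR_CHECK_AVX128 additionally sets an automake conditional so the Makefile can pass -mprefer-avx128. A standalone equivalent of the probes, for reproducing the checks by hand (the file name and compile commands are illustrative, not part of the patch):

/* probe_fma.c -- hand-run equivalent of the LIBGFOR_CHECK_FMA3 / FMA4
   probes above (hypothetical file; configure writes its own conftest).

     FMA3 check:   gcc -O2 -mfma  -mno-fma4 -c probe_fma.c
     FMA4 check:   gcc -O2 -mfma4 -mno-fma  -c probe_fma.c
     AVX128 check: gcc -O2 -mavx -mprefer-avx128 -c probe_fma.c

   Only a successful compile is tested; on success configure defines
   HAVE_FMA3, HAVE_FMA4 or HAVE_AVX128 respectively.  */

float
flt_mul_add (float a, float b, float c)
{
  /* With -mfma or -mfma4 this becomes a fused multiply-add instruction.  */
  return __builtin_fmaf (a, b, c);
}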
......@@ -81,6 +81,9 @@
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
/* Define if -mprefer-avx128 is supported. */
#undef HAVE_AVX128
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
......@@ -375,6 +378,12 @@
/* Define to 1 if you have the `floorl' function. */
#undef HAVE_FLOORL
/* Define if FMA3 instructions can be compiled. */
#undef HAVE_FMA3
/* Define if FMA4 instructions can be compiled. */
#undef HAVE_FMA4
/* Define to 1 if you have the `fmod' function. */
#undef HAVE_FMOD
......
......@@ -606,6 +606,8 @@ am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
get_gcc_base_ver
HAVE_AVX128_FALSE
HAVE_AVX128_TRUE
IEEE_FLAGS
IEEE_SUPPORT
IEEE_SUPPORT_FALSE
......@@ -12421,7 +12423,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12424 "configure"
#line 12426 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
......@@ -12527,7 +12529,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12530 "configure"
#line 12532 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
......@@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA3 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA4 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check if AVX128 works
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
void foo()
{
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX128 1" >>confdefs.h
if true; then
HAVE_AVX128_TRUE=
HAVE_AVX128_FALSE='#'
else
HAVE_AVX128_TRUE='#'
HAVE_AVX128_FALSE=
fi
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Determine what GCC version number to use in filesystem paths.
get_gcc_base_ver="cat"
......@@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0
......
......@@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
# Check whether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
# Check for FMA3 extensions
LIBGFOR_CHECK_FMA3
# Check for FMA4 extensions
LIBGFOR_CHECK_FMA4
# Check if AVX128 works
LIBGFOR_CHECK_AVX128
# Determine what GCC version number to use in filesystem paths.
GCC_BASE_VER
......
......@@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
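
The dispatch block above, which recurs for every type kind below, resolves matmul_fn once and then publishes it with a relaxed atomic store; later calls load the cached pointer the same way, so CPU detection is not repeated on every MATMUL. A minimal sketch of that cache-on-first-call pattern, with hypothetical kernel names:

/* Sketch of the cached function-pointer dispatch used by the matmul
   wrappers; names are illustrative, not the generated code.  */
#include <stdio.h>

typedef void (*kernel_t) (void);

static void kernel_generic (void) { puts ("generic"); }
static void kernel_fast (void)    { puts ("fast"); }

static kernel_t kernel_p;   /* NULL until the first call resolves it.  */

static void
run_kernel (void)
{
  kernel_t fn = __atomic_load_n (&kernel_p, __ATOMIC_RELAXED);
  if (fn == NULL)
    {
      /* First call (or a benign race): pick a kernel and cache it.
         Relaxed ordering suffices because every thread that races here
         computes the same answer.  */
      fn = kernel_fast;   /* stand-in for the CPU-feature checks */
      __atomic_store_n (&kernel_p, fn, __ATOMIC_RELAXED);
    }
  fn ();
}

int
main (void)
{
  run_kernel ();
  run_kernel ();   /* second call uses the cached pointer */
  return 0;
}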
......@@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i1_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i1_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i2_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i2_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i2_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i2_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
......@@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
......@@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
......@@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}
......
`/* Implementation of the MATMUL intrinsic
Copyright (C) 2002-2017 Free Software Foundation, Inc.
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
This file is part of the GNU Fortran runtime library (libgfortran).
Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>'
include(iparm.m4)dnl
/* These are the specific versions of matmul with -mprefer-avx128. */
`#if defined (HAVE_'rtype_name`)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const 'rtype_name` *, const 'rtype_name` *,
const int *, const 'rtype_name` *, const int *,
const 'rtype_name` *, 'rtype_name` *, const int *,
int, int);
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#endif
'
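
The avx128_fma3/avx128_fma4 variants generated from this template rely on GCC's target function attribute: the file is compiled with ordinary baseline flags (plus -mprefer-avx128 where available), and only the attributed functions are allowed to use AVX and FMA instructions. A small self-contained illustration of that mechanism, not taken from the patch:

/* Minimal illustration (hypothetical example) of the target attribute used
   by the avx128 variants: only fma_sum is compiled with AVX/FMA enabled;
   the rest of the file keeps the baseline ISA.  */
#include <stdio.h>

__attribute__((target("avx,fma")))
static double
fma_sum (const double *a, const double *b, const double *c, int n)
{
  double s = 0.0;
  /* At -O2 the compiler may vectorize this loop with AVX and contract
     a[i]*b[i] + c[i] into FMA instructions.  */
  for (int i = 0; i < n; i++)
    s += a[i] * b[i] + c[i];
  return s;
}

int
main (void)
{
  double a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {1, 1, 1, 1};

  __builtin_cpu_init ();
  /* Guard the call at run time, as the matmul wrappers do, so the program
     stays safe on CPUs without AVX/FMA.  */
  if (__builtin_cpu_supports ("avx") && __builtin_cpu_supports ("fma"))
    printf ("%g\n", fma_sum (a, b, c, 4));
  else
    puts ("CPU lacks AVX/FMA; a real dispatcher would pick a vanilla kernel");
  return 0;
}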