Commit 55cd2379 by H.J. Lu, committed by H.J. Lu

i386: Emulate MMX maskmovq with SSE2 maskmovdqu

Emulate MMX maskmovq with SSE2 maskmovdqu for TARGET_MMX_WITH_SSE by
zero-extending the source and mask operands to 128 bits.  Handle possibly
unmapped bits 64:127 at the memory address by adjusting the source and
mask operands together with the memory address.

	PR target/89021
	* config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
	maskmovdqu for __MMX_WITH_SSE__.

From-SVN: r271234
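For reference, a minimal usage sketch of the intrinsic being emulated (not part of this commit; the buffer name and values are illustrative): _mm_maskmove_si64 stores only the bytes of the data operand whose corresponding mask byte has its most significant bit set, leaving the other destination bytes untouched.

/* Usage sketch for _mm_maskmove_si64 (illustrative, not from the commit).
   Bytes of DATA whose corresponding MASK byte has its MSB set are stored
   to BUF; the remaining bytes of BUF are left untouched.  */
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>

int
main (void)
{
  char buf[8];
  memset (buf, '.', sizeof buf);
  /* Low four data bytes are 'a'..'d'; the mask selects only those four.  */
  __m64 data = _mm_set_pi8 ('h', 'g', 'f', 'e', 'd', 'c', 'b', 'a');
  __m64 mask = _mm_set_pi8 (0, 0, 0, 0, -1, -1, -1, -1);
  _mm_maskmove_si64 (data, mask, buf);
  _mm_empty ();
  printf ("%.8s\n", buf);   /* prints "abcd...."  */
  return 0;
}

On x86-64 the required MMX and SSE support is enabled by default; 32-bit x86 typically needs -mmmx -msse. On targets where the compiler defines __MMX_WITH_SSE__, the call expands to the maskmovdqu-based emulation shown in the diff below.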
parent 9377b54a
2019-05-15  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/89021
	* config/i386/xmmintrin.h: Emulate MMX maskmovq with SSE2
	maskmovdqu for __MMX_WITH_SSE__.

2019-05-15  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/89021
	* config/i386/mmx.md (mmx_umulv4hi3_highpart): Also check
	TARGET_MMX and TARGET_MMX_WITH_SSE.
	(*mmx_umulv4hi3_highpart): Add SSE emulation.
@@ -1165,7 +1165,68 @@ _m_pshufw (__m64 __A, int const __N)
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
#ifdef __MMX_WITH_SSE__
  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
     64:127 at address __P.  */
  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
  typedef char __v16qi __attribute__ ((__vector_size__ (16)));

  /* Zero-extend __A and __N to 128 bits.  */
  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
  /* Check the alignment of __P.  */
  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
  if (offset)
    {
      /* If the misalignment of __P is greater than 8, move __P back
         by 8 bytes.  Otherwise, move __P back by the misalignment.  */
      if (offset > 8)
        offset = 8;
      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
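      /* For example, if __P is 5 bytes past a 16-byte boundary, __P is
         moved back to that boundary and the operands are shifted left
         by 5 bytes below, so the 8 data bytes still land at their
         original addresses while the 16-byte access of maskmovdqu stays
         within the aligned 16-byte block containing the original __P.  */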
      /* Shift __A128 and __N128 to the left by the adjustment.  */
      switch (offset)
        {
        case 1:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
          break;
        case 2:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
          break;
        case 3:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
          break;
        case 4:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
          break;
        case 5:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
          break;
        case 6:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
          break;
        case 7:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
          break;
        case 8:
          __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
          __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
          break;
        default:
          break;
        }
    }

  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
#else
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
#endif
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))