re PR target/83008 ([performance] Is it better to avoid extra instructions in…

re PR target/83008 ([performance] Is it better to avoid extra instructions in data passing between loops?) PR target/83008 * config/i386/x86-tune-costs.h (skylake_cost): Fix cost of storing integer register in SImode. Fix cost of aligned 256-bit and 512-bit SSE register stores. * config/i386/i386.c (ix86_multiplication_cost): Fix multiplication cost for TARGET_AVX512DQ. testsuite/ChangeLog: PR target/83008 * gcc.target/i386/pr83008.c: New test. From-SVN: r257505

re PR target/83008 ([performance] Is it better to avoid extra instructions in…
re PR target/83008 ([performance] Is it better to avoid extra instructions in data passing between loops?) PR target/83008 * config/i386/x86-tune-costs.h (skylake_cost): Fix cost of storing integer register in SImode. Fix cost of aligned 256-bit and 512-bit SSE register stores. * config/i386/i386.c (ix86_multiplication_cost): Fix multiplication cost for TARGET_AVX512DQ. testsuite/ChangeLog: PR target/83008 * gcc.target/i386/pr83008.c: New test. From-SVN: r257505
001e7337 · Sergey Shalnov · Uros Bizjak · 2318f3b4 · 001e7337 · 001e7337
Commit 001e7337 authored Feb 08, 2018 by Sergey Shalnov Committed by Uros Bizjak Feb 08, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 2 deletions

gcc/ChangeLog
+12 -0

gcc/config/i386/i386.c
+4 -0

gcc/config/i386/x86-tune-costs.h
+2 -2

gcc/testsuite/ChangeLog
+5 -0

gcc/testsuite/gcc.target/i386/pr83008.c
+40 -0

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2018-02-08  Sergey Shalnov  <sergey.shalnov@intel.com>
+	PR target/83008
+	* config/i386/x86-tune-costs.h (skylake_cost): Fix cost of
+	storing integer register in SImode.  Fix cost of aligned 256-bit
+	and 512-bit SSE register stores.
+2018-02-08  Sergey Shalnov  <sergey.shalnov@intel.com>
+	* config/i386/i386.c (ix86_multiplication_cost): Fix
+	multiplication cost for TARGET_AVX512DQ.
 2018-02-08  Marek Polacek  <polacek@redhat.com>
 	PR tree-optimization/84238

--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -40402,6 +40402,10 @@ ix86_multiplication_cost (const struct processor_costs *cost,
 			   ? cost->mulsd : cost->mulss, true);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
+      /* vpmullq is used in this case. No emulation is needed.  */
+      if (TARGET_AVX512DQ)
+	return ix86_vec_cost (mode, cost->mulss, true);
      /* V*QImode is emulated with 7-13 insns.  */
      if (mode == V16QImode || mode == V32QImode)
 	{
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -1557,7 +1557,7 @@ struct processor_costs skylake_cost = {
  {4, 4, 4},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
-  {6, 6, 6},				/* cost of storing integer registers */
+  {6, 6, 3},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {6, 6, 8},				/* cost of loading fp registers
 					   in SFmode, DFmode and XFmode */
@@ -1572,7 +1572,7 @@ struct processor_costs skylake_cost = {
  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit */
  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
-  {8, 8, 8, 8, 16},			/* cost of storing SSE registers
+  {8, 8, 8, 12, 24},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit */
  {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
  2, 2,					/* SSE->integer and integer->SSE moves */

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2018-02-08  Sergey Shalnov  <sergey.shalnov@intel.com>
+	PR target/83008
+	* gcc.target/i386/pr83008.c: New test.
 2018-02-08  Peter Bergner  <bergner@vnet.ibm.com>
 	PR target/81143

--- a/gcc/testsuite/gcc.target/i386/pr83008.c
+++ b/gcc/testsuite/gcc.target/i386/pr83008.c
+/* PR target/83008
+   Compile-only test built with -march=skylake-avx512.  The first loop
+   of pr83008 fills a 4x4 table of unsigned int one row per iteration,
+   and the second loop reads that same table one column per iteration.
+   The final directive below requires that none of vmovdqa32, vmovdqa64,
+   vmovdqu32 or vmovdqu64 appears in the generated assembly.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -funroll-loops -march=skylake-avx512 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "vmovdq(a|u)(32|64)" } } */
+int
+pr83008 (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+  unsigned int tmp[4][4];	/* Row i is written by iteration i of the first loop.  */
+  unsigned int a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {	/* pix1 and pix2 advance by i_pix1 and i_pix2 after each row.
+	   Each aN adds the difference at offset N to the difference at
+	   offset N + 4 shifted left by 16.  */
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      int t0 = a0 + a1;
+      int t1 = a0 - a1;
+      int t2 = a2 + a3;
+      int t3 = a2 - a3;
+      tmp[i][0] = t0 + t2;	/* Note the store order: columns 0, 2, 1, 3.  */
+      tmp[i][2] = t0 - t2;
+      tmp[i][1] = t1 + t3;
+      tmp[i][3] = t1 - t3;
+    }
+  for (int i = 0; i < 4; i++)
+    {	/* Same add/subtract pattern as above, applied to column i of tmp
+	   (rows 0-3), with the four results accumulated into sum.  */
+      int t0 = tmp[0][i] + tmp[1][i];
+      int t1 = tmp[0][i] - tmp[1][i];
+      int t2 = tmp[2][i] + tmp[3][i];
+      int t3 = tmp[2][i] - tmp[3][i];
+      a0 = t0 + t2;
+      a2 = t0 - t2;
+      a1 = t1 + t3;
+      a3 = t1 - t3;
+      sum += (a0) + (a1) + (a2) + (a3);
+    }
+  return (sum + ((unsigned int) sum >> 16)) >> 1; /* Add the upper 16 bits
+     of sum (taken as unsigned) to sum, then shift right by one.  */
+}