Commit 78307657 by Richard Biener

re PR tree-optimization/92645 (Hand written vector code is 450 times slower when compiled with GCC compared to Clang)

2019-11-28  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/92645
	* tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle
	conversions inside a mode class.  Remove restriction on
	preserving the element size.
	(simplify_vector_constructor): Deal with the above and for
	identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR
	and VEC_PACK_TRUNC_EXPR.

	* gcc.target/i386/pr92645-4.c: New testcase.

From-SVN: r278806
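
In source terms, the change lets forwprop collapse element-wise converting constructors into single widening or narrowing vector operations. A minimal sketch of the widening case (illustrative user code, not part of the commit; the fold to VEC_UNPACK_LO_EXPR only fires when the target provides the corresponding unpack optab, e.g. x86 with -mavx2):

typedef unsigned char u8v8 __attribute__((vector_size(8)));
typedef unsigned short u16v8 __attribute__((vector_size(16)));

/* Every constructor element converts one BIT_FIELD_REF lane of x;
   with this patch forwprop can rewrite the whole CTOR as a
   VEC_UNPACK_LO_EXPR of x inserted into a double-width zero vector.  */
u16v8 widen (u8v8 x)
{
  return (u16v8) { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] };
}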
gcc/testsuite/gcc.target/i386/pr92645-4.c (new file):

/* { dg-do compile } */
/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */

typedef unsigned int u32v4 __attribute__((vector_size(16)));
typedef unsigned short u16v16 __attribute__((vector_size(32)));
typedef unsigned char u8v16 __attribute__((vector_size(16)));

union vec128 {
  u8v16 u8;
  u32v4 u32;
};

#define memcpy __builtin_memcpy

static u16v16 zxt(u8v16 x)
{
  return (u16v16) {
    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
  };
}

static u8v16 narrow(u16v16 x)
{
  return (u8v16) {
    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
  };
}

void f(char *dst, char *src, unsigned long n, unsigned c)
{
  unsigned ia = 255 - (c >> 24);
  ia += ia >> 7;
  union vec128 c4 = {0}, ia16 = {0};
  c4.u32 += c;
  ia16.u8 += (unsigned char)ia;
  u16v16 c16 = (zxt(c4.u8) << 8) + 128;
  for (; n; src += 16, dst += 16, n -= 4) {
    union vec128 s;
    memcpy(&s, src, sizeof s);
    s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
    memcpy(dst, &s, sizeof s);
  }
}

/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */
/* We're missing an opportunity to, after later optimizations, combine
   a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted
   element.  */
/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */
gcc/tree-ssa-forwprop.c:

@@ -2004,16 +2004,12 @@ get_bit_field_ref_def (tree val, enum tree_code &conv_code)
     return NULL_TREE;
   enum tree_code code = gimple_assign_rhs_code (def_stmt);
   if (code == FLOAT_EXPR
-      || code == FIX_TRUNC_EXPR)
+      || code == FIX_TRUNC_EXPR
+      || CONVERT_EXPR_CODE_P (code))
     {
       tree op1 = gimple_assign_rhs1 (def_stmt);
       if (conv_code == ERROR_MARK)
-        {
-          if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))),
-                        GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
-            return NULL_TREE;
-          conv_code = code;
-        }
+        conv_code = code;
       else if (conv_code != code)
         return NULL_TREE;
       if (TREE_CODE (op1) != SSA_NAME)
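
Previously only FLOAT_EXPR and FIX_TRUNC_EXPR lane conversions were tracked, and only when source and destination elements had the same mode size. Accepting CONVERT_EXPR_CODE_P codes and dropping the size check means plain integer conversions between differently sized elements are now looked through as well. A sketch of such a lane pattern (illustrative user code under GCC's vector extensions):

typedef int i32v4 __attribute__((vector_size(16)));
typedef short i16v4 __attribute__((vector_size(8)));

/* Each lane is a NOP_EXPR of a BIT_FIELD_REF of x: a conversion
   within the integer mode class that changes the element size.  */
i16v4 trunc_lanes (i32v4 x)
{
  return (i16v4) { x[0], x[1], x[2], x[3] };
}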
@@ -2078,9 +2074,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
          && VECTOR_TYPE_P (TREE_TYPE (ref))
          && useless_type_conversion_p (TREE_TYPE (op1),
                                        TREE_TYPE (TREE_TYPE (ref)))
-         && known_eq (bit_field_size (op1), elem_size)
          && constant_multiple_p (bit_field_offset (op1),
-                                 elem_size, &elem)
+                                 bit_field_size (op1), &elem)
          && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts))
        {
          unsigned int j;
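
With the known_eq check gone, the lane index is recovered by dividing the bit offset by the referenced lane's own size rather than by the destination element size, so extracts of a different width than the destination element still yield a valid index. A small standalone model of that computation (elem_index is a hypothetical helper mirroring the constant_multiple_p call, not GCC code):

#include <stdbool.h>

/* Lane index of BIT_FIELD_REF <v, size, offset>: offset must be an
   exact multiple of size; e.g. bits [32,48) of 16-bit lanes give
   index 32/16 = 2.  */
static bool elem_index (unsigned long bit_offset, unsigned long bit_size,
                        unsigned long *elem)
{
  if (bit_size == 0 || bit_offset % bit_size != 0)
    return false;
  *elem = bit_offset / bit_size;
  return true;
}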
@@ -2153,7 +2148,83 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   if (conv_code != ERROR_MARK
       && !supportable_convert_operation (conv_code, type, conv_src_type,
                                          &conv_code))
-    return false;
+    {
+      /* Only few targets implement direct conversion patterns so try
+         some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR.  */
+      optab optab;
+      tree halfvectype, dblvectype;
+      if (CONVERT_EXPR_CODE_P (conv_code)
+          && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+              == TYPE_PRECISION (TREE_TYPE (type)))
+          && mode_for_vector (as_a <scalar_mode>
+                              (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))),
+                              nelts * 2).exists ()
+          && (dblvectype
+              = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+                                   nelts * 2))
+          && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type))
+                                           ? VEC_UNPACK_FLOAT_LO_EXPR
+                                           : VEC_UNPACK_LO_EXPR,
+                                           dblvectype,
+                                           optab_default))
+          && (optab_handler (optab, TYPE_MODE (dblvectype))
+              != CODE_FOR_nothing))
+        {
+          gimple_seq stmts = NULL;
+          tree dbl;
+          if (refnelts == nelts)
+            {
+              /* ???  Paradoxical subregs don't exist, so insert into
+                 the lower half of a wider zero vector.  */
+              dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype,
+                                  build_zero_cst (dblvectype), orig[0],
+                                  bitsize_zero_node);
+            }
+          else if (refnelts == 2 * nelts)
+            dbl = orig[0];
+          else
+            dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype,
+                                orig[0], TYPE_SIZE (dblvectype),
+                                bitsize_zero_node);
+          gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+          gimple_assign_set_rhs_with_ops (gsi,
+                                          FLOAT_TYPE_P (TREE_TYPE (type))
+                                          ? VEC_UNPACK_FLOAT_LO_EXPR
+                                          : VEC_UNPACK_LO_EXPR,
+                                          dbl);
+        }
+      else if (CONVERT_EXPR_CODE_P (conv_code)
+               && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+                   == 2 * TYPE_PRECISION (TREE_TYPE (type)))
+               && mode_for_vector (as_a <scalar_mode>
+                                   (TYPE_MODE
+                                    (TREE_TYPE (TREE_TYPE (orig[0])))),
+                                   nelts / 2).exists ()
+               && (halfvectype
+                   = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+                                        nelts / 2))
+               && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR,
+                                                halfvectype,
+                                                optab_default))
+               && (optab_handler (optab, TYPE_MODE (halfvectype))
+                   != CODE_FOR_nothing))
+        {
+          gimple_seq stmts = NULL;
+          tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+                                   orig[0], TYPE_SIZE (halfvectype),
+                                   bitsize_zero_node);
+          tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+                                   orig[0], TYPE_SIZE (halfvectype),
+                                   TYPE_SIZE (halfvectype));
+          gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+          gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR,
+                                          low, hig);
+        }
+      else
+        return false;
+      update_stmt (gsi_stmt (*gsi));
+      return true;
+    }
   if (nelts != refnelts)
     {
       gassign *lowpart
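
The second arm above covers the narrowing direction: when each destination lane has half the precision of the source lanes, the constructor becomes a VEC_PACK_TRUNC_EXPR of the low and high halves of the source vector. A minimal source-level sketch (illustrative; the fold requires the target to implement the pack-trunc optab for the half-vector mode):

typedef unsigned short u16v8 __attribute__((vector_size(16)));
typedef unsigned char u8v8 __attribute__((vector_size(8)));

/* Lane-wise truncation of a 128-bit vector into a 64-bit one;
   corresponds to VEC_PACK_TRUNC_EXPR of the two 64-bit halves of x.  */
u8v8 pack8 (u16v8 x)
{
  return (u8v8) { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7] };
}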
@@ -2178,9 +2249,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
                      ? perm_type
                      : build_vector_type (TREE_TYPE (perm_type), nelts));
   if (conv_code != ERROR_MARK
-      && (!supportable_convert_operation (conv_code, type, conv_src_type,
-                                          &conv_code)
-          || conv_code == CALL_EXPR))
+      && !supportable_convert_operation (conv_code, type, conv_src_type,
+                                         &conv_code))
     return false;
   /* Now that we know the number of elements of the source build the