Commit ddeabd3e by Alan Lawrence, committed by Alan Lawrence

[PATCH AArch64 1/2] Improve codegen of vector compares inc. tst instruction

gcc/:

	* config/aarch64/aarch64-builtins.c (aarch64_types_cmtst_qualifiers,
	TYPES_TST): Define.
	(aarch64_fold_builtin): Update pattern for cmtst.

	* config/aarch64/aarch64-protos.h (aarch64_const_vec_all_same_int_p):
	Declare.

	* config/aarch64/aarch64-simd-builtins.def (cmtst): Update qualifiers.

	* config/aarch64/aarch64-simd.md (aarch64_vcond_internal<mode><mode>):
	Switch operands, separate out more cases, refactor.

	(aarch64_cmtst<mode>): Rewrite pattern to match (plus ... -1).

	* config/aarch64/aarch64.c (aarch64_const_vec_all_same_int_p): Take single
	argument; rename old version to...
	(aarch64_const_vec_all_same_in_range_p): ...this.
	(aarch64_print_operand, aarch64_simd_shift_imm_p): Follow renaming.

	* config/aarch64/predicates.md (aarch64_simd_imm_minus_one): Define.

gcc/testsuite/:

	* gcc.target/aarch64/simd/int_comparisons.x: New file.
	* gcc.target/aarch64/simd/int_comparisons_1.c: New test.
	* gcc.target/aarch64/simd/int_comparisons_2.c: Ditto.

From-SVN: r214948
parent e625e715
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
* config/aarch64/aarch64-builtins.c (aarch64_types_cmtst_qualifiers,
TYPES_TST): Define.
(aarch64_fold_builtin): Update pattern for cmtst.
* config/aarch64/aarch64-protos.h (aarch64_const_vec_all_same_int_p):
Declare.
* config/aarch64/aarch64-simd-builtins.def (cmtst): Update qualifiers.
* config/aarch64/aarch64-simd.md (aarch64_vcond_internal<mode><mode>):
Switch operands, separate out more cases, refactor.
(aarch64_cmtst<mode>): Rewrite pattern to match (plus ... -1).
* config/aarch64/aarch64.c (aarch64_const_vec_all_same_int_p): Take single
argument; rename old version to...
(aarch64_const_vec_all_same_in_range_p): ...this.
(aarch64_print_operand, aarch64_simd_shift_imm_p): Follow renaming.
* config/aarch64/predicates.md (aarch64_simd_imm_minus_one): Define.
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
Remove qualifier_const_pointer, update comment.
......@@ -144,6 +144,11 @@ aarch64_types_binop_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_maybe_immediate };
#define TYPES_BINOP (aarch64_types_binop_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_cmtst_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_none, qualifier_none,
qualifier_internal, qualifier_internal };
#define TYPES_TST (aarch64_types_cmtst_qualifiers)
static enum aarch64_type_qualifiers
aarch64_types_binopv_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_void, qualifier_none, qualifier_none };
#define TYPES_BINOPV (aarch64_types_binopv_qualifiers)
......@@ -1285,7 +1290,7 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
BUILTIN_VALLDI (BINOP, cmeq, 0)
return fold_build2 (EQ_EXPR, type, args[0], args[1]);
break;
BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
{
tree and_node = fold_build2 (BIT_AND_EXPR, type, args[0], args[1]);
tree vec_zero_node = build_zero_cst (type);
......
......@@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
enum reg_class);
enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
bool aarch64_constant_address_p (rtx);
bool aarch64_expand_movmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
......
......@@ -246,7 +246,7 @@
/* Implemented by aarch64_cm<cmp><mode>. */
BUILTIN_VSDQ_I_DI (BINOP, cmgeu, 0)
BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
/* Implemented by reduc_<sur>plus_<mode>. */
BUILTIN_VALL (UNOP, reduc_splus_, 10)
......
......@@ -1912,58 +1912,94 @@
(match_operand:VDQ 2 "nonmemory_operand")))]
"TARGET_SIMD"
{
int inverse = 0, has_zero_imm_form = 0;
rtx op1 = operands[1];
rtx op2 = operands[2];
rtx mask = gen_reg_rtx (<MODE>mode);
enum rtx_code code = GET_CODE (operands[3]);
/* Switching OP1 and OP2 is necessary for NE (to output a cmeq insn),
and desirable for other comparisons if it results in FOO ? -1 : 0
(this allows direct use of the comparison result without a bsl). */
if (code == NE
|| (code != EQ
&& op1 == CONST0_RTX (<V_cmp_result>mode)
&& op2 == CONSTM1_RTX (<V_cmp_result>mode)))
{
op1 = operands[2];
op2 = operands[1];
switch (code)
{
case LE: code = GT; break;
case LT: code = GE; break;
case GE: code = LT; break;
case GT: code = LE; break;
/* No case EQ. */
case NE: code = EQ; break;
case LTU: code = GEU; break;
case LEU: code = GTU; break;
case GTU: code = LEU; break;
case GEU: code = LTU; break;
default: gcc_unreachable ();
}
}
switch (GET_CODE (operands[3]))
/* Make sure we can handle the last operand. */
switch (code)
{
case NE:
/* Normalized to EQ above. */
gcc_unreachable ();
case LE:
case LT:
case NE:
inverse = 1;
/* Fall through. */
case GE:
case GT:
case EQ:
has_zero_imm_form = 1;
break;
case LEU:
case LTU:
inverse = 1;
break;
/* These instructions have a form taking an immediate zero. */
if (operands[5] == CONST0_RTX (<MODE>mode))
break;
/* Fall through, as may need to load into register. */
default:
if (!REG_P (operands[5]))
operands[5] = force_reg (<MODE>mode, operands[5]);
break;
}
if (!REG_P (operands[5])
&& (operands[5] != CONST0_RTX (<MODE>mode) || !has_zero_imm_form))
operands[5] = force_reg (<MODE>mode, operands[5]);
switch (GET_CODE (operands[3]))
switch (code)
{
case LT:
emit_insn (gen_aarch64_cmlt<mode> (mask, operands[4], operands[5]));
break;
case GE:
emit_insn (gen_aarch64_cmge<mode> (mask, operands[4], operands[5]));
break;
case LE:
emit_insn (gen_aarch64_cmle<mode> (mask, operands[4], operands[5]));
break;
case GT:
emit_insn (gen_aarch64_cmgt<mode> (mask, operands[4], operands[5]));
break;
case LTU:
emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[5], operands[4]));
break;
case GEU:
emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[4], operands[5]));
break;
case LEU:
emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[5], operands[4]));
break;
case GTU:
emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[4], operands[5]));
break;
case NE:
/* NE has been normalized to EQ above. */
case EQ:
emit_insn (gen_aarch64_cmeq<mode> (mask, operands[4], operands[5]));
break;
......@@ -1972,12 +2008,6 @@
gcc_unreachable ();
}
if (inverse)
{
op1 = operands[2];
op2 = operands[1];
}
/* If we have (a = (b CMP c) ? -1 : 0);
Then we can simply move the generated mask. */
......@@ -3932,14 +3962,22 @@
;; cmtst
;; Although neg (ne (and x y) 0) is the natural way of expressing a cmtst,
;; we don't have any insns using ne, and aarch64_vcond_internal outputs
;; not (neg (eq (and x y) 0))
;; which is rewritten by simplify_rtx as
;; plus (eq (and x y) 0) -1.
(define_insn "aarch64_cmtst<mode>"
[(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
(neg:<V_cmp_result>
(ne:<V_cmp_result>
(plus:<V_cmp_result>
(eq:<V_cmp_result>
(and:VDQ
(match_operand:VDQ 1 "register_operand" "w")
(match_operand:VDQ 2 "register_operand" "w"))
(vec_duplicate:<V_cmp_result> (const_int 0)))))]
(match_operand:VDQ 3 "aarch64_simd_imm_zero"))
(match_operand:<V_cmp_result> 4 "aarch64_simd_imm_minus_one")))
]
"TARGET_SIMD"
"cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
[(set_attr "type" "neon_tst<q>")]
......
......@@ -137,9 +137,6 @@ static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (enum machine_mode);
static unsigned bit_count (unsigned HOST_WIDE_INT);
static bool aarch64_const_vec_all_same_int_p (rtx,
HOST_WIDE_INT, HOST_WIDE_INT);
static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel);
static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
......@@ -3576,6 +3573,36 @@ aarch64_get_condition_code (rtx x)
}
}
/* Return true if X is an integer CONST_VECTOR all of whose elements
   are equal, with that common value lying in [MINVAL, MAXVAL]
   (inclusive).  */
bool
aarch64_const_vec_all_same_in_range_p (rtx x,
				       HOST_WIDE_INT minval,
				       HOST_WIDE_INT maxval)
{
  if (GET_CODE (x) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
    return false;

  /* The first element must be in range; every later element must
     match it.  */
  HOST_WIDE_INT elt0 = INTVAL (CONST_VECTOR_ELT (x, 0));
  if (elt0 < minval || elt0 > maxval)
    return false;

  int nelts = CONST_VECTOR_NUNITS (x);
  for (int idx = 1; idx < nelts; idx++)
    if (INTVAL (CONST_VECTOR_ELT (x, idx)) != elt0)
      return false;

  return true;
}
/* Return true if X is an integer CONST_VECTOR with every element equal
   to VAL.  Thin wrapper over aarch64_const_vec_all_same_in_range_p
   using the degenerate range [VAL, VAL].  */
bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
static unsigned
bit_count (unsigned HOST_WIDE_INT value)
{
......@@ -3827,9 +3854,10 @@ aarch64_print_operand (FILE *f, rtx x, char code)
case CONST_VECTOR:
if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
{
gcc_assert (aarch64_const_vec_all_same_int_p (x,
HOST_WIDE_INT_MIN,
HOST_WIDE_INT_MAX));
gcc_assert (
aarch64_const_vec_all_same_in_range_p (x,
HOST_WIDE_INT_MIN,
HOST_WIDE_INT_MAX));
asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
}
else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
......@@ -7732,39 +7760,15 @@ aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
#undef CHECK
}
/* Return true if X is an integer CONST_VECTOR whose elements all hold
   the same value, and that value lies within [MINVAL, MAXVAL].
   NOTE(review): this is the pre-patch static copy that the commit
   deletes; it is renamed/hoisted to
   aarch64_const_vec_all_same_in_range_p earlier in the file.  */
static bool
aarch64_const_vec_all_same_int_p (rtx x,
HOST_WIDE_INT minval,
HOST_WIDE_INT maxval)
{
HOST_WIDE_INT firstval;
int count, i;
if (GET_CODE (x) != CONST_VECTOR
|| GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
return false;
/* All elements must equal the first, which must be in range.  */
firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
if (firstval < minval || firstval > maxval)
return false;
count = CONST_VECTOR_NUNITS (x);
for (i = 1; i < count; i++)
if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
return false;
return true;
}
/* Check if immediate shift constants are within range: left shifts
   allow 0..bit_width-1, right shifts allow 1..bit_width.
   NOTE(review): the scraped diff interleaved the pre- and post-rename
   return statements, leaving two consecutive returns before `else'
   (invalid C); only the renamed _in_range_p calls are kept here.  */
bool
aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
/* Return true if X is a uniform vector where all elements
......
......@@ -273,3 +273,9 @@
{
return aarch64_simd_imm_zero_p (op, mode);
})
;; True for a CONST_VECTOR whose elements are all integer -1, i.e. the
;; all-ones mask that AArch64 vector comparisons produce.
(define_special_predicate "aarch64_simd_imm_minus_one"
(match_code "const_vector")
{
return aarch64_const_vec_all_same_int_p (op, -1);
})
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
* gcc.target/aarch64/simd/int_comparisons.x: New file.
* gcc.target/aarch64/simd/int_comparisons_1.c: New test.
* gcc.target/aarch64/simd/int_comparisons_2.c: Ditto.
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
* gcc.target/aarch64/simd/vrbit_1.c: New test.
2014-09-05 Richard Biener <rguenther@suse.de>
......
/* test_vcXXX wrappers for all the vcXXX (vector compare) and vtst intrinsics
in arm_neon.h (excluding the 64x1 variants as these generally produce scalar
not vector ops). */
#include "arm_neon.h"
/* Controls for operand placement: DONT_FORCE is a no-op; FORCE_SIMD
   round-trips the value through an inline-asm 'mov' so it must live in
   a SIMD register (used for the scalar d-register variants).  */
#define DONT_FORCE(X)
#define FORCE_SIMD(V1) asm volatile ("mov %d0, %1.d[0]" \
: "=w"(V1) \
: "w"(V1) \
: /* No clobbers */);
/* Wrapper function for a unary intrinsic v<OP><SUFFIX> (a).  */
#define OP1(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t \
test_v##OP##SUFFIX (BASETYPE##SIZE##_t a) \
{ \
uint##SIZE##_t res; \
FORCE (a); \
res = v##OP##SUFFIX (a); \
FORCE (res); \
return res; \
}
/* Wrapper function for a binary intrinsic v<OP><SUFFIX> (a, b).  */
#define OP2(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t \
test_v##OP##SUFFIX (BASETYPE##SIZE##_t a, BASETYPE##SIZE##_t b) \
{ \
uint##SIZE##_t res; \
FORCE (a); \
FORCE (b); \
res = v##OP##SUFFIX (a, b); \
FORCE (res); \
return res; \
}
/* Ops that exist for both signed and unsigned element types.  */
#define UNSIGNED_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, tst, BASETYPE, SUFFIX, FORCE) \
OP1 (SIZE, ceqz, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, ceq, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, cge, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, cgt, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, cle, BASETYPE, SUFFIX, FORCE) \
OP2 (SIZE, clt, BASETYPE, SUFFIX, FORCE)
/* Signed types additionally get the compare-against-zero forms.  */
#define ALL_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
OP1 (SIZE, cgez, BASETYPE, SUFFIX, FORCE) \
OP1 (SIZE, cgtz, BASETYPE, SUFFIX, FORCE) \
OP1 (SIZE, clez, BASETYPE, SUFFIX, FORCE) \
OP1 (SIZE, cltz, BASETYPE, SUFFIX, FORCE) \
UNSIGNED_OPS (SIZE, BASETYPE, SUFFIX, FORCE)
/* Instantiate wrappers for all signed vector/scalar types.  */
ALL_OPS (8x8, int, _s8, DONT_FORCE)
ALL_OPS (16x4, int, _s16, DONT_FORCE)
ALL_OPS (32x2, int, _s32, DONT_FORCE)
ALL_OPS (64x1, int, _s64, DONT_FORCE)
ALL_OPS (64, int, d_s64, FORCE_SIMD)
ALL_OPS (8x16, int, q_s8, DONT_FORCE)
ALL_OPS (16x8, int, q_s16, DONT_FORCE)
ALL_OPS (32x4, int, q_s32, DONT_FORCE)
ALL_OPS (64x2, int, q_s64, DONT_FORCE)
/* Instantiate wrappers for all unsigned vector/scalar types.  */
UNSIGNED_OPS (8x8, uint, _u8, DONT_FORCE)
UNSIGNED_OPS (16x4, uint, _u16, DONT_FORCE)
UNSIGNED_OPS (32x2, uint, _u32, DONT_FORCE)
UNSIGNED_OPS (64x1, uint, _u64, DONT_FORCE)
UNSIGNED_OPS (64, uint, d_u64, FORCE_SIMD)
UNSIGNED_OPS (8x16, uint, q_u8, DONT_FORCE)
UNSIGNED_OPS (16x8, uint, q_u16, DONT_FORCE)
UNSIGNED_OPS (32x4, uint, q_u32, DONT_FORCE)
UNSIGNED_OPS (64x2, uint, q_u64, DONT_FORCE)
/* { dg-do compile } */
/* { dg-options "-O3 -fno-inline" } */
/* Scan-assembler test, so, incorporate as little other code as possible. */
#include "arm_neon.h"
#include "int_comparisons.x"
/* Operations on all 18 integer types: (q?)_[su](8|16|32|64), d_[su]64.
(d?)_[us]64 generate regs of form 'd0' rather than e.g. 'v0.2d'. */
/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 4 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* vcge + vcle both implemented with cmge (signed) or cmhs (unsigned). */
/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* vcgt + vclt both implemented with cmgt (signed) or cmhi (unsigned). */
/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
/* Comparisons against immediate zero, on the 8 signed integer types only. */
/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
/* For int64_t and int64x1_t, combine_simplify_rtx failure of
https://gcc.gnu.org/ml/gcc/2014-06/msg00253.html
prevents generation of cmge....#0, instead producing mvn + sshr. */
/* { #dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
/* { dg-final { scan-assembler-times "\[ \t\]cmlt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
/* For int64_t and int64x1_t, cmlt ... #0 and sshr ... #63 are equivalent,
so allow either. cmgez issue above results in extra 2 * sshr....63. */
/* { dg-final { scan-assembler-times "\[ \t\](?:cmlt|sshr)\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?(?:0|63)" 4 } } */
// All should have been compiled into single insns without inverting result:
/* { dg-final { scan-assembler-not "not" } } */
/* { dg-do run } */
/* { dg-options "-O2 -fno-inline" } */
/* Stops the test_xxx methods being inlined into main, thus preventing constant
propagation. */
#include "int_comparisons.x"
extern void abort (void);
/* Abort unless the two lanes of res hold R0 and R1.  */
#define CHECK2(R0, R1) if (res[0] != R0 || res[1] != R1) abort ()
/* Run all two-lane comparison wrappers for one element type and check
   the lane-wise all-ones/all-zeros results.  */
#define TEST2(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
BASETYPE##_t _a[2] = {2, 3}; \
BASETYPE##x2_t a = vld1##SUFFIX (_a); \
BASETYPE##_t _b[2] = {1, 3}; \
BASETYPE##x2_t b = vld1##SUFFIX (_b); \
RESTYPE res[2]; \
vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); CHECK2 (0, 0); \
vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (b, a)); CHECK2 (-1, 0); \
vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); CHECK2 (0, -1); \
vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (b, a)); CHECK2 (-1, -1); \
vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); CHECK2 (0, -1); \
vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); CHECK2 (-1, -1); \
vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (b, a)); CHECK2 (0, -1); \
vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); CHECK2 (-1, 0); \
vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (b, a)); CHECK2 (0, 0); \
vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); CHECK2 (0, -1); \
vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a + 1, b)); CHECK2 (-1, 0); \
}
/* Abort unless the four lanes of res hold R0..R3 (cast to T).  */
#define CHECK4(T, R0, R1, R2, R3) \
if (res[0] != (T)R0 || res[1] != (T)R1 \
|| res[2] != (T)R2 || res[3] != (T)R3) abort ()
/* Run all four-lane comparison wrappers for one element type.  */
#define TEST4(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
BASETYPE##_t _a[4] = {1, 2, 3, 4}; \
BASETYPE##x4_t a = vld1##SUFFIX (_a); \
BASETYPE##_t _b[4] = {4, 2, 1, 3}; \
BASETYPE##x4_t b = vld1##SUFFIX (_b); \
RESTYPE res[4]; \
vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
CHECK4 (RESTYPE, -1, 0, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
CHECK4 (RESTYPE, -1, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
CHECK4 (RESTYPE, 0, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
CHECK4 (RESTYPE, 0, -1, -1, -1); \
vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
CHECK4 (RESTYPE, 0, 0, -1, -1); \
vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
CHECK4 (RESTYPE, 0, -1, -1, 0); \
}
/* Abort unless the eight lanes of res hold R0..R7 (cast to T).  */
#define CHECK8(T, R0, R1, R2, R3, R4, R5, R6, R7) \
if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
|| res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6 \
|| res[7] != (T)R7) abort ()
/* Run all eight-lane comparison wrappers for one element type.  */
#define TEST8(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
BASETYPE##_t _a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; \
BASETYPE##x8_t a = vld1##SUFFIX (_a); \
BASETYPE##_t _b[8] = {4, 2, 1, 3, 2, 6, 8, 9}; \
BASETYPE##x8_t b = vld1##SUFFIX (_b); \
RESTYPE res[8]; \
vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
CHECK8 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1); \
vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
CHECK8 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1); \
vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
CHECK8 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
CHECK8 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
CHECK8 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0); \
vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
CHECK8 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1); \
}
/* 16-way tests use same 8 values twice, so CHECK16 takes only eight
   expected lane values and checks each against two lanes.  */
#define CHECK16(T, R0, R1, R2, R3, R4, R5, R6, R7) \
if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
|| res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6 \
|| res[7] != (T)R7 || res[8] != (T)R0 || res[9] != (T)R1 \
|| res[10] != (T)R2 || res[11] != (T)R3 || res[12] != (T)R4 \
|| res[13] != (T)R5 || res[14] != (T)R6 || res[15] != (T)R7) abort ()
/* Run all sixteen-lane comparison wrappers for one element type.  */
#define TEST16(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
BASETYPE##_t _a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}; \
BASETYPE##x16_t a = vld1##SUFFIX (_a); \
BASETYPE##_t _b[16] = {4, 2, 1, 3, 2, 6, 8, 9, 4, 2, 1, 3, 2, 6, 8, 9}; \
BASETYPE##x16_t b = vld1##SUFFIX (_b); \
RESTYPE res[16]; \
vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
CHECK16 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1); \
vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
CHECK16 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1); \
vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
CHECK16 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
CHECK16 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0); \
vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
CHECK16 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0); \
vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
CHECK16 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1); \
}
/* Drive the 2/4/8/16-lane comparison tests over all the integer
   element types exercised by int_comparisons.x; each TESTn aborts on
   the first mismatching lane.  */
int
main (int argc, char **argv)
{
TEST2 (int32, _s32, uint32_t, _u32);
TEST2 (uint32, _u32, uint32_t, _u32);
TEST2 (int64, q_s64, uint64_t, q_u64);
TEST2 (uint64, q_u64, uint64_t, q_u64);
TEST4 (int16, _s16, uint16_t, _u16);
TEST4 (uint16, _u16, uint16_t, _u16);
TEST4 (int32, q_s32, uint32_t, q_u32);
TEST4 (uint32, q_u32, uint32_t, q_u32);
TEST8 (int8, _s8, uint8_t, _u8);
TEST8 (uint8, _u8, uint8_t, _u8);
TEST8 (int16, q_s16, uint16_t, q_u16);
TEST8 (uint16, q_u16, uint16_t, q_u16);
TEST16 (int8, q_s8, uint8_t, q_u8);
TEST16 (uint8, q_u8, uint8_t, q_u8);
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment