rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can…

rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads. 2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com> * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads. * config/rs6000/rs6000.c (expand_block_compare): Avoid generating poor code for processors older than p8. From-SVN: r240908

rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can…
rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads. 2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com> * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads. * config/rs6000/rs6000.c (expand_block_compare): Avoid generating poor code for processors older than p8. From-SVN: r240908
87b44b83 · Aaron Sawdey · Aaron Sawdey · 4815e7d4 · 87b44b83 · 87b44b83
Commit 87b44b83 authored Oct 10, 2016 by Aaron Sawdey Committed by Aaron Sawdey Oct 09, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 33 additions and 4 deletions

gcc/ChangeLog
+8 -0

gcc/config/rs6000/rs6000.c
+22 -4

gcc/config/rs6000/rs6000.h
+3 -0

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2016-10-09  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>
+	* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): 
+	Add macro to say we can efficiently handle overlapping unaligned
+	loads.
+	* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
+	poor code for processors older than p8.
 2016-10-09  Eric Botcazou  <ebotcazou@adacore.com>
 	* gen-pass-instances.awk: Remove GNUism.

--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[])
  if (bytes <= 0)
    return true;
+  /* The code generated for p7 and older is no faster than glibc
+     memcmp when the alignment is small and the length is not short,
+     so bail out in those cases.  */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+      && ((base_align == 1 && bytes > 16)
+	  || (base_align == 2 && bytes > 32)))
+    return false;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[])
  while (bytes > 0)
    {
      int align = compute_current_alignment (base_align, offset);
-      load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok);
+      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	load_mode = select_block_compare_mode (offset, bytes, align,
+					       word_mode_ok);
+      else
+	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
 	cmp_bytes = load_mode_size;
-      else
+      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
 	{
-	  /* Move this load back so it doesn't go past the end.  */
+	  /* Move this load back so it doesn't go past the end.
+	     P8/P9 can do this efficiently.  */
 	  int extra_bytes = load_mode_size - bytes;
 	  cmp_bytes = bytes;
 	  if (extra_bytes < offset)
@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[])
 	      bytes = cmp_bytes;
 	    }
 	}
+      else
+	/* P7 and earlier can't do the overlapping load trick fast,
+	   so this forces a non-overlapping load and a shift to get
+	   rid of the extra bytes.  */
+	cmp_bytes = bytes;
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -607,6 +607,9 @@ extern int rs6000_vector_align[];
 				 && TARGET_POWERPC64)
 #define TARGET_VEXTRACTUB	(TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
 				 && TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
+/* This should be set for p8 and newer.  On p7, overlapping unaligned
+   loads are slow.  */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
 /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
   in power7, so conditionalize them on p8 features.  TImode syncs need quad