Commit 87b44b83 by Aaron Sawdey Committed by Aaron Sawdey

rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can…

rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads.

2016-10-09  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): 
	Add macro to say we can efficiently handle overlapping unaligned
	loads.
	* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
	poor code for processors older than p8.

From-SVN: r240908
parent 4815e7d4
2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
Add macro to say we can efficiently handle overlapping unaligned
loads.
* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
poor code for processors older than p8.
2016-10-09 Eric Botcazou <ebotcazou@adacore.com> 2016-10-09 Eric Botcazou <ebotcazou@adacore.com>
* gen-pass-instances.awk: Remove GNUism. * gen-pass-instances.awk: Remove GNUism.
......
...@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[]) ...@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[])
if (bytes <= 0) if (bytes <= 0)
return true; return true;
/* The code generated for p7 and older is not faster than glibc
memcmp if alignment is small and length is not short, so bail
out to avoid those conditions. */
if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
&& ((base_align == 1 && bytes > 16)
|| (base_align == 2 && bytes > 32)))
return false;
rtx tmp_reg_src1 = gen_reg_rtx (word_mode); rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
rtx tmp_reg_src2 = gen_reg_rtx (word_mode); rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
...@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[]) ...@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[])
while (bytes > 0) while (bytes > 0)
{ {
int align = compute_current_alignment (base_align, offset); int align = compute_current_alignment (base_align, offset);
load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok); if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
load_mode = select_block_compare_mode (offset, bytes, align,
word_mode_ok);
else
load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
load_mode_size = GET_MODE_SIZE (load_mode); load_mode_size = GET_MODE_SIZE (load_mode);
if (bytes >= load_mode_size) if (bytes >= load_mode_size)
cmp_bytes = load_mode_size; cmp_bytes = load_mode_size;
else else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
{ {
/* Move this load back so it doesn't go past the end. */ /* Move this load back so it doesn't go past the end.
P8/P9 can do this efficiently. */
int extra_bytes = load_mode_size - bytes; int extra_bytes = load_mode_size - bytes;
cmp_bytes = bytes; cmp_bytes = bytes;
if (extra_bytes < offset) if (extra_bytes < offset)
...@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[]) ...@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[])
bytes = cmp_bytes; bytes = cmp_bytes;
} }
} }
else
/* P7 and earlier can't do the overlapping load trick fast,
so this forces a non-overlapping load and a shift to get
rid of the extra bytes. */
cmp_bytes = bytes;
src1 = adjust_address (orig_src1, load_mode, offset); src1 = adjust_address (orig_src1, load_mode, offset);
src2 = adjust_address (orig_src2, load_mode, offset); src2 = adjust_address (orig_src2, load_mode, offset);
...@@ -607,6 +607,9 @@ extern int rs6000_vector_align[]; ...@@ -607,6 +607,9 @@ extern int rs6000_vector_align[];
&& TARGET_POWERPC64) && TARGET_POWERPC64)
#define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \ #define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
&& TARGET_UPPER_REGS_DI && TARGET_POWERPC64) && TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
/* This wants to be set for p8 and newer. On p7, overlapping unaligned
loads are slow. */
#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
/* Byte/char syncs were added as phased in for ISA 2.06B, but are not present /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features. TImode syncs need quad in power7, so conditionalize them on p8 features. TImode syncs need quad
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment