Commit 8597cd33 by Andreas Krebbel Committed by Andreas Krebbel

S/390: Unroll mvc/xc loop for memset with small constant lengths.

When expanding a memset we emit a loop of MVC/XC instructions dealing
with 256 byte blocks.  This loop used to get unrolled by older GCCs
when using constant length operands.  GCC lost this ability, probably
when more of the loop unrolling logic was moved to the tree level.

With this patch the unrolling is done manually when emitting the RTL
insns.

gcc/testsuite/ChangeLog:

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* gcc.target/s390/memset-1.c: New test.

gcc/ChangeLog:

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* config/s390/s390.c (s390_expand_setmem): Unroll the loop for
	small constant length operands.

From-SVN: r244097
parent 587790e6
2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
* config/s390/s390.c (s390_expand_setmem): Unroll the loop for
small constant length operands.
2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
* config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes
between loop iterations.
......
......@@ -5348,34 +5348,46 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
{
const int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0)
if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0)
return;
gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode);
if (GET_CODE (len) == CONST_INT && INTVAL (len) > 0 && INTVAL (len) <= 257)
/* Expand setmem/clrmem for a constant length operand without a
loop if it will be shorter that way.
With a constant length and without pfd argument a
clrmem loop is 32 bytes -> 5.3 * xc
setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */
if (GET_CODE (len) == CONST_INT
&& ((INTVAL (len) <= 256 * 5 && val == const0_rtx)
|| INTVAL (len) <= 257 * 3)
&& (!TARGET_MVCLE || INTVAL (len) <= 256))
{
if (val == const0_rtx && INTVAL (len) <= 256)
emit_insn (gen_clrmem_short (dst, GEN_INT (INTVAL (len) - 1)));
else
{
/* Initialize memory by storing the first byte. */
emit_move_insn (adjust_address (dst, QImode, 0), val);
HOST_WIDE_INT o, l;
if (INTVAL (len) > 1)
{
/* Initiate 1 byte overlap move.
The first byte of DST is propagated through DSTP1.
Prepare a movmem for: DST+1 = DST (length = LEN - 1).
DST is set to size 1 so the rest of the memory location
does not count as source operand. */
rtx dstp1 = adjust_address (dst, VOIDmode, 1);
set_mem_size (dst, 1);
emit_insn (gen_movmem_short (dstp1, dst,
GEN_INT (INTVAL (len) - 2)));
}
}
if (val == const0_rtx)
/* clrmem: emit 256 byte blockwise XCs. */
for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256)
{
rtx newdst = adjust_address (dst, BLKmode, o);
emit_insn (gen_clrmem_short (newdst,
GEN_INT (l > 256 ? 255 : l - 1)));
}
else
/* setmem: emit 1(mvi) + 256(mvc) byte blockwise memsets by
setting first byte to val and using a 256 byte mvc with one
byte overlap to propagate the byte. */
for (l = INTVAL (len), o = 0; l > 0; l -= 257, o += 257)
{
rtx newdst = adjust_address (dst, BLKmode, o);
emit_move_insn (adjust_address (dst, QImode, o), val);
if (l > 1)
{
rtx newdstp1 = adjust_address (dst, BLKmode, o + 1);
emit_insn (gen_movmem_short (newdstp1, newdst,
GEN_INT (l > 257 ? 255 : l - 2)));
}
}
}
else if (TARGET_MVCLE)
......
/* Make sure that short memsets with constant length are emitted
without loop statements. */
/* { dg-do compile } */
/* { dg-options "-O3 -mzarch" } */
/* Variable fill byte, 42 bytes: the whole area fits into one
   1 (first byte store) + 256 (mvc) byte block, so 1 mvc is expected. */
void
*memset1(void *s, int c)
{
return __builtin_memset (s, c, 42);
}
/* Variable fill byte, 700 bytes: below the 257 * 3 unrolling limit and
   split into blocks of 257 + 257 + 186 bytes, so 3 mvc are expected. */
void
*memset2(void *s, int c)
{
return __builtin_memset (s, c, 700);
}
/* Zero length: there is nothing to set, so no mvc is expected (nop). */
void
*memset3(void *s, int c)
{
return __builtin_memset (s, c, 0);
}
/* Variable fill byte, 256 bytes: a single block (first byte store plus
   an mvc propagating it over the remaining 255 bytes), so 1 mvc is
   expected. */
void
*memset4(void *s, int c)
{
return __builtin_memset (s, c, 256);
}
/* Variable fill byte, 512 bytes: split into blocks of 257 + 255 bytes,
   so 2 mvc are expected. */
void
*memset5(void *s, int c)
{
return __builtin_memset (s, c, 512);
}
/* Variable fill byte, 514 bytes = 2 * 257: still 2 mvc, because every
   block covers 257 bytes - the directly stored first byte plus the
   256 bytes written by the overlapping mvc. */
void
*memset6(void *s, int c)
{
return __builtin_memset (s, c, 514);
}
/* Variable fill byte, 515 bytes = 2 * 257 + 1: a third block is started,
   but it is only one byte long, so it consists of the first byte store
   alone.  Expected: 2 mvc plus one extra byte store (this is the count
   that adds up to the 19 mvc checked by the dg-final directive below). */
void
*memset7(void *s, int c)
{
return __builtin_memset (s, c, 515);
}
/* Variable fill byte, 771 bytes = 3 * 257: the largest length still
   expanded without a loop.  Still 3 mvc, because each block also covers
   its directly stored first byte. */
void
*memset8(void *s, int c)
{
return __builtin_memset (s, c, 771);
}
/* Variable fill byte, 772 bytes: one byte above the 257 * 3 unrolling
   limit, so the mvc loop is used instead: 2 mvc. */
void
*memset9(void *s, int c)
{
return __builtin_memset (s, c, 772);
}
/* Variable fill byte, 700 bytes starting at offset 4000: 3 mvc as in
   memset2, with displacement overflow after the first one.  The second
   and third block start at offsets 4257 and 4514.
   NOTE(review): presumably these no longer fit the displacement field of
   the instruction, so the address has to be formed differently - confirm
   against the ISA documentation. */
void
*memset10(void *s, int c)
{
return __builtin_memset ((char*)s + 4000, c, 700);
}
/* Constant zero fill, 42 bytes: fits into a single 256 byte block, so
   1 xc is expected. */
void
*clrmem1(void *s)
{
return __builtin_memset (s, 0, 42);
}
/* Constant zero fill, 700 bytes: below the 256 * 5 unrolling limit and
   split into blocks of 256 + 256 + 188 bytes, so 3 xc are expected. */
void
*clrmem2(void *s)
{
return __builtin_memset (s, 0, 700);
}
/* Zero length: there is nothing to clear, so no xc is expected (nop). */
void
*clrmem3(void *s)
{
return __builtin_memset (s, 0, 0);
}
/* Constant zero fill, 256 bytes: exactly one full block, so 1 xc is
   expected. */
void
*clrmem4(void *s)
{
return __builtin_memset (s, 0, 256);
}
/* Constant zero fill, 512 bytes = 2 * 256: two full blocks, so 2 xc are
   expected. */
void
*clrmem5(void *s)
{
return __builtin_memset (s, 0, 512);
}
/* Constant zero fill, 768 bytes = 3 * 256: three full blocks, so 3 xc
   are expected. */
void
*clrmem6(void *s)
{
return __builtin_memset (s, 0, 768);
}
/* Constant zero fill, 1281 bytes: one byte above the 256 * 5 unrolling
   limit, so this is the first length for which the xc loop is used. */
void
*clrmem7(void *s)
{
return __builtin_memset (s, 0, 1281);
}
/* Constant zero fill, 700 bytes starting at offset 4000: 3 xc as in
   clrmem2, with displacement overflow after the first one.  The second
   and third block start at offsets 4256 and 4512.
   NOTE(review): presumably these no longer fit the displacement field of
   the instruction - confirm against the ISA documentation.

   The pointer is cast to char* before adding the offset, as memset10
   does: arithmetic on void* is a GNU extension, not ISO C.  Returns the
   start of the cleared area, i.e. S + 4000 bytes. */
void
*clrmem8(void *s)
{
  return __builtin_memset ((char*)s + 4000, 0, 700);
}
/* { dg-final { scan-assembler-times "mvc" 19 } } */
/* { dg-final { scan-assembler-times "xc" 15 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment