Commit 89c52e5e authored by Tamar Christina, committed by Tamar Christina

Simplify movmem code by always doing overlapping copies when larger than 8 bytes on AArch64.

This changes the movmem code on AArch64 that copies data between 4 and 7
bytes so that it uses the smallest possible mode capable of copying the
remaining bytes in one go, overlapping the reads if needed.

This means that if we're copying 5 bytes we now issue an SImode and a QImode
load instead of two overlapping SImode loads.

This results in smaller memory accesses, but it also gives the mid-end a
chance to realise that it can CSE the loads in certain circumstances, e.g.
when you have something like

return foo;

where foo is a struct. This would be transformed by the mid-end into SSA form as

D.XXXX = foo;

return D.XXXX;

This movmem routine will handle the first copy, but that copy is usually not
needed: for the 5-byte example the mid-end would do SImode and QImode stores
into X0, yet without the first copies being done in the same modes it cannot
see that the stores from the first copy are not needed at all.
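
Concretely (a minimal sketch of the 5-byte case; the struct and function
names here are invented for illustration, they are not taken from the patch):

    struct S5 { char a, b, c, d, e; };  /* 5 bytes.  */
    struct S5 foo;

    struct S5
    ret_foo (void)
    {
      return foo;  /* Gimplified to D.XXXX = foo; return D.XXXX;.  When the
                      block copy is expanded as an SImode + QImode pair, it
                      matches the SImode and QImode stores the mid-end emits
                      into X0, so the stores of the first copy become
                      visibly dead.  */
    }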

From-SVN: r262434
parent bdfc619e
2018-07-05  Tamar Christina  <tamar.christina@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_movmem): Fix mode size.

2018-07-05  Jakub Jelinek  <jakub@redhat.com>

	Revert
......
@@ -16137,26 +16137,29 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_movmem (rtx *operands)
 {
-  unsigned int n;
+  int n, mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
   bool speed_p = !optimize_function_for_size_p (cfun);
 
   /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
+     will always require an even number of instructions to do now.  And each
+     operation requires both a load+store, so divide the max number by 2.  */
+  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  n = UINTVAL (operands[2]);
+  n = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions low.  For all cases we will do at
+     most two moves for the residual amount, since we'll always overlap the
+     remainder.  */
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -16165,81 +16168,36 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
-     1-byte chunk.  */
-  if (n < 4)
-    {
-      if (n >= 2)
-        {
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-          n -= 2;
-        }
-
-      if (n == 1)
-        aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-
-      return true;
-    }
+  /* Convert n to bits to make the rest of the code simpler.  */
+  n = n * BITS_PER_UNIT;
 
-  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
-     4-byte chunk, partially overlapping with the previously copied chunk.  */
-  if (n < 8)
+  while (n > 0)
     {
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-      n -= 4;
-      if (n > 0)
-        {
-          int move = n - 4;
-
-          src = aarch64_move_pointer (src, move);
-          dst = aarch64_move_pointer (dst, move);
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-        }
-      return true;
-    }
-
-  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
-     them, then (if applicable) an 8-byte chunk.  */
-  while (n >= 8)
-    {
-      if (n / 16)
-        {
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
-          n -= 16;
-        }
-      else
-        {
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
-          n -= 8;
-        }
-    }
+      /* Find the largest mode in which to do the copy without over-reading
+         or over-writing.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+        if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
+          cur_mode = mode_iter.require ();
+
+      gcc_assert (cur_mode != BLKmode);
+
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
 
-  /* Finish the final bytes of the copy.  We can always do this in one
-     instruction.  We either copy the exact amount we need, or partially
-     overlap with the previous chunk we copied and copy 8-bytes.  */
-  if (n == 0)
-    return true;
-  else if (n == 1)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-  else if (n == 2)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-  else if (n == 4)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-  else
-    {
-      if (n == 3)
-        {
-          src = aarch64_move_pointer (src, -1);
-          dst = aarch64_move_pointer (dst, -1);
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-        }
-      else
-        {
-          int move = n - 8;
+      n -= mode_bits;
 
-          src = aarch64_move_pointer (src, move);
-          dst = aarch64_move_pointer (dst, move);
-          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+      /* Do certain trailing copies as overlapping if it's going to be
+         cheaper, i.e. fewer instructions.  For instance, for a 15-byte copy
+         it's more efficient to do two overlapping 8-byte copies than
+         8 + 4 + 2 + 1.  */
+      next_mode = smallest_mode_for_size (n, MODE_INT);
+      int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+      if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
+        {
+          src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
+          dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+          n = n_bits;
         }
     }
 
   return true;
......
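
As an aside for readers, the chunking strategy is easy to model outside the
compiler.  The following standalone C sketch (all names invented; plain
integers stand in for GCC's machine modes) mimics the mode-selection and
overlap logic above:

    #include <stdio.h>

    /* Bit widths of the scalar integer modes the loop can pick:
       QImode, HImode, SImode, DImode, TImode.  */
    static const int mode_bits[] = { 8, 16, 32, 64, 128 };
    #define N_MODES ((int) (sizeof mode_bits / sizeof mode_bits[0]))

    static void
    plan_copy (int n_bytes)
    {
      int n = n_bytes * 8;   /* Work in bits, as the patch does.  */
      int offset = 0;        /* Byte offset the pointers have advanced to.  */

      printf ("%2d bytes:", n_bytes);
      while (n > 0)
        {
          /* Largest mode that fits in the remaining bits.  */
          int cur = 8;
          for (int i = 0; i < N_MODES; i++)
            if (mode_bits[i] <= n)
              cur = mode_bits[i];

          printf ("  copy %d at offset %d;", cur / 8, offset);
          offset += cur / 8;
          n -= cur;

          /* Smallest mode that covers the remainder.  If it over-reads a
             little and is at most 8 bytes, back the pointers up and finish
             with a single overlapping copy.  */
          int next = 128;
          for (int i = N_MODES - 1; i >= 0; i--)
            if (mode_bits[i] >= n)
              next = mode_bits[i];

          if (n > 0 && next > n && next <= 64)
            {
              offset += (n - next) / 8;  /* Negative: move pointers back.  */
              n = next;
            }
        }
      printf ("\n");
    }

    int
    main (void)
    {
      plan_copy (5);   /* copy 4 at 0, copy 1 at 4: SImode then QImode.  */
      plan_copy (15);  /* copy 8 at 0, copy 8 at 7: two overlapping DImode.  */
      return 0;
    }

Running it shows a 15-byte copy becoming an 8-byte copy at offset 0 followed
by an overlapping 8-byte copy at offset 7, and a 5-byte copy becoming a
4-byte then a 1-byte copy, matching the examples in the commit message.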
2018-07-05  Tamar Christina  <tamar.christina@arm.com>

	* gcc.target/aarch64/struct_cpy.c: New.

2018-07-05  Christophe Lyon  <christophe.lyon@linaro.org>

	* c-c++-common/unroll-1.c: Remove 'note:' in matching string.
......
/* { dg-do compile } */
struct struct1 { char a;};
struct struct2 { char a, b;};
struct struct3 { char a, b, c; };
struct struct4 { char a, b, c, d; };
struct struct5 { char a, b, c, d, e; };
struct struct6 { char a, b, c, d, e, f; };
struct struct7 { char a, b, c, d, e, f, g; };
struct struct8 { char a, b, c, d, e, f, g, h; };
struct struct9 { char a, b, c, d, e, f, g, h, i; };
struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
struct struct13 { char a, b, c, d, e, f, g, h, i, j, k, l, m; };
struct struct14 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n; };
struct struct15 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o; };
struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
struct struct1 foo1 = {'1'};
struct struct2 foo2 = { 'a', 'b'};
struct struct3 foo3 = { 'A', 'B', 'C'};
struct struct4 foo4 = {'1', '2', '3', '4'};
struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'};
struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'};
struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'};
struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'};
struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'};
struct struct10 foo10 = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'};
struct struct11 foo11 = {
'1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'};
struct struct12 foo12 = {
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'};
struct struct13 foo13 = {
'a','b','c','d','e','f','g','h','i','j','k','l','m'};
struct struct14 foo14 = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n'};
struct struct15 foo15 = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'};
struct struct16 foo16 = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'};
#define FUN(x) void fun##x ()\
{ \
volatile struct struct##x var##x = foo##x; \
}
FUN(1);
FUN(2);
FUN(3);
FUN(4);
FUN(5);
FUN(6);
FUN(7);
FUN(8);
FUN(9);
FUN(10);
FUN(11);
FUN(12);
FUN(13);
FUN(14);
FUN(15);
FUN(16);
/* { dg-final { scan-assembler-times {ldr\s} 18 } } */
/* { dg-final { scan-assembler-times {ldrb} 4 } } */
/* { dg-final { scan-assembler-times {ldrh} 4 } } */
/* { dg-final { scan-assembler-times {ldp} 1 } } */
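
For reference, one way the expected totals can be accounted for under the
new expansion (a hand count from the algorithm above; the compiler's actual
schedule may of course differ):

  ldrb (4):  fun1, fun3, fun5 and fun9 each need one QImode load.
  ldrh (4):  fun2, fun3, fun6 and fun10 each need one HImode load.
  ldr (18):  one SImode or DImode load each for fun4, fun5, fun6, fun8,
             fun9 and fun10; two loads each for fun7 and fun11 through
             fun15, the second one overlapping whenever the remainder is
             not an exact mode size (e.g. 13 bytes = two overlapping
             8-byte loads, 12 bytes = an exact 8 + 4).
  ldp (1):   fun16 is a single TImode (16-byte) copy.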