Commit 6cc4833a by Julian Brown, committed by Julian Brown

gcc/

    * config/arm/arm.c (arm_block_move_unaligned_straight)
    (arm_adjust_block_mem, arm_block_move_unaligned_loop)
    (arm_movmemqi_unaligned): New.
    (arm_gen_movmemqi): Support unaligned block copies.

gcc/testsuite/
    * lib/target-supports.exp (check_effective_target_arm_unaligned):
    New.
    * gcc.target/arm/unaligned-memcpy-1.c: New.
    * gcc.target/arm/unaligned-memcpy-2.c: New.
    * gcc.target/arm/unaligned-memcpy-3.c: New.
    * gcc.target/arm/unaligned-memcpy-4.c: New.

From-SVN: r180131
parent 00820a2a
gcc/ChangeLog

2011-10-18  Julian Brown  <julian@codesourcery.com>

	* config/arm/arm.c (arm_block_move_unaligned_straight)
	(arm_adjust_block_mem, arm_block_move_unaligned_loop)
	(arm_movmemqi_unaligned): New.
	(arm_gen_movmemqi): Support unaligned block copies.

2011-10-18  Ira Rosen  <ira.rosen@linaro.org>

	* doc/md.texi (vec_widen_ushiftl_hi, vec_widen_ushiftl_lo,
...

gcc/config/arm/arm.c
@@ -10766,6 +10766,335 @@ gen_const_stm_seq (rtx *operands, int nops)
  return true;
}
/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
   unaligned copies on processors which support unaligned semantics for those
   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
   An interleave factor of 1 (the minimum) will perform no interleaving.
   Load/store multiple are used for aligned addresses where possible.  */

static void
arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
                                   HOST_WIDE_INT length,
                                   unsigned int interleave_factor)
{
  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
  int *regnos = XALLOCAVEC (int, interleave_factor);
  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
  HOST_WIDE_INT i, j;
  HOST_WIDE_INT remaining = length, words;
  rtx halfword_tmp = NULL, byte_tmp = NULL;
  rtx dst, src;
  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
  HOST_WIDE_INT srcoffset, dstoffset;
  HOST_WIDE_INT src_autoinc, dst_autoinc;
  rtx mem, addr;

  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);

  /* Use hard registers if we have aligned source or destination so we can use
     load/store multiple with contiguous registers.  */
  if (dst_aligned || src_aligned)
    for (i = 0; i < interleave_factor; i++)
      regs[i] = gen_rtx_REG (SImode, i);
  else
    for (i = 0; i < interleave_factor; i++)
      regs[i] = gen_reg_rtx (SImode);

  dst = copy_addr_to_reg (XEXP (dstbase, 0));
  src = copy_addr_to_reg (XEXP (srcbase, 0));

  srcoffset = dstoffset = 0;

  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
     For copying the last bytes we want to subtract this offset again.  */
  src_autoinc = dst_autoinc = 0;

  for (i = 0; i < interleave_factor; i++)
    regnos[i] = i;

  /* Copy BLOCK_SIZE_BYTES chunks.  */

  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
    {
      /* Load words.  */
      if (src_aligned && interleave_factor > 1)
        {
          emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
                                            TRUE, srcbase, &srcoffset));
          src_autoinc += UNITS_PER_WORD * interleave_factor;
        }
      else
        {
          for (j = 0; j < interleave_factor; j++)
            {
              addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
                                         - src_autoinc);
              mem = adjust_automodify_address (srcbase, SImode, addr,
                                               srcoffset + j * UNITS_PER_WORD);
              emit_insn (gen_unaligned_loadsi (regs[j], mem));
            }
          srcoffset += block_size_bytes;
        }

      /* Store words.  */
      if (dst_aligned && interleave_factor > 1)
        {
          emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
                                             TRUE, dstbase, &dstoffset));
          dst_autoinc += UNITS_PER_WORD * interleave_factor;
        }
      else
        {
          for (j = 0; j < interleave_factor; j++)
            {
              addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
                                         - dst_autoinc);
              mem = adjust_automodify_address (dstbase, SImode, addr,
                                               dstoffset + j * UNITS_PER_WORD);
              emit_insn (gen_unaligned_storesi (mem, regs[j]));
            }
          dstoffset += block_size_bytes;
        }

      remaining -= block_size_bytes;
    }

  /* Copy any whole words left (note these aren't interleaved with any
     subsequent halfword/byte load/stores in the interests of simplicity).  */

  words = remaining / UNITS_PER_WORD;

  gcc_assert (words < interleave_factor);

  if (src_aligned && words > 1)
    {
      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
                                        &srcoffset));
      src_autoinc += UNITS_PER_WORD * words;
    }
  else
    {
      for (j = 0; j < words; j++)
        {
          addr = plus_constant (src,
                                srcoffset + j * UNITS_PER_WORD - src_autoinc);
          mem = adjust_automodify_address (srcbase, SImode, addr,
                                           srcoffset + j * UNITS_PER_WORD);
          emit_insn (gen_unaligned_loadsi (regs[j], mem));
        }
      srcoffset += words * UNITS_PER_WORD;
    }

  if (dst_aligned && words > 1)
    {
      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
                                         &dstoffset));
      dst_autoinc += words * UNITS_PER_WORD;
    }
  else
    {
      for (j = 0; j < words; j++)
        {
          addr = plus_constant (dst,
                                dstoffset + j * UNITS_PER_WORD - dst_autoinc);
          mem = adjust_automodify_address (dstbase, SImode, addr,
                                           dstoffset + j * UNITS_PER_WORD);
          emit_insn (gen_unaligned_storesi (mem, regs[j]));
        }
      dstoffset += words * UNITS_PER_WORD;
    }

  remaining -= words * UNITS_PER_WORD;

  gcc_assert (remaining < 4);

  /* Copy a halfword if necessary.  */

  if (remaining >= 2)
    {
      halfword_tmp = gen_reg_rtx (SImode);

      addr = plus_constant (src, srcoffset - src_autoinc);
      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));

      /* Either write out immediately, or delay until we've loaded the last
         byte, depending on interleave factor.  */

      if (interleave_factor == 1)
        {
          addr = plus_constant (dst, dstoffset - dst_autoinc);
          mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
          emit_insn (gen_unaligned_storehi (mem,
                       gen_lowpart (HImode, halfword_tmp)));
          halfword_tmp = NULL;
          dstoffset += 2;
        }

      remaining -= 2;
      srcoffset += 2;
    }

  gcc_assert (remaining < 2);

  /* Copy last byte.  */

  if ((remaining & 1) != 0)
    {
      byte_tmp = gen_reg_rtx (SImode);

      addr = plus_constant (src, srcoffset - src_autoinc);
      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);

      if (interleave_factor == 1)
        {
          addr = plus_constant (dst, dstoffset - dst_autoinc);
          mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
          emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
          byte_tmp = NULL;
          dstoffset++;
        }

      remaining--;
      srcoffset++;
    }

  /* Store last halfword if we haven't done so already.  */

  if (halfword_tmp)
    {
      addr = plus_constant (dst, dstoffset - dst_autoinc);
      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
      emit_insn (gen_unaligned_storehi (mem,
                   gen_lowpart (HImode, halfword_tmp)));
      dstoffset += 2;
    }

  /* Likewise for last byte.  */

  if (byte_tmp)
    {
      addr = plus_constant (dst, dstoffset - dst_autoinc);
      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
      dstoffset++;
    }

  gcc_assert (remaining == 0 && srcoffset == dstoffset);
}
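[Editorial note, not part of the patch: the copy sequence this function emits can be summarised as a minimal C sketch. The helper name copy_straight is hypothetical, and memcpy stands in for the individual unaligned ldr/str (or aligned ldm/stm) instructions the real code generates.]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the emitted sequence: whole interleaved word
   blocks first, then remaining words, then a halfword, then a byte.  */
static void
copy_straight (unsigned char *dst, const unsigned char *src, size_t len,
               unsigned interleave)
{
  uint32_t regs[4];   /* at most four "registers", as the assert requires */
  size_t block = interleave * sizeof (uint32_t);
  size_t i, j;

  /* Whole blocks: load all words, then store all words, mirroring the
     load/load/store/store pattern for an interleave factor of 2.  */
  for (i = 0; i + block <= len; i += block)
    {
      for (j = 0; j < interleave; j++)
        memcpy (&regs[j], src + i + 4 * j, 4);   /* unaligned word load */
      for (j = 0; j < interleave; j++)
        memcpy (dst + i + 4 * j, &regs[j], 4);   /* unaligned word store */
    }

  /* Remaining whole words, then one halfword, then one byte.  */
  for (; i + 4 <= len; i += 4)
    memcpy (dst + i, src + i, 4);
  if (i + 2 <= len)
    {
      memcpy (dst + i, src + i, 2);   /* ldrh/strh */
      i += 2;
    }
  if (i < len)
    dst[i] = src[i];                  /* ldrb/strb */
}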
/* From mips_adjust_block_mem:

   Helper function for doing a loop-based block operation on memory
   reference MEM.  Each iteration of the loop will operate on LENGTH
   bytes of MEM.

   Create a new base register for use within the loop and point it to
   the start of MEM.  Create a new memory reference that uses this
   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */

static void
arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
                      rtx *loop_mem)
{
  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));

  /* Although the new mem does not refer to a known location,
     it does keep up to LENGTH bytes of alignment.  */
  *loop_mem = change_address (mem, BLKmode, *loop_reg);
  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
}
/* From mips_block_move_loop:

   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
   the memory regions do not overlap.  */

static void
arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
                               unsigned int interleave_factor,
                               HOST_WIDE_INT bytes_per_iter)
{
  rtx label, src_reg, dest_reg, final_src, test;
  HOST_WIDE_INT leftover;

  leftover = length % bytes_per_iter;
  length -= leftover;

  /* Create registers and memory references for use within the loop.  */
  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);

  /* Calculate the value that SRC_REG should have after the last iteration of
     the loop.  */
  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
                                   0, 0, OPTAB_WIDEN);

  /* Emit the start of the loop.  */
  label = gen_label_rtx ();
  emit_label (label);

  /* Emit the loop body.  */
  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
                                     interleave_factor);

  /* Move on to the next block.  */
  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));

  /* Emit the loop condition.  */
  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));

  /* Mop up any left-over bytes.  */
  if (leftover)
    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
}
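[Editorial note, not part of the patch: the loop decomposition above has a simple C shape. This hedged sketch reuses the hypothetical copy_straight helper from the earlier sketch; the real function works on RTL, precomputing the final source pointer so the loop condition is a single compare-and-branch.]

/* Illustrative sketch: main loop of bytes_per_iter-sized straight-line
   copies, then one more straight-line copy for the remainder.  Assumes
   len >= bytes_per_iter, as the real function does.  */
static void
copy_loop (unsigned char *dst, const unsigned char *src, size_t len,
           unsigned interleave, size_t bytes_per_iter)
{
  size_t leftover = len % bytes_per_iter;
  const unsigned char *final_src = src + (len - leftover);

  /* Bump both pointers until SRC reaches its precomputed final value.  */
  while (src != final_src)
    {
      copy_straight (dst, src, bytes_per_iter, interleave);
      src += bytes_per_iter;
      dst += bytes_per_iter;
    }

  /* Mop up the leftover bytes.  */
  if (leftover)
    copy_straight (dst, src, leftover, interleave);
}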
/* Emit a block move when either the source or destination is unaligned (not
   aligned to a four-byte boundary).  This may need further tuning depending on
   core type, optimize_size setting, etc.  */

static int
arm_movmemqi_unaligned (rtx *operands)
{
  HOST_WIDE_INT length = INTVAL (operands[2]);

  if (optimize_size)
    {
      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
         size of code if optimizing for size.  We'll use ldm/stm if src_aligned
         or dst_aligned though: allow more interleaving in those cases since
         the resulting code can be smaller.  */
      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;

      if (length > 12)
        arm_block_move_unaligned_loop (operands[0], operands[1], length,
                                       interleave_factor, bytes_per_iter);
      else
        arm_block_move_unaligned_straight (operands[0], operands[1], length,
                                           interleave_factor);
    }
  else
    {
      /* Note that the loop created by arm_block_move_unaligned_loop may be
         subject to loop unrolling, which makes tuning this condition a little
         redundant.  */
      if (length > 32)
        arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
      else
        arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
    }

  return 1;
}
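[Editorial note, not part of the patch: the tuning above boils down to a small decision table, distilled here as a hedged C sketch with hypothetical names.]

/* Illustrative sketch of the parameter choices arm_movmemqi_unaligned
   makes: loop vs. straight-line, interleave factor, and loop chunk size.  */
struct copy_plan { int use_loop; unsigned interleave; long bytes_per_iter; };

static struct copy_plan
choose_plan (long length, int either_side_aligned, int optimizing_for_size)
{
  struct copy_plan p;
  if (optimizing_for_size)
    {
      /* ldm/stm keeps interleaved code small when one side is aligned.  */
      p.interleave = either_side_aligned ? 2 : 1;
      p.bytes_per_iter = either_side_aligned ? 8 : 4;
      p.use_loop = length > 12;
    }
  else
    {
      /* Optimizing for speed: maximum interleave, 16-byte loop chunks.  */
      p.interleave = 4;
      p.bytes_per_iter = 16;
      p.use_loop = length > 32;
    }
  return p;
}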
int
arm_gen_movmemqi (rtx *operands)
{
...
@@ -10778,8 +11107,13 @@ arm_gen_movmemqi (rtx *operands)
  if (GET_CODE (operands[2]) != CONST_INT
      || GET_CODE (operands[3]) != CONST_INT
      || INTVAL (operands[2]) > 64)
    return 0;

  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
    return arm_movmemqi_unaligned (operands);

  if (INTVAL (operands[3]) & 3)
    return 0;

  dstbase = operands[0];
...
gcc/testsuite/ChangeLog

2011-10-18  Julian Brown  <julian@codesourcery.com>

	* lib/target-supports.exp (check_effective_target_arm_unaligned): New.
	* gcc.target/arm/unaligned-memcpy-1.c: New.
	* gcc.target/arm/unaligned-memcpy-2.c: New.
	* gcc.target/arm/unaligned-memcpy-3.c: New.
	* gcc.target/arm/unaligned-memcpy-4.c: New.

2011-10-18  Janus Weil  <janus@gcc.gnu.org>

	PR fortran/47023
...
gcc/testsuite/gcc.target/arm/unaligned-memcpy-1.c

/* { dg-do compile } */
/* { dg-require-effective-target arm_unaligned } */
/* { dg-options "-O2" } */

#include <string.h>

void unknown_alignment (char *dest, char *src)
{
  memcpy (dest, src, 15);
}

/* We should see three unaligned word load/store pairs, one unaligned
   ldrh/strh pair, and an ldrb/strb pair.  Sanity check that.  */

/* { dg-final { scan-assembler-times "@ unaligned" 8 } } */
/* { dg-final { scan-assembler-times "ldrh" 1 } } */
/* { dg-final { scan-assembler-times "strh" 1 } } */
/* { dg-final { scan-assembler-times "ldrb" 1 } } */
/* { dg-final { scan-assembler-times "strb" 1 } } */
gcc/testsuite/gcc.target/arm/unaligned-memcpy-2.c

/* { dg-do compile } */
/* { dg-require-effective-target arm_unaligned } */
/* { dg-options "-O2" } */

#include <string.h>

char dest[16];

void aligned_dest (char *src)
{
  memcpy (dest, src, 15);
}

/* Expect a multi-word store for the main part of the copy, but subword
   loads/stores for the remainder.  */

/* { dg-final { scan-assembler-times "stmia" 1 } } */
/* { dg-final { scan-assembler-times "ldrh" 1 } } */
/* { dg-final { scan-assembler-times "strh" 1 } } */
/* { dg-final { scan-assembler-times "ldrb" 1 } } */
/* { dg-final { scan-assembler-times "strb" 1 } } */
gcc/testsuite/gcc.target/arm/unaligned-memcpy-3.c

/* { dg-do compile } */
/* { dg-require-effective-target arm_unaligned } */
/* { dg-options "-O2" } */

#include <string.h>

char src[16];

void aligned_src (char *dest)
{
  memcpy (dest, src, 15);
}

/* Expect a multi-word load for the main part of the copy, but subword
   loads/stores for the remainder.  */

/* { dg-final { scan-assembler-times "ldmia" 1 } } */
/* { dg-final { scan-assembler-times "ldrh" 1 } } */
/* { dg-final { scan-assembler-times "strh" 1 } } */
/* { dg-final { scan-assembler-times "ldrb" 1 } } */
/* { dg-final { scan-assembler-times "strb" 1 } } */
gcc/testsuite/gcc.target/arm/unaligned-memcpy-4.c

/* { dg-do compile } */
/* { dg-require-effective-target arm_unaligned } */
/* { dg-options "-O2" } */

#include <string.h>

char src[16];
char dest[16];

void aligned_both (void)
{
  memcpy (dest, src, 15);
}

/* We know both src and dest to be aligned: expect multiword loads/stores.  */

/* { dg-final { scan-assembler-times "ldmia" 1 } } */
/* { dg-final { scan-assembler-times "stmia" 1 } } */
gcc/testsuite/lib/target-supports.exp
@@ -1973,6 +1973,18 @@ proc check_effective_target_arm_dsp { } {
    }]
}
# Return 1 if this is an ARM target that supports unaligned word/halfword
# load/store instructions.

proc check_effective_target_arm_unaligned { } {
    return [check_no_compiler_messages arm_unaligned assembly {
	#ifndef __ARM_FEATURE_UNALIGNED
	#error no unaligned support
	#endif
	int i;
    }]
}
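[Editorial note: the proc above keys off the __ARM_FEATURE_UNALIGNED predefine, which GCC sets when the target permits unaligned word/halfword access (e.g. ARMv6 and later, assuming -mno-unaligned-access is not given). A standalone C version of the probe it compiles:]

/* Fails to compile unless the compiler advertises unaligned support,
   which is exactly the condition the effective-target check tests.  */
#ifndef __ARM_FEATURE_UNALIGNED
#error no unaligned support
#endif

int i;  /* ensure the translation unit is non-empty */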
# Add the options needed for NEON.  We need either -mfloat-abi=softfp
# or -mfloat-abi=hard, but if one is already specified by the
# multilib, use it.  Similarly, if a -mfpu option already enables
...