Commit 930b700b by Michael Zolotukhin, committed by Kirill Yukhin

i386-opts.h (enum stringop_alg): Add vector_loop.

ChangeLog:
        * config/i386/i386-opts.h (enum stringop_alg): Add vector_loop.
        * config/i386/i386.c (expand_set_or_movmem_via_loop): Use
        adjust_address instead of change_address to keep info about alignment.
        (emit_strmov): Remove.
        (emit_memmov): New function.
        (expand_movmem_epilogue): Refactor to properly handle bigger sizes.
        (expand_movmem_prologue): Likewise and return updated rtx for
        destination.
        (expand_constant_movmem_prologue): Likewise and return updated rtx for
        destination and source.
        (decide_alignment): Refactor, handle vector_loop.
        (ix86_expand_movmem): Likewise.
        (ix86_expand_setmem): Likewise.
        * config/i386/i386.opt (Enum): Add vector_loop to option stringop_alg.

testsuite/ChangeLog:
        * gcc.target/i386/memcpy-vector_loop-1.c: New.
        * gcc.target/i386/memcpy-vector_loop-2.c: New.

From-SVN: r200751
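The new strategy is selected with -mstringop-strategy=vector_loop (the flags are the ones the new tests below use). A minimal usage sketch, assuming an SSE2-capable target; the file name and array size are illustrative:

    /* t.c - compile with:
         gcc -O2 -minline-all-stringops -mstringop-strategy=vector_loop -S t.c
       With SSE2 available the copy is expanded inline as a loop of
       16-byte vector moves (movdqa) instead of a library call.  */
    char a[2048];
    char b[2048];
    void t (void)
    {
      __builtin_memcpy (a, b, 2048);
    }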
gcc/config/i386/i386-opts.h:
@@ -35,7 +35,8 @@ enum stringop_alg
   rep_prefix_8_byte,
   loop_1_byte,
   loop,
-  unrolled_loop
+  unrolled_loop,
+  vector_loop
 };
 
 /* Available call abi.  */

gcc/config/i386/i386.c:
@@ -21953,11 +21953,10 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
 {
   rtx out_label, top_label, iter, tmp;
   enum machine_mode iter_mode = counter_mode (count);
-  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
+  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
+  rtx piece_size = GEN_INT (piece_size_n);
   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
   rtx size;
-  rtx x_addr;
-  rtx y_addr;
   int i;
 
   top_label = gen_label_rtx ();
@@ -21978,13 +21977,18 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
   emit_label (top_label);
 
   tmp = convert_modes (Pmode, iter_mode, iter, true);
-  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
-  destmem = change_address (destmem, mode, x_addr);
+
+  /* This assert could be relaxed - in this case we'll need to compute
+     smallest power of two, containing in PIECE_SIZE_N and pass it to
+     offset_address.  */
+  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
+  destmem = offset_address (destmem, tmp, piece_size_n);
+  destmem = adjust_address (destmem, mode, 0);
 
   if (srcmem)
     {
-      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
-      srcmem = change_address (srcmem, mode, y_addr);
+      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
+      srcmem = adjust_address (srcmem, mode, 0);
 
       /* When unrolling for chips that reorder memory reads and writes,
          we can save registers by using single temporary.
@@ -22163,13 +22167,76 @@ expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
 }
 
-static void
-emit_strmov (rtx destmem, rtx srcmem,
-             rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
+/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
+   DESTMEM.
+   SRC is passed by pointer to be updated on return.
+   Return value is updated DST.  */
+static rtx
+emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
+             HOST_WIDE_INT size_to_move)
 {
-  rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
-  rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
-  emit_insn (gen_strmov (destptr, dest, srcptr, src));
+  rtx dst = destmem, src = *srcmem, adjust, tempreg;
+  enum insn_code code;
+  enum machine_mode move_mode;
+  int piece_size, i;
+
+  /* Find the widest mode in which we could perform moves.
+     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
+     it until move of such size is supported.  */
+  piece_size = 1 << floor_log2 (size_to_move);
+  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
+  code = optab_handler (mov_optab, move_mode);
+  while (code == CODE_FOR_nothing && piece_size > 1)
+    {
+      piece_size >>= 1;
+      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
+      code = optab_handler (mov_optab, move_mode);
+    }
+
+  /* Find the corresponding vector mode with the same size as MOVE_MODE.
+     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
+  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+    {
+      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+      move_mode = mode_for_vector (word_mode, nunits);
+      code = optab_handler (mov_optab, move_mode);
+      if (code == CODE_FOR_nothing)
+        {
+          move_mode = word_mode;
+          piece_size = GET_MODE_SIZE (move_mode);
+          code = optab_handler (mov_optab, move_mode);
+        }
+    }
+  gcc_assert (code != CODE_FOR_nothing);
+
+  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
+
+  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
+  gcc_assert (size_to_move % piece_size == 0);
+  adjust = GEN_INT (piece_size);
+  for (i = 0; i < size_to_move; i += piece_size)
+    {
+      /* We move from memory to memory, so we'll need to do it via
+         a temporary register.  */
+      tempreg = gen_reg_rtx (move_mode);
+      emit_insn (GEN_FCN (code) (tempreg, src));
+      emit_insn (GEN_FCN (code) (dst, tempreg));
+
+      emit_move_insn (destptr,
+                      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+      emit_move_insn (srcptr,
+                      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+
+      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+                                          piece_size);
+      src = adjust_automodify_address_nv (src, move_mode, srcptr,
                                          piece_size);
+    }
+
+  /* Update DST and SRC rtx.  */
+  *srcmem = src;
+  return dst;
 }
 
 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
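The mode selection in emit_memmov starts from the largest power of two not exceeding SIZE_TO_MOVE and halves it until a move of that width is supported. A standalone sketch of that search, with a hypothetical is_supported() predicate standing in for the optab_handler lookup:

    /* Pick the widest piece size usable for copying SIZE bytes.
       is_supported() models
       optab_handler (mov_optab, mode) != CODE_FOR_nothing.  */
    extern int is_supported (int piece_bytes);

    static int
    choose_piece_size (int size)
    {
      int piece = 1;
      while (piece * 2 <= size)          /* 1 << floor_log2 (size) */
        piece *= 2;
      while (piece > 1 && !is_supported (piece))
        piece >>= 1;
      return piece;
    }

For size 24 the search starts at 16 and falls back to 8, 4, ... only if wider moves are unavailable.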
@@ -22181,44 +22248,17 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
   if (CONST_INT_P (count))
     {
       HOST_WIDE_INT countval = INTVAL (count);
-      int offset = 0;
+      HOST_WIDE_INT epilogue_size = countval % max_size;
+      int i;
 
-      if ((countval & 0x10) && max_size > 16)
+      /* For now MAX_SIZE should be a power of 2.  This assert could be
+         relaxed, but it'll require a bit more complicated epilogue
+         expanding.  */
+      gcc_assert ((max_size & (max_size - 1)) == 0);
+      for (i = max_size; i >= 1; i >>= 1)
         {
-          if (TARGET_64BIT)
-            {
-              emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
-              emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
-            }
-          else
-            gcc_unreachable ();
-          offset += 16;
-        }
-      if ((countval & 0x08) && max_size > 8)
-        {
-          if (TARGET_64BIT)
-            emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
-          else
-            {
-              emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
-              emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
-            }
-          offset += 8;
-        }
-      if ((countval & 0x04) && max_size > 4)
-        {
-          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
-          offset += 4;
-        }
-      if ((countval & 0x02) && max_size > 2)
-        {
-          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
-          offset += 2;
-        }
-      if ((countval & 0x01) && max_size > 1)
-        {
-          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
-          offset += 1;
+          if (epilogue_size & i)
+            destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
         }
       return;
     }
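The rewritten epilogue decomposes countval % max_size into powers of two and emits one copy per set bit, largest piece first. A sketch of the decomposition, assuming max_size is a power of two (as the new assert requires); __builtin_memcpy stands in for emit_memmov:

    /* For countval % max_size == 13 (binary 1101) this copies
       8, 4 and 1 bytes, in that order.  */
    static void
    copy_epilogue (char *dst, const char *src, int countval, int max_size)
    {
      int epilogue_size = countval % max_size;
      int i;
      for (i = max_size; i >= 1; i >>= 1)
        if (epilogue_size & i)
          {
            __builtin_memcpy (dst, src, i);
            dst += i;
            src += i;
          }
    }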
@@ -22454,47 +22494,33 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
 }
 
 /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
-   DESIRED_ALIGNMENT.  */
-static void
+   DESIRED_ALIGNMENT.
+   Return value is updated DESTMEM.  */
+static rtx
 expand_movmem_prologue (rtx destmem, rtx srcmem,
                         rtx destptr, rtx srcptr, rtx count,
                         int align, int desired_alignment)
 {
-  if (align <= 1 && desired_alignment > 1)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 1, false);
-      srcmem = change_address (srcmem, QImode, srcptr);
-      destmem = change_address (destmem, QImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 1);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 2 && desired_alignment > 2)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 2, false);
-      srcmem = change_address (srcmem, HImode, srcptr);
-      destmem = change_address (destmem, HImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 2);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 4 && desired_alignment > 4)
+  int i;
+  for (i = 1; i < desired_alignment; i <<= 1)
     {
-      rtx label = ix86_expand_aligntest (destptr, 4, false);
-      srcmem = change_address (srcmem, SImode, srcptr);
-      destmem = change_address (destmem, SImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 4);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+      if (align <= i)
+        {
+          rtx label = ix86_expand_aligntest (destptr, i, false);
+          destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+          ix86_adjust_counter (count, i);
+          emit_label (label);
+          LABEL_NUSES (label) = 1;
+          set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
+        }
     }
-  gcc_assert (desired_alignment <= 8);
+  return destmem;
 }
 
 /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
-   ALIGN_BYTES is how many bytes need to be copied.  */
+   ALIGN_BYTES is how many bytes need to be copied.
+   The function updates DST and SRC, namely, it sets proper alignment.
+   DST is returned via return value, SRC is updated via pointer SRCP.  */
 static rtx
 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
                                  int desired_align, int align_bytes)
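The prologue is likewise now a single loop over piece sizes 1, 2, 4, ... below desired_alignment: for each size the static alignment does not already guarantee, it emits a runtime test and a conditional copy, after which the destination is known to be aligned to the next size up (hence the set_mem_align call). A rough model of the emitted control flow, assuming a power-of-two desired alignment:

    /* Advance *dst (and *src) until *dst is aligned to DESIRED bytes,
       copying at most DESIRED - 1 bytes; the if models the branch that
       ix86_expand_aligntest emits at runtime.  */
    static int
    align_dest (char **dst, const char **src, unsigned long desired)
    {
      int copied = 0;
      unsigned long i;
      for (i = 1; i < desired; i <<= 1)
        if ((unsigned long) *dst & i)
          {
            __builtin_memcpy (*dst, *src, i);
            *dst += i;
            *src += i;
            copied += i;
          }
      return copied;
    }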
@@ -22502,62 +22528,34 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
   rtx src = *srcp;
   rtx orig_dst = dst;
   rtx orig_src = src;
-  int off = 0;
+  int piece_size = 1;
+  int copied_bytes = 0;
   int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
   if (src_align_bytes >= 0)
     src_align_bytes = desired_align - src_align_bytes;
-  if (align_bytes & 1)
-    {
-      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
-      src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
-      off = 1;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
-    }
-  if (align_bytes & 2)
-    {
-      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
-      src = adjust_automodify_address_nv (src, HImode, srcreg, off);
-      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
-        set_mem_align (dst, 2 * BITS_PER_UNIT);
-      if (src_align_bytes >= 0
-          && (src_align_bytes & 1) == (align_bytes & 1)
-          && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
-        set_mem_align (src, 2 * BITS_PER_UNIT);
-      off = 2;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
-    }
-  if (align_bytes & 4)
+
+  for (piece_size = 1;
+       piece_size <= desired_align && copied_bytes < align_bytes;
+       piece_size <<= 1)
     {
-      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
-      src = adjust_automodify_address_nv (src, SImode, srcreg, off);
-      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
-        set_mem_align (dst, 4 * BITS_PER_UNIT);
-      if (src_align_bytes >= 0)
-        {
-          unsigned int src_align = 0;
-          if ((src_align_bytes & 3) == (align_bytes & 3))
-            src_align = 4;
-          else if ((src_align_bytes & 1) == (align_bytes & 1))
-            src_align = 2;
-          if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
-            set_mem_align (src, src_align * BITS_PER_UNIT);
-        }
-      off = 4;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
+      if (align_bytes & piece_size)
+        {
+          dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
+          copied_bytes += piece_size;
+        }
     }
-  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
-  src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
+
   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
     set_mem_align (dst, desired_align * BITS_PER_UNIT);
   if (src_align_bytes >= 0)
     {
-      unsigned int src_align = 0;
-      if ((src_align_bytes & 7) == (align_bytes & 7))
-        src_align = 8;
-      else if ((src_align_bytes & 3) == (align_bytes & 3))
-        src_align = 4;
-      else if ((src_align_bytes & 1) == (align_bytes & 1))
-        src_align = 2;
+      unsigned int src_align;
+      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
+        {
+          if ((src_align_bytes & (src_align - 1))
+              == (align_bytes & (src_align - 1)))
+            break;
+        }
       if (src_align > (unsigned int) desired_align)
         src_align = desired_align;
       if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
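The cascade of src_align special cases is now one downward search: find the largest power of two, capped at desired_align, modulo which the source and destination offsets agree; that is how much source alignment the prologue guarantees once the destination is aligned. A sketch of the search:

    /* Largest power of 2 (at most DESIRED) modulo which the two
       offsets are congruent; e.g. offsets 6 and 2 agree mod 4 but
       not mod 8.  */
    static unsigned
    common_alignment (unsigned src_off, unsigned dst_off, unsigned desired)
    {
      unsigned a;
      for (a = desired; a >= 2; a >>= 1)
        if ((src_off & (a - 1)) == (dst_off & (a - 1)))
          return a;
      return 1;
    }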
@@ -22790,42 +22788,24 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
 static int
 decide_alignment (int align,
                   enum stringop_alg alg,
-                  int expected_size)
+                  int expected_size,
+                  enum machine_mode move_mode)
 {
   int desired_align = 0;
-  switch (alg)
-    {
-    case no_stringop:
-      gcc_unreachable ();
-    case loop:
-    case unrolled_loop:
-      desired_align = GET_MODE_SIZE (Pmode);
-      break;
-    case rep_prefix_8_byte:
-      desired_align = 8;
-      break;
-    case rep_prefix_4_byte:
-      /* PentiumPro has special logic triggering for 8 byte aligned blocks.
-         copying whole cacheline at once.  */
-      if (TARGET_PENTIUMPRO)
-        desired_align = 8;
-      else
-        desired_align = 4;
-      break;
-    case rep_prefix_1_byte:
-      /* PentiumPro has special logic triggering for 8 byte aligned blocks.
-         copying whole cacheline at once.  */
-      if (TARGET_PENTIUMPRO)
-        desired_align = 8;
-      else
-        desired_align = 1;
-      break;
-    case loop_1_byte:
-      desired_align = 1;
-      break;
-    case libcall:
-      return 0;
-    }
+
+  gcc_assert (alg != no_stringop);
+
+  if (alg == libcall)
+    return 0;
+  if (move_mode == VOIDmode)
+    return 0;
+
+  desired_align = GET_MODE_SIZE (move_mode);
+
+  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
+     copying whole cacheline at once.  */
+  if (TARGET_PENTIUMPRO
+      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
+    desired_align = 8;
 
   if (optimize_size)
     desired_align = 1;
@@ -22833,17 +22813,8 @@ decide_alignment (int align,
     desired_align = align;
   if (expected_size != -1 && expected_size < 4)
     desired_align = align;
-  return desired_align;
-}
 
-/* Return the smallest power of 2 greater than VAL.  */
-static int
-smallest_pow2_greater_than (int val)
-{
-  int ret = 1;
-  while (ret <= val)
-    ret <<= 1;
-  return ret;
+  return desired_align;
 }
 
 /* Expand string move (memcpy) operation.  Use i386 string operations
@@ -22889,6 +22860,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   int dynamic_check;
   bool need_zero_guard = false;
   bool noalign;
+  enum machine_mode move_mode = VOIDmode;
+  int unroll_factor = 1;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22912,50 +22885,71 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
 
   alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
-  desired_align = decide_alignment (align, alg, expected_size);
-
-  if (!TARGET_ALIGN_STRINGOPS || noalign)
-    align = desired_align;
-
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
+
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = copy_addr_to_reg (XEXP (dst, 0));
   srcreg = copy_addr_to_reg (XEXP (src, 0));
 
+  unroll_factor = 1;
+  move_mode = word_mode;
   switch (alg)
     {
     case libcall:
     case no_stringop:
       gcc_unreachable ();
+    case loop_1_byte:
+      need_zero_guard = true;
+      move_mode = QImode;
+      break;
     case loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode);
       break;
     case unrolled_loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
+      unroll_factor = (TARGET_64BIT ? 4 : 2);
+      break;
+    case vector_loop:
+      need_zero_guard = true;
+      unroll_factor = 4;
+      /* Find the widest supported mode.  */
+      move_mode = word_mode;
+      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
+             != CODE_FOR_nothing)
+        move_mode = GET_MODE_WIDER_MODE (move_mode);
+
+      /* Find the corresponding vector mode with the same size as MOVE_MODE.
+         MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
+      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+        {
+          int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+          move_mode = mode_for_vector (word_mode, nunits);
+          if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
+            move_mode = word_mode;
+        }
+      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
       break;
     case rep_prefix_8_byte:
-      size_needed = 8;
+      move_mode = DImode;
       break;
     case rep_prefix_4_byte:
-      size_needed = 4;
+      move_mode = SImode;
      break;
     case rep_prefix_1_byte:
-      size_needed = 1;
-      break;
-    case loop_1_byte:
-      need_zero_guard = true;
-      size_needed = 1;
+      move_mode = QImode;
       break;
     }
+
+  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
   epilogue_size_needed = size_needed;
 
+  desired_align = decide_alignment (align, alg, expected_size, move_mode);
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
+    align = desired_align;
+
   /* Step 1: Prologue guard.  */
 
   /* Alignment code needs count to be in register.  */
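For vector_loop the chunk mode is found by walking GET_MODE_WIDER_MODE up from word_mode and then switching to the equally sized vector mode built from word_mode units, with word_mode as the fallback when no vector move pattern exists. A worked size calculation, assuming a 64-bit SSE2 target where the widest such move is 16 bytes:

    /* DImode -> TImode (16 bytes) -> V2DImode (2 x DImode units):
         size_needed = GET_MODE_SIZE (V2DImode) * unroll_factor
                     = 16 * 4 = 64
       so the main loop consumes 64 bytes per iteration and smaller
       residues fall to the prologue/epilogue code.  */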
@@ -22982,7 +22976,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
          Make sure it is power of 2.  */
-      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
 
       if (count)
         {
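With smallest_pow2_greater_than removed, its callers compute 1 << (floor_log2 (x) + 1) inline, the smallest power of two strictly greater than x. A quick self-check of the identity for positive x, using a portable stand-in for GCC's internal floor_log2:

    #include <assert.h>

    static int
    flog2 (unsigned x)              /* floor_log2 stand-in */
    {
      int l = -1;
      while (x)
        {
          x >>= 1;
          l++;
        }
      return l;
    }

    int
    main (void)
    {
      unsigned x;
      for (x = 1; x < 4096; x++)
        {
          unsigned p = 1u << (flog2 (x) + 1);
          assert (p > x && p / 2 <= x);     /* x = 7 -> 8, x = 8 -> 16 */
        }
      return 0;
    }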
@@ -23047,7 +23041,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
              the info early.  */
           src = change_address (src, BLKmode, srcreg);
           dst = change_address (dst, BLKmode, destreg);
-          expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
-                                  desired_align);
+          dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp,
+                                        align, desired_align);
         }
       else
@@ -23099,31 +23093,18 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
     case no_stringop:
       gcc_unreachable ();
     case loop_1_byte:
-      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
-                                     count_exp, QImode, 1, expected_size);
-      break;
     case loop:
-      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
-                                     count_exp, word_mode, 1, expected_size);
-      break;
     case unrolled_loop:
-      /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
-         registers for 4 temporaries anyway.  */
+    case vector_loop:
       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
-                                     count_exp, word_mode, TARGET_64BIT ? 4 : 2,
-                                     expected_size);
+                                     count_exp, move_mode, unroll_factor,
+                                     expected_size);
       break;
     case rep_prefix_8_byte:
-      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
-                                 DImode);
-      break;
     case rep_prefix_4_byte:
-      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
-                                 SImode);
-      break;
     case rep_prefix_1_byte:
       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
-                                 QImode);
+                                 move_mode);
       break;
     }
 
   /* Adjust properly the offset of src and dest memory for aliasing.  */
@@ -23164,7 +23145,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   if (count_exp != const0_rtx && epilogue_size_needed > 1)
     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
-                            epilogue_size_needed);
+                            size_needed);
 
   if (jump_around_label)
     emit_label (jump_around_label);
   return true;
@@ -23285,6 +23266,8 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
   int dynamic_check;
   bool need_zero_guard = false;
   bool noalign;
+  enum machine_mode move_mode = VOIDmode;
+  int unroll_factor;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -23305,17 +23288,16 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
      size of chunks to be copied by main loop.  */
 
   alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
-  desired_align = decide_alignment (align, alg, expected_size);
-
-  if (!TARGET_ALIGN_STRINGOPS || noalign)
-    align = desired_align;
-
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
+
   if (!count)
     count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
   destreg = copy_addr_to_reg (XEXP (dst, 0));
 
+  move_mode = word_mode;
+  unroll_factor = 1;
   switch (alg)
     {
     case libcall:
@@ -23323,28 +23305,33 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode);
       break;
+    case vector_loop:
     case unrolled_loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode) * 4;
+      unroll_factor = 4;
       break;
     case rep_prefix_8_byte:
-      size_needed = 8;
+      move_mode = DImode;
       break;
     case rep_prefix_4_byte:
-      size_needed = 4;
+      move_mode = SImode;
       break;
     case rep_prefix_1_byte:
-      size_needed = 1;
+      move_mode = QImode;
       break;
     case loop_1_byte:
       need_zero_guard = true;
-      size_needed = 1;
+      move_mode = QImode;
       break;
     }
+  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
   epilogue_size_needed = size_needed;
 
+  desired_align = decide_alignment (align, alg, expected_size, move_mode);
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
+    align = desired_align;
+
   /* Step 1: Prologue guard.  */
 
   /* Alignment code needs count to be in register.  */
@@ -23380,7 +23367,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
          Make sure it is power of 2.  */
-      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
 
   /* To improve performance of small blocks, we jump around the VAL
      promoting mode.  This mean that if the promoted VAL is not constant,
@@ -23494,16 +23481,12 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
     case no_stringop:
       gcc_unreachable ();
     case loop_1_byte:
-      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
-                                     count_exp, QImode, 1, expected_size);
-      break;
     case loop:
-      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
-                                     count_exp, word_mode, 1, expected_size);
-      break;
+    case vector_loop:
     case unrolled_loop:
       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
-                                     count_exp, word_mode, 4, expected_size);
+                                     count_exp, move_mode, unroll_factor,
+                                     expected_size);
       break;
     case rep_prefix_8_byte:
       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
gcc/config/i386/i386.opt:
@@ -345,6 +345,9 @@ Enum(stringop_alg) String(loop) Value(loop)
 EnumValue
 Enum(stringop_alg) String(unrolled_loop) Value(unrolled_loop)
 
+EnumValue
+Enum(stringop_alg) String(vector_loop) Value(vector_loop)
+
 mtls-dialect=
 Target RejectNegative Joined Var(ix86_tls_dialect) Enum(tls_dialect) Init(TLS_DIALECT_GNU)
 Use given thread-local storage dialect
gcc/testsuite/ChangeLog:
@@ -1,3 +1,8 @@
+2013-07-08  Michael Zolotukhin  <michael.v.zolotukhin@gmail.com>
+
+	* gcc.target/i386/memcpy-vector_loop-1.c: New.
+	* gcc.target/i386/memcpy-vector_loop-2.c: New.
+
 2013-07-06  Uros Bizjak  <ubizjak@gmail.com>
 
 	PR target/57807
gcc/testsuite/gcc.target/i386/memcpy-vector_loop-1.c (new file):

/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */
/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */

char a[2048];
char b[2048];

void t (void)
{
  __builtin_memcpy (a, b, 2048);
}
gcc/testsuite/gcc.target/i386/memcpy-vector_loop-2.c (new file):

/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -minline-all-stringops -mstringop-strategy=vector_loop" } */
/* { dg-final { scan-assembler-times "movdqa" 4 } } */

char *a;
char *b;

void t (void)
{
  __builtin_memcpy (a, b, 2048);
}