Commit e2e52e1b by Jan Hubicka

i386.md (memstr): Do not use rep stosb for counts divisible by 4 when optimize_size.

	* i386.md (memstr): Do not use rep stosb for counts divisible by 4
	when optimize_size.
	(clrstrsi): Rewrite.
	(strsethi, strsetqi): New expanders.
	(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
	(cmpstrsi): Emit compare insn before cmpstrsi_1.
	(cmpstrsi_nz): Use flags, set type to str, prefix_length to 1.
	(strlensi_1): Likewise.
	(cmpstrsi_1): Likewise; do not output compare.
	(strlen expander): Do not unroll when optimizing for size.
	(*subsi3_carry): Rename to subsi3_carry.
	(addqi3_cc): New pattern.
	* i386.h (processor_costs): Add move_ratio field.
	(MOVE_RATIO): Use move_ratio field, set to 3 for OPTIMIZE_SIZE.
	* i386.c (*_cost): Set move_ratio.
	(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
	(x86_expand_strlensi_1): Rewrite the main loop.

From-SVN: r31488
parent b9f243c2
ChangeLog:

Tue Jan 18 16:19:55 MET 2000  Jan Hubicka  <hubicka@freesoft.cz>

	* i386.md (memstr): Do not use rep stosb for counts divisible by 4
	when optimize_size.
	(clrstrsi): Rewrite.
	(strsethi, strsetqi): New expanders.
	(strsethi_1, strsetqi_1, rep_stossi, rep_stosqi): New insn patterns.
	(cmpstrsi): Emit compare insn before cmpstrsi_1.
	(cmpstrsi_nz): Use flags, set type to str, prefix_length to 1.
	(strlensi_1): Likewise.
	(cmpstrsi_1): Likewise; do not output compare.
	(strlen expander): Do not unroll when optimizing for size.
	(*subsi3_carry): Rename to subsi3_carry.
	(addqi3_cc): New pattern.
	* i386.h (processor_costs): Add move_ratio field.
	(MOVE_RATIO): Use move_ratio field, set to 3 for OPTIMIZE_SIZE.
	* i386.c (*_cost): Set move_ratio.
	(x86_unroll_strlen): Enable for Athlon, PPro and K6 too.
	(x86_expand_strlensi_1): Rewrite the main loop.

2000-01-17  Richard Henderson  <rth@cygnus.com>

	* combine.c (combine_simplify_rtx): Give FLOAT_STORE_FLAG_VALUE a mode.
i386.c:

@@ -64,6 +64,7 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   1,				/* cost of multiply per each bit set */
   23,				/* cost of a divide/mod */
   15,				/* "large" insn */
+  3,				/* MOVE_RATIO */
   4,				/* cost for loading QImode using movzbl */
   {2, 4, 2},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -84,6 +85,7 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   1,				/* cost of multiply per each bit set */
   40,				/* cost of a divide/mod */
   15,				/* "large" insn */
+  3,				/* MOVE_RATIO */
   4,				/* cost for loading QImode using movzbl */
   {2, 4, 2},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -104,6 +106,7 @@ struct processor_costs pentium_cost = {
   0,				/* cost of multiply per each bit set */
   25,				/* cost of a divide/mod */
   8,				/* "large" insn */
+  6,				/* MOVE_RATIO */
   6,				/* cost for loading QImode using movzbl */
   {2, 4, 2},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -124,6 +127,7 @@ struct processor_costs pentiumpro_cost = {
   0,				/* cost of multiply per each bit set */
   17,				/* cost of a divide/mod */
   8,				/* "large" insn */
+  6,				/* MOVE_RATIO */
   2,				/* cost for loading QImode using movzbl */
   {4, 4, 4},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -144,6 +148,7 @@ struct processor_costs k6_cost = {
   0,				/* cost of multiply per each bit set */
   18,				/* cost of a divide/mod */
   8,				/* "large" insn */
+  4,				/* MOVE_RATIO */
   3,				/* cost for loading QImode using movzbl */
   {4, 5, 4},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -164,6 +169,7 @@ struct processor_costs athlon_cost = {
   0,				/* cost of multiply per each bit set */
   19,				/* cost of a divide/mod */
   8,				/* "large" insn */
+  9,				/* MOVE_RATIO */
   4,				/* cost for loading QImode using movzbl */
   {4, 5, 4},			/* cost of loading integer registers
 				   in QImode, HImode and SImode.
@@ -191,7 +197,7 @@ const int x86_zero_extend_with_and = m_486 | m_PENT;
 const int x86_movx = m_ATHLON /* m_386 | m_PPRO | m_K6 */;
 const int x86_double_with_add = ~m_386;
 const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT;
+const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6;
 const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
 const int x86_use_any_reg = m_486;
 const int x86_cmove = m_PPRO | m_ATHLON;
@@ -5149,10 +5155,9 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
   rtx align_3_label = NULL_RTX;
   rtx align_4_label = gen_label_rtx ();
   rtx end_0_label = gen_label_rtx ();
-  rtx end_2_label = gen_label_rtx ();
-  rtx end_3_label = gen_label_rtx ();
   rtx mem;
   rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+  rtx tmpreg = gen_reg_rtx (SImode);
 
   align = 0;
   if (GET_CODE (align_rtx) == CONST_INT)
@@ -5269,48 +5274,69 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
       mem = gen_rtx_MEM (SImode, out);
       emit_move_insn (scratch, mem);
+      emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
 
-      /* Check first byte.  */
-      emit_insn (gen_cmpqi_0 (gen_lowpart (QImode, scratch), const0_rtx));
-      tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
-      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
-				  gen_rtx_LABEL_REF (VOIDmode, end_0_label),
-				  pc_rtx);
-      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+      /* This formula yields a nonzero result iff one of the bytes is zero.
+	 This saves three branches inside loop and many cycles.  */
+      emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
+      emit_insn (gen_one_cmplsi2 (scratch, scratch));
+      emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
+      emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
+      emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0, align_4_label);
+
+      if (TARGET_CMOVE)
+	{
+	  rtx reg = gen_reg_rtx (SImode);
+	  emit_move_insn (reg, tmpreg);
+	  emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
+
+	  /* If zero is not in the first two bytes, move two bytes forward.  */
+	  emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
+	  tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+	  tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+	  emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
+				  gen_rtx_IF_THEN_ELSE (SImode, tmp,
+							reg,
+							tmpreg)));
+	  /* Emit lea manually to avoid clobbering of flags.  */
+	  emit_insn (gen_rtx_SET (SImode, reg,
+				  gen_rtx_PLUS (SImode, out, GEN_INT (2))));
+	  tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+	  tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+	  emit_insn (gen_rtx_SET (VOIDmode, out,
+				  gen_rtx_IF_THEN_ELSE (SImode, tmp,
+							reg,
+							out)));
+	}
+      else
+	{
+	  rtx end_2_label = gen_label_rtx ();
+	  /* Is zero in the first two bytes?  */
 
-      /* Check second byte.  */
-      emit_insn (gen_cmpqi_ext_3 (scratch, const0_rtx));
-      tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
-      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
-				  gen_rtx_LABEL_REF (VOIDmode, end_3_label),
-				  pc_rtx);
-      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
-      /* Check third byte.  */
-      emit_insn (gen_testsi_1 (scratch, GEN_INT (0x00ff0000)));
-      tmp = gen_rtx_EQ (VOIDmode, flags, const0_rtx);
-      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
-				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
-				  pc_rtx);
-      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
-      /* Check fourth byte and increment address.  */
-      emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
-      emit_insn (gen_testsi_1 (scratch, GEN_INT (0xff000000)));
-      tmp = gen_rtx_NE (VOIDmode, flags, const0_rtx);
-      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
-				  gen_rtx_LABEL_REF (VOIDmode, align_4_label),
-				  pc_rtx);
-      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
-
-      /* Now generate fixups when the compare stops within a 4-byte word.  */
-      emit_insn (gen_subsi3 (out, out, GEN_INT (3)));
+	  emit_insn (gen_testsi_1 (tmpreg, GEN_INT (0x8080)));
+	  tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+	  tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
+	  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				      gen_rtx_LABEL_REF (VOIDmode, end_2_label),
+				      pc_rtx);
+	  tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+	  JUMP_LABEL (tmp) = end_2_label;
 
-      emit_label (end_2_label);
-      emit_insn (gen_addsi3 (out, out, const1_rtx));
+	  /* Not in the first two.  Move two bytes forward.  */
+	  emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
+	  emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
 
-      emit_label (end_3_label);
-      emit_insn (gen_addsi3 (out, out, const1_rtx));
+	  emit_label (end_2_label);
+	}
+
+      /* Avoid branch in fixing the byte.  */
+      tmpreg = gen_lowpart (QImode, tmpreg);
+      emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
+      emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
 
       emit_label (end_0_label);
     }
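Note on the rewritten loop: the four per-byte compares are replaced by the classic zero-byte test. For a 32-bit word x, (x - 0x01010101) & ~x & 0x80808080 is nonzero exactly when some byte of x is zero; that is what the addsi3/one_cmplsi2/andsi3 sequence computes into tmpreg. A minimal standalone C sketch of the trick (illustration only, not GCC code):

#include <stdint.h>
#include <stdio.h>

/* Nonzero iff some byte of x is zero: subtracting 1 from every byte
   borrows through a zero byte and raises its top bit, while ~x clears
   the top bit of any byte that was already 0x80 or greater.  */
static int
has_zero_byte (uint32_t x)
{
  return ((x - 0x01010101u) & ~x & 0x80808080u) != 0;
}

int
main (void)
{
  printf ("%d\n", has_zero_byte (0x41424344u)); /* "ABCD": prints 0 */
  printf ("%d\n", has_zero_byte (0x41420044u)); /* zero byte: prints 1 */
  return 0;
}

The surviving 0x80 bits also record where the zero is (the lowest set bit is reliable), which is what the TARGET_CMOVE selection and the branchless fixup above exploit.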
i386.h:

@@ -62,6 +62,8 @@ struct processor_costs {
   int mult_bit;			/* cost of multiply per each bit set */
   int divide;			/* cost of a divide/mod */
   int large_insn;		/* insns larger than this cost more */
+  int move_ratio;		/* The threshold of number of scalar
+				   memory-to-memory move insns.  */
   int movzbl_load;		/* cost of loading using movzbl */
   int int_load[3];		/* cost of loading integer registers
 				   in QImode, HImode and SImode relative
@@ -1709,13 +1711,9 @@ while (0)
    Increasing the value will always make code faster, but eventually
    incurs high cost in increased code size.
 
-   If you don't define this, a reasonable default is used.
-
-   Make this large on i386, since the block move is very inefficient with small
-   blocks, and the hard register needs of the block move require much reload
-   work.  */
-
-#define MOVE_RATIO 5
+   If you don't define this, a reasonable default is used.  */
+
+#define MOVE_RATIO (optimize_size ? 3 : ix86_cost->move_ratio)
 
 /* Define if shifts truncate the shift count
    which implies one can omit a sign-extension or zero-extension
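MOVE_RATIO is the threshold, in scalar move instructions, up to which GCC expands a constant-size block move inline rather than emitting a string operation or a library call. A hedged sketch of the policy the new definition encodes (the helper name and rounding are illustrative, not the actual middle-end code):

/* Roughly: a block of `bytes` bytes is moved piecewise only when it
   needs no more than MOVE_RATIO word-sized moves.  */
static int
move_by_pieces_ok (int bytes, int optimize_size, int cpu_move_ratio)
{
  int move_ratio = optimize_size ? 3 : cpu_move_ratio;
  return (bytes + 3) / 4 <= move_ratio;
}

At -Os the ratio drops to 3; otherwise the per-CPU values above apply (3 for the 386 and 486, 6 for Pentium and PentiumPro, 4 for K6, 9 for Athlon) instead of the old hard-coded 5.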
i386.md:

@@ -3235,6 +3235,15 @@
   "add{l}\\t{%2, %0|%0, %2}"
   [(set_attr "type" "alu")])
 
+(define_insn "addqi3_cc"
+  [(set (reg:CC 17) (plus:CC (match_operand:QI 1 "nonimmediate_operand" "%0,0")
+			     (match_operand:QI 2 "general_operand" "ri,rm")))
+   (set (match_operand:QI 0 "nonimmediate_operand" "=rm,r")
+	(plus:QI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (PLUS, QImode, operands)"
+  "add{b}\\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")])
+
 (define_insn "*addsi3_carry"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
 	(plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0")
@@ -3736,7 +3745,7 @@
   "sub{l}\\t{%2, %0|%0, %2}"
   [(set_attr "type" "alu")])
 
-(define_insn "*subsi3_carry"
+(define_insn "subsi3_carry"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
 	(minus:SI (match_operand:SI 1 "nonimmediate_operand" "0,0")
 	  (plus:SI (match_operand:SI 2 "general_operand" "ri,rm")
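addqi3_cc exposes a byte add that sets the flags, and the now-named subsi3_carry consumes them as a subtract-with-borrow; together they implement the "avoid branch in fixing the byte" sequence in the strlen expansion above. At that point bit 7 of tmpreg's low byte is set iff the first byte of the remaining pair is the zero byte, and out points just past the pair. A C sketch of the arithmetic under those assumptions (illustration only):

/* addb %al,%al copies bit 7 of the mask into the carry flag;
   sbb then subtracts 3 plus that carry from the end pointer.  */
static const char *
strlen_fixup (const char *out, unsigned char mask)
{
  unsigned int carry = mask >> 7;   /* CF after adding mask to itself */
  return out - 3 - carry;           /* out -= 3 + CF */
}

This replaces the old end_2/end_3 label chain and its conditional increments with straight-line code.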
@@ -7841,8 +7850,9 @@
   srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
 
   emit_insn (gen_cld());
-  /* When optimizing for size emit simple rep ; movsb instruction.  */
-  if (!optimize || optimize_size)
+  /* When optimizing for size emit simple rep ; movsb instruction for
+     counts not divisible by 4.  */
+  if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
     {
       countreg = copy_to_mode_reg (SImode, operands[2]);
       emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
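With this change, -Os uses the short `rep movsb` form only when the constant count has a remainder modulo 4; counts divisible by 4 take the dword path, which is no larger and performs a quarter of the iterations. A sketch of the guard (the condition mirrors the expander; the helper itself is hypothetical):

/* rep movsb only pays off at -Os for counts with a nonzero remainder.  */
static int
use_rep_movsb (int optimize, int optimize_size, long count)
{
  return (!optimize || optimize_size) && (count & 0x03) != 0;
}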
@@ -7983,84 +7993,143 @@
   (set_attr "memory" "both")])
 
 (define_expand "clrstrsi"
-  [(set (reg:SI 19) (const_int 0))
-   (set (match_dup 3) (const_int 0))
-   (parallel [(set (match_operand:BLK 0 "memory_operand" "")
-		   (const_int 0))
-	      (use (match_operand:SI 1 "const_int_operand" ""))
-	      (use (match_operand:SI 2 "const_int_operand" ""))
-	      (use (match_dup 3))
-	      (use (reg:SI 19))
-	      (clobber (match_scratch:SI 4 ""))
-	      (clobber (match_dup 5))])]
+  [(use (match_operand:BLK 0 "memory_operand" ""))
+   (use (match_operand:SI 1 "const_int_operand" ""))
+   (use (match_operand:SI 2 "const_int_operand" ""))]
   ""
   "
 {
-  rtx addr0;
+  rtx destreg, zeroreg, countreg;
 
   if (GET_CODE (operands[1]) != CONST_INT)
     FAIL;
 
-  addr0 = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
+  destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
 
-  operands[3] = gen_reg_rtx (SImode);
-  operands[5] = addr0;
+  emit_insn (gen_cld());
 
-  operands[0] = gen_rtx_MEM (BLKmode, addr0);
+  /* When optimizing for size emit simple rep ; movsb instruction for
+     counts not divisible by 4.  */
+  if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03))
+    {
+      countreg = copy_to_mode_reg (SImode, operands[1]);
+      zeroreg = copy_to_mode_reg (QImode, const0_rtx);
+      emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
+				 destreg, countreg));
+    }
+  else
+    {
+      zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+      if (INTVAL (operands[1]) & ~0x03)
+	{
+	  countreg = copy_to_mode_reg (SImode,
+				       GEN_INT ((INTVAL (operands[1]) >> 2)
+						& 0x3fffffff));
+	  emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
+				     destreg, countreg));
+	}
+      if (INTVAL (operands[1]) & 0x02)
+	emit_insn (gen_strsethi (destreg,
+				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
+      if (INTVAL (operands[1]) & 0x01)
+	emit_insn (gen_strsetqi (destreg,
+				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
+    }
+  DONE;
 }")
-;; It might seem that operand 0 could use predicate register_operand.
-;; But strength reduction might offset the MEM expression.  So we let
-;; reload put the address into %edi.
+;; Most CPUs don't like single string operations
+;; Handle this case here to simplify previous expander.
 
-(define_insn "*clrstrsi_1"
-  [(set (mem:BLK (match_operand:SI 0 "address_operand" "D"))
-	(const_int 0))
-   (use (match_operand:SI 1 "const_int_operand" "n"))
-   (use (match_operand:SI 2 "immediate_operand" "i"))
-   (use (match_operand:SI 3 "register_operand" "a"))
-   (use (reg:SI 19))
-   (clobber (match_scratch:SI 4 "=&c"))
-   (clobber (match_dup 0))]
-  ""
-  "*
-{
-  rtx xops[2];
-
-  if (GET_CODE (operands[1]) == CONST_INT)
-    {
-      unsigned int count = INTVAL (operands[1]) & 0xffffffff;
-      if (count & ~0x03)
-	{
-	  xops[0] = GEN_INT (count / 4);
-	  xops[1] = operands[4];
-
-	  /* K6: stos takes 1 cycle, rep stos takes 8 + %ecx cycles.
-	     80386: 4/5+5n (+2 for set of ecx)
-	     80486: 5/7+5n (+1 for set of ecx)
-	     */
-	  if (count / 4 < ((int) ix86_cpu < (int)PROCESSOR_PENTIUM ? 4 : 6))
-	    {
-	      do
-		output_asm_insn (\"{stosl|stosd}\", xops);
-	      while ((count -= 4) > 3);
-	    }
-	  else
-	    {
-	      output_asm_insn (\"mov{l}\\t{%0, %1|%1, %0}\", xops);
-	      output_asm_insn (\"{rep\;stosl|rep stosd}\", xops);
-	    }
-	}
-      if (INTVAL (operands[1]) & 0x02)
-	output_asm_insn (\"stosw\", operands);
-      if (INTVAL (operands[1]) & 0x01)
-	output_asm_insn (\"stosb\", operands);
-    }
-  else
-    abort ();
-  RET;
-}"
-  [(set_attr "type" "multi")])
+(define_expand "strsethi"
+  [(set (mem:HI (match_operand:SI 0 "register_operand" ""))
+	(match_operand:HI 1 "register_operand" ""))
+   (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 2)))
+	      (clobber (reg:CC 17))])]
+  ""
+  "
+{
+  if (TARGET_SINGLE_STRINGOP || optimize_size)
+    {
+      emit_insn (gen_strsethi_1 (operands[0], operands[0], operands[1]));
+      DONE;
+    }
+}")
+
+(define_expand "strsetqi"
+  [(set (mem:QI (match_operand:SI 0 "register_operand" ""))
+	(match_operand:QI 1 "register_operand" ""))
+   (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
+	      (clobber (reg:CC 17))])]
+  ""
+  "
+{
+  if (TARGET_SINGLE_STRINGOP || optimize_size)
+    {
+      emit_insn (gen_strsetqi_1 (operands[0], operands[0], operands[1]));
+      DONE;
+    }
+}")
+
+(define_insn "strsethi_1"
+  [(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
+	(match_operand:HI 2 "register_operand" "a"))
+   (set (match_operand:SI 0 "register_operand" "=D")
+	(plus:SI (match_dup 0)
+		 (const_int 2)))
+   (use (reg:SI 19))]
+  "TARGET_SINGLE_STRINGOP || optimize_size"
+  "stosw"
+  [(set_attr "type" "str")
+   (set_attr "memory" "store")
+   (set_attr "length_prefix" "1")])
(define_insn "strsetqi_1"
[(set (mem:QI (match_operand:SI 1 "register_operand" "0"))
(match_operand:QI 2 "register_operand" "a"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_dup 0)
(const_int 1)))
(use (reg:SI 19))]
"TARGET_SINGLE_STRINGOP || optimize_size"
"stosb"
[(set_attr "type" "str")
(set_attr "memory" "store")])
;; It might seem that operand 0 could use predicate register_operand.
;; But strength reduction might offset the MEM expression. So we let
;; reload put the address into %edi.
(define_insn "rep_stossi"
[(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
(use (match_operand:SI 2 "register_operand" "a"))
(use (match_operand:SI 4 "register_operand" "1"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_operand:SI 3 "address_operand" "0")
(ashift:SI (match_dup 3) (const_int 2))))
(set (mem:BLK (match_dup 3))
(const_int 0))
(use (reg:SI 19))]
""
"rep\;stosl|rep stosd"
[(set_attr "type" "str")
(set_attr "length_prefix" "1")
(set_attr "memory" "store")])
(define_insn "rep_stosqi"
[(set (match_operand:SI 1 "register_operand" "=c") (const_int 0))
(use (match_operand:QI 2 "register_operand" "a"))
(use (match_operand:SI 4 "register_operand" "1"))
(set (match_operand:SI 0 "register_operand" "=D")
(plus:SI (match_operand:SI 3 "address_operand" "0") (match_dup 3)))
(set (mem:BLK (match_dup 3))
(const_int 0))
(use (reg:SI 19))]
""
"rep\;stosb|rep stosb"
[(set_attr "type" "str")
(set_attr "length_prefix" "1")
(set_attr "memory" "store")])
(define_expand "cmpstrsi" (define_expand "cmpstrsi"
[(set (match_operand:SI 0 "register_operand" "") [(set (match_operand:SI 0 "register_operand" "")
...@@ -8099,7 +8168,10 @@ ...@@ -8099,7 +8168,10 @@
emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align)); emit_insn (gen_cmpstrsi_nz_1 (addr1, addr2, countreg, align));
} }
else else
{
emit_insn (gen_cmpsi_1 (countreg, countreg));
emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align)); emit_insn (gen_cmpstrsi_1 (addr1, addr2, countreg, align));
}
outlow = gen_lowpart (QImode, out); outlow = gen_lowpart (QImode, out);
emit_insn (gen_cmpintqi (outlow)); emit_insn (gen_cmpintqi (outlow));
...@@ -8145,8 +8217,8 @@ ...@@ -8145,8 +8217,8 @@
(clobber (match_dup 2))] (clobber (match_dup 2))]
"" ""
"repz{\;| }cmpsb" "repz{\;| }cmpsb"
[(set_attr "type" "multi") [(set_attr "type" "str")
(set_attr "length" "3")]) (set_attr "length_prefix" "1")])
;; The same, but the count is not known to not be zero. ;; The same, but the count is not known to not be zero.
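Why cmpstrsi now emits a compare first: `repz cmpsb` tests %ecx before every iteration, so with a zero count no cmpsb runs and the flag consumer would see stale values. The old cmpstrsi_1 template printed its own `cmpl %ecx, %ecx`; moving that compare into separate RTL leaves the pattern a single prefixed string instruction (type "str", one prefix byte). A C model of the count-may-be-zero semantics (sketch only):

/* zf_on_entry models the flags set by the cmp emitted beforehand;
   it is what the consumer sees when count == 0.  */
static int
repz_cmpsb (const unsigned char *s1, const unsigned char *s2,
            unsigned long count, int zf_on_entry)
{
  int zf = zf_on_entry;
  while (count--)
    {
      zf = (*s1++ == *s2++);
      if (!zf)
        break;
    }
  return zf;
}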
@@ -8158,15 +8230,15 @@
 		     (mem:BLK (match_operand:SI 1 "address_operand" "D")))
 	    (const_int 0)))
    (use (match_operand:SI 3 "immediate_operand" "i"))
+   (use (reg:CC 17))
    (use (reg:SI 19))
    (clobber (match_dup 0))
    (clobber (match_dup 1))
    (clobber (match_dup 2))]
   ""
-  ;; The initial compare sets the zero flag.
-  "cmp{l}\\t%2, %2\;repz{\;| }cmpsb"
-  [(set_attr "type" "multi")
-   (set_attr "length" "5")])
+  "repz{\;| }cmpsb"
+  [(set_attr "type" "str")
+   (set_attr "length_prefix" "1")])
 
 (define_expand "strlensi"
   [(set (match_operand:SI 0 "register_operand" "")
@@ -8184,7 +8256,8 @@
   align = operands[3];
   scratch1 = gen_reg_rtx (SImode);
 
-  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1)
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !optimize_size)
     {
       /* Well it seems that some optimizer does not combine a call like
 	 foo(strlen(bar), strlen(bar));
@@ -8236,8 +8309,8 @@
    (clobber (reg:CC 17))]
   ""
   "repnz{\;| }scasb"
-  [(set_attr "type" "multi")
-   (set_attr "length" "3")])
+  [(set_attr "type" "str")
+   (set_attr "length_prefix" "1")])
 
 ;; Conditional move instructions.