Commit b368d6b8 by J"orn Rennecke Committed by Joern Rennecke

divtab-sh4.c, [...]: New files.

2006-03-23  J"orn Rennecke <joern.rennecke@st.com>

	* config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files.
	* config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant.
	* config/sh/t-sh (LIB1ASMFUNCS): Add _div_table.
	* config/sh/sh.opt (mdiv=): Amend description.
	* config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro.
	(TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise.
	(sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1,
	SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC.
	(OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1.
	Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros.
	* config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns.
	(udivsi3, divsi3): Use them.  Check TARGET_DIVIDE_CALL_TABLE /
	TARGET_DIVIDE_CALL_FP.

From-SVN: r112331
parent a57aee2a
2006-03-23 J"orn Rennecke <joern.rennecke@st.com>
* config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files.
* config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant.
* config/sh/t-sh (LIB1ASMFUNCS): Add _div_table.
* config/sh/sh.opt (mdiv=): Amend description.
* config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro.
(TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise.
(sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1,
SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC.
(OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1.
Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros.
* config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns.
(udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE /
TARGET_DIVIDE_CALL_FP.
2006-03-23 Maxim Kuvyrkov <mkuvyrkov@ispras.ru>
* haifa-sched.c (choose_ready): Fix type of the local variable.
......
Analysis of cycle costs for SH4:
-> udiv_le128: 5
-> udiv_ge64k: 6
-> udiv udiv_25: 10
-> pos_divisor: 3
-> pos_result linear: 5
-> pos_result - -: 5
-> div_le128: 7
-> div_ge64k: 9
sdivsi3 -> udiv_25 13
udiv25 -> div_ge64k_end: 15
div_ge64k_end -> rts: 13
div_le128 -> div_le128_2: 2, r1 latency 3
udiv_le128 -> div_le128_2: 2, r1 latency 3
(u)div_le128 -> div_by_1: 9
(u)div_le128 -> rts: 17
div_by_1(_neg) -> rts: 4
div_ge64k -> div_r8: 2
div_ge64k -> div_ge64k_2: 3
udiv_ge64k -> udiv_r8: 3
udiv_ge64k -> div_ge64k_2: 3 + LS
(u)div_ge64k -> div_ge64k_end: 13
div_r8 -> div_r8_2: 2
udiv_r8 -> div_r8_2: 2 + LS
(u)div_r8 -> rts: 21
-> - + neg_result: 5
-> + - neg_result: 5
-> div_le128_neg: 7
-> div_ge64k_neg: 9
-> div_r8_neg: 11
-> <64k div_ge64k_neg_end: 28
-> >=64k div_ge64k_neg_end: 22
div_ge64k_neg_end ft -> rts: 14
div_r8_neg_end -> rts: 4
div_r8_neg -> div_r8_neg_end: 18
div_le128_neg -> div_by_1_neg: 4
div_le128_neg -> rts 18
absolute divisor range:
1 [2..128] [129..64K) [64K..|dividend|/256] >=64K,>|dividend/256|
udiv 18 22 38 32 30
sdiv pos: 20 24 41 35 32
sdiv neg: 15 25 42 36 33
fp-based:
unsigned: 42 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
signed: 33 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
call-div1: divisor range:
[1..64K) >= 64K
unsigned: 63 58
signed: 76 76
SFUNC_STATIC call overhead:
mov.l 0f,r1
bsrf r1
SFUNC_GOT call overhead - current:
mov.l 0f,r1
mova 0f,r0
mov.l 1f,r2
add r1,r0
mov.l @(r0,r2),r0
jmp @r0
; 3 cycles worse than SFUNC_STATIC
SFUNC_GOT call overhead - improved assembler:
mov.l 0f,r1
mova 0f,r0
mov.l @(r0,r1),r0
jmp @r0
; 2 cycles worse than SFUNC_STATIC
/* Copyright (C) 2004 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.
In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file. (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* Calculate division table for SH2..4 integer division
Contributed by Joern Rennecke
joern.rennecke@superh.com */
#include <stdio.h>
#include <math.h>
/* Write to stdout the assembler source for the SH integer division
   lookup tables:

   - LOCAL(div_table_clz): a dummy byte for a divisor of 0, followed by
     one byte per divisor 1..128.  The label LOCAL(div_table_ix) is
     emitted immediately before the byte for divisor 128.
   - A second run of one byte per divisor 1..128, derived from the
     divisor doubled until it is at least 128.
   - LOCAL(zero_l) / LOCAL(div_table): 32-bit entries for 1/64 .. 1/127;
     LOCAL(div_table) labels the entry for 1/96.

   A trailing comment reports the maximum rounding error of the 32-bit
   entries.  Always returns 0.  */
int
main (void)
{
  int i, j;
  double q, r, err, max_err = 0, max_s_err = 0;

  puts ("/* This table has been generated by divtab-sh4.c. */");
  puts ("\t.balign 4");
  puts ("LOCAL(div_table_clz):");
  /* Output some dummy number for 1/0.  */
  printf ("\t.byte\t%d\n", 0);

  for (i = 1; i <= 128; i++)
    {
      /* Number of terms of the sequence i, 2*i, 4*i, ... that do not
	 exceed 128.  */
      int n = 0;

      if (i == 128)
	puts ("/* Lookup table translating positive divisor to index into table of\n"
	      "normalized inverse. N.B. the '0' entry is also the last entry of the\n"
	      "previous table, and causes an unaligned access for division by zero. */\n"
	      "LOCAL(div_table_ix):");
      for (j = i; j <= 128; j += j)
	n++;
      printf ("\t.byte\t%d\n", n - 7);
    }

  for (i = 1; i <= 128; i++)
    {
      /* i is always positive here, so no absolute value is needed.
	 Normalize the divisor into the range [128, 255] by doubling.  */
      j = i;
      while (j < 128)
	j += j;
      printf ("\t.byte\t%d\n", j * 2 - 96 * 4);
    }

  puts ("/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */\n"
	".balign 4\n"
	"LOCAL(zero_l):");

  for (i = 64; i < 128; i++)
    {
      if (i == 96)
	puts ("LOCAL(div_table):");
      q = 4. * (1 << 30) * 128 / i;
      r = ceil (q);
      /* The value for 64 is actually differently scaled than it would
	 appear from this calculation.  The implicit part is %01, not 10.
	 Still, since the value in the table is 0 either way, this
	 doesn't matter here.  Still, the 1/64 entry is effectively a 1/128
	 entry.  */
      /* r lies in (2^32, 2^33], which does not fit a 32-bit unsigned;
	 converting it directly is undefined behavior.  Only the low 32
	 bits are wanted (bit 32 is the implicit leading 1), so reduce
	 modulo 2^32 first.  r is an exact integer, so fmod is exact.  */
      printf ("\t.long\t0x%X\n", (unsigned) fmod (r, 4294967296.0));
      err = r - q;
      if (err > max_err)
	max_err = err;
      err = err * i / 128;
      if (err > max_s_err)
	max_s_err = err;
    }

  printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err);
  /* Return rather than call exit, which would need <stdlib.h>.  */
  return 0;
}
......@@ -234,6 +234,9 @@ do { \
/* Predicates on sh_div_strategy: each one is nonzero when the
   correspondingly named division strategy is the one selected.  */
#define TARGET_DIVIDE_INV20L (sh_div_strategy == SH_DIV_INV20L)
#define TARGET_DIVIDE_INV_CALL (sh_div_strategy == SH_DIV_INV_CALL)
#define TARGET_DIVIDE_INV_CALL2 (sh_div_strategy == SH_DIV_INV_CALL2)
#define TARGET_DIVIDE_CALL_DIV1 (sh_div_strategy == SH_DIV_CALL_DIV1)
#define TARGET_DIVIDE_CALL_FP (sh_div_strategy == SH_DIV_CALL_FP)
#define TARGET_DIVIDE_CALL_TABLE (sh_div_strategy == SH_DIV_CALL_TABLE)
#define SELECT_SH1 (MASK_SH1)
#define SELECT_SH2 (MASK_SH2 | SELECT_SH1)
......@@ -467,7 +470,7 @@ do { \
sh_div_str = SH_DIV_STR_FOR_SIZE ; \
} \
/* We can't meaningfully test TARGET_SHMEDIA here, because -m options \
haven't been parsed yet, hence we';d read only the default. \
haven't been parsed yet, hence we'd read only the default. \
sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so \
it's OK to always set flag_branch_target_load_optimize. */ \
if (LEVEL > 1) \
......@@ -492,16 +495,24 @@ do { \
extern int assembler_dialect;
enum sh_divide_strategy_e {
/* SH5 strategies. */
SH_DIV_CALL,
SH_DIV_CALL2,
SH_DIV_FP,
SH_DIV_FP, /* We could do this also for SH4. */
SH_DIV_INV,
SH_DIV_INV_MINLAT,
SH_DIV_INV20U,
SH_DIV_INV20L,
SH_DIV_INV_CALL,
SH_DIV_INV_CALL2,
SH_DIV_INV_FP
SH_DIV_INV_FP,
/* SH1 .. SH4 strategies. Because of the small number of registers
available, the compiler uses knowledge of the actual set of registers
being clobbered by the different functions called. */
SH_DIV_CALL_DIV1, /* No FPU, medium size, highest latency. */
SH_DIV_CALL_FP, /* FPU needed, small size, high latency. */
SH_DIV_CALL_TABLE, /* No FPU, large size, medium latency. */
SH_DIV_INTRINSIC
};
extern enum sh_divide_strategy_e sh_div_strategy;
......@@ -611,17 +622,46 @@ do { \
targetm.asm_out.aligned_op.di = NULL; \
targetm.asm_out.unaligned_op.di = NULL; \
} \
if (TARGET_SH1) \
{ \
if (! strcmp (sh_div_str, "call-div1")) \
sh_div_strategy = SH_DIV_CALL_DIV1; \
else if (! strcmp (sh_div_str, "call-fp") \
&& (TARGET_FPU_DOUBLE \
|| (TARGET_HARD_SH4 && TARGET_SH2E) \
|| (TARGET_SHCOMPACT && TARGET_FPU_ANY))) \
sh_div_strategy = SH_DIV_CALL_FP; \
else if (! strcmp (sh_div_str, "call-table") && TARGET_SH3) \
sh_div_strategy = SH_DIV_CALL_TABLE; \
else \
/* Pick one that makes most sense for the target in general. \
It is not much good to use different functions depending \
on -Os, since then we'll end up with two different functions \
when some of the code is compiled for size, and some for \
speed. */ \
\
/* SH4 tends to emphasize speed. */ \
if (TARGET_HARD_SH4) \
sh_div_strategy = SH_DIV_CALL_TABLE; \
/* These have their own way of doing things. */ \
else if (TARGET_SH2A) \
sh_div_strategy = SH_DIV_INTRINSIC; \
/* ??? Should we use the integer SHmedia function instead? */ \
else if (TARGET_SHCOMPACT && TARGET_FPU_ANY) \
sh_div_strategy = SH_DIV_CALL_FP; \
/* SH1 .. SH3 cores often go into small-footprint systems, so \
default to the smallest implementation available. */ \
else \
sh_div_strategy = SH_DIV_CALL_DIV1; \
} \
if (sh_divsi3_libfunc[0]) \
; /* User supplied - leave it alone. */ \
else if (TARGET_HARD_SH4 && TARGET_SH2E) \
else if (TARGET_DIVIDE_CALL_FP) \
sh_divsi3_libfunc = "__sdivsi3_i4"; \
else if (TARGET_DIVIDE_CALL_TABLE) \
sh_divsi3_libfunc = "__sdivsi3_i4i"; \
else if (TARGET_SH5) \
{ \
if (TARGET_FPU_ANY && TARGET_SH1) \
sh_divsi3_libfunc = "__sdivsi3_i4"; \
else \
sh_divsi3_libfunc = "__sdivsi3_1"; \
} \
sh_divsi3_libfunc = "__sdivsi3_1"; \
else \
sh_divsi3_libfunc = "__sdivsi3"; \
if (TARGET_FMOVD) \
......
......@@ -1739,6 +1739,19 @@
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
;; Call to an unsigned SImode division library routine.  The udivsi3
;; expander generates this pattern with the address of __udivsi3_i4i
;; when TARGET_DIVIDE_CALL_TABLE is set.
;; The dividend is taken from R4 and the divisor from R5; the quotient
;; is operand 0.  Operand 1 is the register holding the address of the
;; routine, which is called with jsr.  Apart from the result, only T,
;; R1 and PR are declared as clobbered.
(define_insn "udivsi3_i4_int"
[(set (match_operand:SI 0 "register_operand" "=z")
(udiv:SI (reg:SI R4_REG) (reg:SI R5_REG)))
(clobber (reg:SI T_REG))
(clobber (reg:SI R1_REG))
(clobber (reg:SI PR_REG))
(use (match_operand:SI 1 "arith_reg_operand" "r"))]
"TARGET_SH1"
"jsr @%1%#"
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
(define_expand "udivsi3"
[(set (match_dup 3) (symbol_ref:SI "__udivsi3"))
(set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
......@@ -1757,7 +1770,12 @@
operands[3] = gen_reg_rtx (Pmode);
/* Emit the move of the address to a pseudo outside of the libcall. */
if (TARGET_HARD_SH4 && TARGET_SH2E)
if (TARGET_DIVIDE_CALL_TABLE)
{
function_symbol (operands[3], \"__udivsi3_i4i\", SFUNC_GOT);
last = gen_udivsi3_i4_int (operands[0], operands[3]);
}
else if (TARGET_DIVIDE_CALL_FP)
{
function_symbol (operands[3], \"__udivsi3_i4\", SFUNC_STATIC);
if (TARGET_FPU_SINGLE)
......@@ -1975,6 +1993,18 @@
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
;; Call to a signed SImode division library routine.  The divsi3
;; expander generates this pattern with the address of the function
;; named by sh_divsi3_libfunc when TARGET_DIVIDE_CALL_TABLE is set.
;; The dividend is taken from R4 and the divisor from R5; the quotient
;; is operand 0.  Operand 1 is the register holding the address of the
;; routine, which is called with jsr.  Apart from the result, only T,
;; PR and R1 are declared as clobbered.
(define_insn "divsi3_i4_int"
[(set (match_operand:SI 0 "register_operand" "=z")
(div:SI (reg:SI R4_REG) (reg:SI R5_REG)))
(clobber (reg:SI T_REG))
(clobber (reg:SI PR_REG))
(clobber (reg:SI R1_REG))
(use (match_operand:SI 1 "arith_reg_operand" "r"))]
"TARGET_SH1"
"jsr @%1%#"
[(set_attr "type" "sfunc")
(set_attr "needs_delay_slot" "yes")])
(define_expand "divsi3"
[(set (match_dup 3) (symbol_ref:SI "__sdivsi3"))
(set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
......@@ -1995,7 +2025,12 @@
operands[3] = gen_reg_rtx (Pmode);
/* Emit the move of the address to a pseudo outside of the libcall. */
if (TARGET_HARD_SH4 && TARGET_SH2E)
if (TARGET_DIVIDE_CALL_TABLE)
{
function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_GOT);
last = gen_divsi3_i4_int (operands[0], operands[3]);
}
else if (TARGET_DIVIDE_CALL_FP)
{
function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_STATIC);
if (TARGET_FPU_SINGLE)
......
; Options for the SH port of the compiler.
; Copyright (C) 2005 Free Software Foundation, Inc.
; Copyright (C) 2005, 2006 Free Software Foundation, Inc.
;
; This file is part of GCC.
;
......@@ -158,7 +158,7 @@ Align doubles at 64-bit boundaries
mdiv=
Target RejectNegative Joined Var(sh_div_str) Init("")
Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp
Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table
mdivsi3_libfunc=
Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("")
......
......@@ -5,6 +5,7 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \
LIB1ASMSRC = sh/lib1funcs.asm
LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \
_movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
_div_table \
$(LIB1ASMFUNCS_CACHE)
# We want fine grained libraries, so use the new code to build the
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment