divtab-sh4.c, [...]: New files.

2006-03-23 J"orn Rennecke <joern.rennecke@st.com> * config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files. * config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant. * config/sh/t-sh (LIB1ASMFUNCS): Add _div_table. * config/sh/sh.opt (mdiv=): Amend description. * config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro. (TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise. (sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1, SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC. (OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1. Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros. * config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns. (udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE / TARGET_DIVIDE_CALL_FP. From-SVN: r112331

divtab-sh4.c, [...]: New files.
2006-03-23 J"orn Rennecke <joern.rennecke@st.com> * config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files. * config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant. * config/sh/t-sh (LIB1ASMFUNCS): Add _div_table. * config/sh/sh.opt (mdiv=): Amend description. * config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro. (TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise. (sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1, SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC. (OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1. Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros. * config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns. (udivsi3, divsi3): Use them. Check TARGET_DIVIDE_CALL_TABLE / TARGET_DIVIDE_CALL_FP. From-SVN: r112331
b368d6b8 · J"orn Rennecke · Joern Rennecke · a57aee2a · b368d6b8 · b368d6b8
Commit b368d6b8 authored Mar 23, 2006 by J"orn Rennecke Committed by Joern Rennecke Mar 23, 2006
8 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2006-03-23  J"orn Rennecke <joern.rennecke@st.com>
+	* config/sh/divtab-sh4.c, config/sh/divcost-analysis: New files.
+	* config/sh/lib1funcs.asm (div_table): Add !__SH5__ variant.
+	* config/sh/t-sh (LIB1ASMFUNCS): Add _div_table.
+	* config/sh/sh.opt (mdiv=): Amend description.
+	* config/sh/sh.h (TARGET_DIVIDE_CALL_DIV1): New macro.
+	(TARGET_DIVIDE_CALL_FP, TARGET_DIVIDE_CALL_TABLE): Likewise.
+	(sh_divide_strategy_e): Add new members SH_DIV_CALL_DIV1,
+	SH_DIV_CALL_FP, SH_DIV_CALL_TABLE and SH_DIV_INTRINSIC.
+	(OVERRIDE_OPTIONS): Also process sh_div_str for TARGET_SH1.
+	Calculate sh_divsi3_libfunc using TARGET_DIVIDE_* macros.
+	* config/sh/sh.md (udivsi3_i4_int, divsi3_i4_int): New patterns.
+	(udivsi3, divsi3): Use them.  Check TARGET_DIVIDE_CALL_TABLE /
+	TARGET_DIVIDE_CALL_FP.
 2006-03-23  Maxim Kuvyrkov  <mkuvyrkov@ispras.ru>
 	* haifa-sched.c (choose_ready): Fix type of the local variable.

--- a/gcc/config/sh/divcost-analysis
+++ b/gcc/config/sh/divcost-analysis
+Analysis of cycle costs for SH4:
+-> udiv_le128:            5
+-> udiv_ge64k:            6
+-> udiv udiv_25:         10
+-> pos_divisor:           3
+-> pos_result linear:     5
+-> pos_result - -:        5
+-> div_le128:             7
+-> div_ge64k:             9
+sdivsi3 -> udiv_25:            13
+udiv25 -> div_ge64k_end:       15
+div_ge64k_end -> rts:          13
+div_le128 -> div_le128_2:       2, r1 latency 3
+udiv_le128 -> div_le128_2:      2, r1 latency 3
+(u)div_le128 -> div_by_1:       9
+(u)div_le128 -> rts:           17
+div_by_1(_neg) -> rts:          4
+div_ge64k -> div_r8:            2
+div_ge64k -> div_ge64k_2:       3
+udiv_ge64k -> udiv_r8:          3
+udiv_ge64k -> div_ge64k_2:      3 + LS
+(u)div_ge64k -> div_ge64k_end: 13
+div_r8 -> div_r8_2:             2
+udiv_r8 -> div_r8_2:            2 + LS
+(u)div_r8 -> rts:              21
+-> - + neg_result:             5
+-> + - neg_result:             5
+-> div_le128_neg:              7
+-> div_ge64k_neg:              9
+-> div_r8_neg:                11
+-> <64k div_ge64k_neg_end:    28
+-> >=64k div_ge64k_neg_end:   22
+div_ge64k_neg_end ft -> rts:  14
+div_r8_neg_end -> rts:         4
+div_r8_neg -> div_r8_neg_end: 18
+div_le128_neg -> div_by_1_neg: 4
+div_le128_neg -> rts:         18
+                    absolute divisor range:
+            1  [2..128]  [129..64K) [64K..|dividend|/256] >=64K,>|dividend/256|
+udiv       18     22         38            32                   30
+sdiv pos:  20     24         41            35                   32
+sdiv neg:  15     25         42            36                   33
+fp-based:
+unsigned: 42 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
+signed: 33 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site
+call-div1:    divisor range:
+              [1..64K)  >= 64K
+unsigned:       63        58
+signed:         76        76
+SFUNC_STATIC call overhead:
+mov.l 0f,r1
+bsrf r1
+SFUNC_GOT call overhead - current:
+mov.l 0f,r1
+mova 0f,r0
+mov.l 1f,r2
+add r1,r0
+mov.l @(r0,r2),r0
+jmp @r0
+; 3 cycles worse than SFUNC_STATIC
+SFUNC_GOT call overhead - improved assembler:
+mov.l 0f,r1
+mova 0f,r0
+mov.l @(r0,r1),r0
+jmp @r0
+; 2 cycles worse than SFUNC_STATIC
--- a/gcc/config/sh/divtab-sh4.c
+++ b/gcc/config/sh/divtab-sh4.c
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+/* Calculate division table for SH2..4 integer division
+   Contributed by Joern Rennecke
+   joern.rennecke@superh.com  */
+#include <stdio.h>
+#include <math.h>
+/* Write the assembler source of the division lookup tables to stdout:
+   the byte tables labelled LOCAL(div_table_clz) and LOCAL(div_table_ix),
+   followed by the 32-bit normalized inverses for 1/64 .. 1/127 labelled
+   LOCAL(zero_l) / LOCAL(div_table), and a trailing comment reporting the
+   largest rounding error.  Takes no arguments and always returns 0.  */
+int
+main (void)
+{
+  int i, j;
+  double q, r, err, max_err = 0, max_s_err = 0;
+  puts ("/* This table has been generated by divtab-sh4.c.  */");
+  puts ("\t.balign 4");
+  puts ("LOCAL(div_table_clz):");
+  /* output some dummy number for 1/0.  */
+  printf ("\t.byte\t%d\n", 0);
+  for (i = 1; i <= 128; i++)
+    {
+      int n = 0;
+      /* The label for the second table is emitted just before the entry
+	 for 128, so that this entry is shared by both tables.  */
+      if (i == 128)
+	puts ("\
+/* Lookup table translating positive divisor to index into table of\n\
+   normalized inverse.  N.B. the '0' entry is also the last entry of the\n\
+ previous table, and causes an unaligned access for division by zero.  */\n\
+LOCAL(div_table_ix):");
+      /* Count the members of the sequence i, 2*i, 4*i, ... that do not
+	 exceed 128.  */
+      for (j = i; j <= 128; j += j)
+	n++;
+      printf ("\t.byte\t%d\n", n - 7);
+    }
+  for (i = 1; i <= 128; i++)
+    {
+      /* Normalize the divisor into the range [128, 255] by doubling; the
+	 emitted value j * 2 - 96*4 then lies in [-128, 126] and fits in a
+	 signed byte.  */
+      j = i;
+      while (j < 128)
+	j += j;
+      printf ("\t.byte\t%d\n", j * 2 - 96*4);
+    }
+  puts ("\
+/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.  */\n\
+	.balign 4\n\
+LOCAL(zero_l):");
+  for (i = 64; i < 128; i++)
+    {
+      if (i == 96)
+	puts ("LOCAL(div_table):");
+      /* q is 2**39 / i, i.e. at least 2**32 for every i in this loop.  */
+      q = 4.*(1<<30)*128/i;
+      r = ceil (q);
+      /* The value for 64 is actually differently scaled than it would
+	 appear from this calculation.  The implicit part is %01, not 10.
+	 Still, since the value in the table is 0 either way, this
+	 doesn't matter here.  Still, the 1/64 entry is effectively a 1/128
+	 entry.  */
+      /* Only the low 32 bits are stored.  Reduce modulo 2**32 before
+	 converting: converting an out-of-range double directly to
+	 unsigned is undefined behavior.  */
+      printf ("\t.long\t0x%X\n", (unsigned) fmod (r, 4294967296.0));
+      err = r - q;
+      if (err > max_err)
+	max_err = err;
+      err = err * i / 128;
+      if (err > max_s_err)
+	max_s_err = err;
+    }
+  printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err);
+  return 0;
+}
--- a/gcc/config/sh/lib1funcs.asm
+++ b/gcc/config/sh/lib1funcs.asm
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -234,6 +234,9 @@ do { \
 #define TARGET_DIVIDE_INV20L (sh_div_strategy == SH_DIV_INV20L)
 #define TARGET_DIVIDE_INV_CALL (sh_div_strategy == SH_DIV_INV_CALL)
 #define TARGET_DIVIDE_INV_CALL2 (sh_div_strategy == SH_DIV_INV_CALL2)
+#define TARGET_DIVIDE_CALL_DIV1 (sh_div_strategy == SH_DIV_CALL_DIV1)
+#define TARGET_DIVIDE_CALL_FP (sh_div_strategy == SH_DIV_CALL_FP)
+#define TARGET_DIVIDE_CALL_TABLE (sh_div_strategy == SH_DIV_CALL_TABLE)
 #define SELECT_SH1               (MASK_SH1)
 #define SELECT_SH2               (MASK_SH2 | SELECT_SH1)
@@ -467,7 +470,7 @@ do {									\
      sh_div_str = SH_DIV_STR_FOR_SIZE ;				\
    }									\
  /* We can't meaningfully test TARGET_SHMEDIA here, because -m options	\
-     haven't been parsed yet, hence we';d read only the default.	\
+     haven't been parsed yet, hence we'd read only the default.	\
     sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so	\
     it's OK to always set flag_branch_target_load_optimize.  */	\
  if (LEVEL > 1)							\
@@ -492,16 +495,24 @@ do {									\
 extern int assembler_dialect;
 enum sh_divide_strategy_e {
+  /* SH5 strategies.  */
  SH_DIV_CALL,
  SH_DIV_CALL2,
-  SH_DIV_FP,
+  SH_DIV_FP, /* We could do this also for SH4.  */
  SH_DIV_INV,
  SH_DIV_INV_MINLAT,
  SH_DIV_INV20U,
  SH_DIV_INV20L,
  SH_DIV_INV_CALL,
  SH_DIV_INV_CALL2,
-  SH_DIV_INV_FP
+  SH_DIV_INV_FP,
+  /* SH1 .. SH4 strategies.  Because of the small number of registers
+     available, the compiler uses knowledge of the actual set of registers
+     being clobbered by the different functions called.  */
+  SH_DIV_CALL_DIV1, /* No FPU, medium size, highest latency.  */
+  SH_DIV_CALL_FP,     /* FPU needed, small size, high latency.  */
+  SH_DIV_CALL_TABLE,  /* No FPU, large size, medium latency. */
+  SH_DIV_INTRINSIC
 };
 extern enum sh_divide_strategy_e sh_div_strategy;
@@ -611,17 +622,46 @@ do {									\
       targetm.asm_out.aligned_op.di = NULL;				\
       targetm.asm_out.unaligned_op.di = NULL;				\
    }									\
+  if (TARGET_SH1)							\
+    {									\
+      if (! strcmp (sh_div_str, "call-div1"))				\
+	sh_div_strategy = SH_DIV_CALL_DIV1;				\
+      else if (! strcmp (sh_div_str, "call-fp")				\
+	       && (TARGET_FPU_DOUBLE					\
+		   || (TARGET_HARD_SH4 && TARGET_SH2E)			\
+		   || (TARGET_SHCOMPACT && TARGET_FPU_ANY)))		\
+	sh_div_strategy = SH_DIV_CALL_FP;				\
+      else if (! strcmp (sh_div_str, "call-table") && TARGET_SH3)	\
+	sh_div_strategy = SH_DIV_CALL_TABLE;				\
+      else								\
+	/* Pick one that makes most sense for the target in general.	\
+	   It is not much good to use different functions depending	\
+	   on -Os, since then we'll end up with two different functions	\
+	   when some of the code is compiled for size, and some for	\
+	   speed.  */							\
+									\
+	/* SH4 tends to emphasize speed.  */				\
+	if (TARGET_HARD_SH4)						\
+	  sh_div_strategy = SH_DIV_CALL_TABLE;				\
+	/* These have their own way of doing things.  */		\
+	else if (TARGET_SH2A)						\
+	  sh_div_strategy = SH_DIV_INTRINSIC;				\
+	/* ??? Should we use the integer SHmedia function instead?  */	\
+	else if (TARGET_SHCOMPACT && TARGET_FPU_ANY)			\
+	  sh_div_strategy = SH_DIV_CALL_FP;				\
+        /* SH1 .. SH3 cores often go into small-footprint systems, so	\
+	   default to the smallest implementation available.  */	\
+	else								\
+	  sh_div_strategy = SH_DIV_CALL_DIV1;				\
+    }									\
  if (sh_divsi3_libfunc[0])						\
    ; /* User supplied - leave it alone.  */				\
-  else if (TARGET_HARD_SH4 && TARGET_SH2E)				\
+  else if (TARGET_DIVIDE_CALL_FP)					\
    sh_divsi3_libfunc = "__sdivsi3_i4";					\
+  else if (TARGET_DIVIDE_CALL_TABLE)					\
+    sh_divsi3_libfunc = "__sdivsi3_i4i";				\
  else if (TARGET_SH5)							\
-    {									\
+    sh_divsi3_libfunc = "__sdivsi3_1";					\
-      if (TARGET_FPU_ANY && TARGET_SH1)					\
-	sh_divsi3_libfunc = "__sdivsi3_i4";				\
-      else								\
-	sh_divsi3_libfunc = "__sdivsi3_1";				\
-    }									\
  else									\
    sh_divsi3_libfunc = "__sdivsi3";					\
  if (TARGET_FMOVD)							\

--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -1739,6 +1739,19 @@
  [(set_attr "type" "sfunc")
   (set_attr "needs_delay_slot" "yes")])
+(define_insn "udivsi3_i4_int"
+  [(set (match_operand:SI 0 "register_operand" "=z")
+	(udiv:SI (reg:SI R4_REG) (reg:SI R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI R1_REG))
+   (clobber (reg:SI PR_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
 (define_expand "udivsi3"
  [(set (match_dup 3) (symbol_ref:SI "__udivsi3"))
   (set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
@@ -1757,7 +1770,12 @@
  operands[3] = gen_reg_rtx (Pmode);
  /* Emit the move of the address to a pseudo outside of the libcall.  */
-  if (TARGET_HARD_SH4 && TARGET_SH2E)
+  if (TARGET_DIVIDE_CALL_TABLE)
+    {
+      function_symbol (operands[3], \"__udivsi3_i4i\", SFUNC_GOT);
+      last = gen_udivsi3_i4_int (operands[0], operands[3]);
+    }
+  else if (TARGET_DIVIDE_CALL_FP)
    {
      function_symbol (operands[3], \"__udivsi3_i4\", SFUNC_STATIC);
      if (TARGET_FPU_SINGLE)
@@ -1975,6 +1993,18 @@
  [(set_attr "type" "sfunc")
   (set_attr "needs_delay_slot" "yes")])
+(define_insn "divsi3_i4_int"
+  [(set (match_operand:SI 0 "register_operand" "=z")
+	(div:SI (reg:SI R4_REG) (reg:SI R5_REG)))
+   (clobber (reg:SI T_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (reg:SI R1_REG))
+   (use (match_operand:SI 1 "arith_reg_operand" "r"))]
+  "TARGET_SH1"
+  "jsr	@%1%#"
+  [(set_attr "type" "sfunc")
+   (set_attr "needs_delay_slot" "yes")])
 (define_expand "divsi3"
  [(set (match_dup 3) (symbol_ref:SI "__sdivsi3"))
   (set (reg:SI R4_REG) (match_operand:SI 1 "general_operand" ""))
@@ -1995,7 +2025,12 @@
  operands[3] = gen_reg_rtx (Pmode);
  /* Emit the move of the address to a pseudo outside of the libcall.  */
-  if (TARGET_HARD_SH4 && TARGET_SH2E)
+  if (TARGET_DIVIDE_CALL_TABLE)
+    {
+      function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_GOT);
+      last = gen_divsi3_i4_int (operands[0], operands[3]);
+    }
+  else if (TARGET_DIVIDE_CALL_FP)
    {
      function_symbol (operands[3], sh_divsi3_libfunc, SFUNC_STATIC);
      if (TARGET_FPU_SINGLE)

--- a/gcc/config/sh/sh.opt
+++ b/gcc/config/sh/sh.opt
 ; Options for the SH port of the compiler.
-; Copyright (C) 2005 Free Software Foundation, Inc.
+; Copyright (C) 2005, 2006 Free Software Foundation, Inc.
 ;
 ; This file is part of GCC.
 ;
@@ -158,7 +158,7 @@ Align doubles at 64-bit boundaries
 mdiv=
 Target RejectNegative Joined Var(sh_div_str) Init("")
-Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp
+Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table
 mdivsi3_libfunc=
 Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("")

--- a/gcc/config/sh/t-sh
+++ b/gcc/config/sh/t-sh
@@ -5,6 +5,7 @@ sh-c.o: $(srcdir)/config/sh/sh-c.c \
 LIB1ASMSRC = sh/lib1funcs.asm
 LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \
  _movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+  _div_table \
  $(LIB1ASMFUNCS_CACHE)
 # We want fine grained libraries, so use the new code to build the