Commit 4902aa64, authored by Bill Schmidt; committed by William Schmidt

re PR target/56843 (PowerPC Newton-Raphson reciprocal estimates can be improved)

gcc:

2013-04-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	PR target/56843
	* config/rs6000/rs6000.c (rs6000_emit_swdiv_high_precision): Remove.
	(rs6000_emit_swdiv_low_precision): Remove.
	(rs6000_emit_swdiv): Rewrite to handle between one and four
	iterations of Newton-Raphson generally; modify required number of
	iterations for some cases.
	* config/rs6000/rs6000.h (RS6000_RECIP_HIGH_PRECISION_P): Remove.

gcc/testsuite:

2013-04-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	PR target/56843
	* gcc.target/powerpc/recip-1.c: Modify expected output.
	* gcc.target/powerpc/recip-3.c: Likewise.
	* gcc.target/powerpc/recip-4.c: Likewise.
	* gcc.target/powerpc/recip-5.c: Add expected output for iterations.

From-SVN: r197534
parent 7bca81dc
2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
PR target/56843
* config/rs6000/rs6000.c (rs6000_emit_swdiv_high_precision): Remove.
(rs6000_emit_swdiv_low_precision): Remove.
(rs6000_emit_swdiv): Rewrite to handle between one and four
iterations of Newton-Raphson generally; modify required number of
iterations for some cases.
* config/rs6000/rs6000.h (RS6000_RECIP_HIGH_PRECISION_P): Remove.
2013-04-05 Steven Bosscher <steven@gcc.gnu.org> 2013-04-05 Steven Bosscher <steven@gcc.gnu.org>
* bb-reorder.c (fix_crossing_unconditional_branches): Remove a * bb-reorder.c (fix_crossing_unconditional_branches): Remove a
......
...@@ -26913,54 +26913,26 @@ rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a) ...@@ -26913,54 +26913,26 @@ rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a)
emit_insn (gen_rtx_SET (VOIDmode, dst, r)); emit_insn (gen_rtx_SET (VOIDmode, dst, r));
} }
/* Newton-Raphson approximation of floating point divide with just 2 passes /* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P,
(either single precision floating point, or newer machines with higher add a reg_note saying that this was a division. Support both scalar and
accuracy estimates). Support both scalar and vector divide. Assumes no vector divide. Assumes no trapping math and finite arguments. */
trapping math and finite arguments. */
static void void
rs6000_emit_swdiv_high_precision (rtx dst, rtx n, rtx d) rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
{ {
enum machine_mode mode = GET_MODE (dst); enum machine_mode mode = GET_MODE (dst);
rtx x0, e0, e1, y1, u0, v0; rtx one, x0, e0, x1, xprev, eprev, xnext, enext, u, v;
enum insn_code code = optab_handler (smul_optab, mode); int i;
gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
rtx one = rs6000_load_constant_and_splat (mode, dconst1);
gcc_assert (code != CODE_FOR_nothing);
/* x0 = 1./d estimate */
x0 = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET (VOIDmode, x0,
gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
UNSPEC_FRES)));
e0 = gen_reg_rtx (mode);
rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - (d * x0) */
e1 = gen_reg_rtx (mode);
rs6000_emit_madd (e1, e0, e0, e0); /* e1 = (e0 * e0) + e0 */
y1 = gen_reg_rtx (mode);
rs6000_emit_madd (y1, e1, x0, x0); /* y1 = (e1 * x0) + x0 */
u0 = gen_reg_rtx (mode);
emit_insn (gen_mul (u0, n, y1)); /* u0 = n * y1 */
v0 = gen_reg_rtx (mode);
rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - (d * u0) */
rs6000_emit_madd (dst, v0, y1, u0); /* dst = (v0 * y1) + u0 */
}
/* Newton-Raphson approximation of floating point divide that has a low /* Low precision estimates guarantee 5 bits of accuracy. High
precision estimate. Assumes no trapping math and finite arguments. */ precision estimates guarantee 14 bits of accuracy. SFmode
requires 23 bits of accuracy. DFmode requires 52 bits of
accuracy. Each pass at least doubles the accuracy, leading
to the following. */
int passes = (TARGET_RECIP_PRECISION) ? 1 : 3;
if (mode == DFmode || mode == V2DFmode)
passes++;
static void
rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
{
enum machine_mode mode = GET_MODE (dst);
rtx x0, e0, e1, e2, y1, y2, y3, u0, v0, one;
enum insn_code code = optab_handler (smul_optab, mode); enum insn_code code = optab_handler (smul_optab, mode);
gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code); gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
...@@ -26974,46 +26946,44 @@ rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d) ...@@ -26974,46 +26946,44 @@ rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
gen_rtx_UNSPEC (mode, gen_rtvec (1, d), gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
UNSPEC_FRES))); UNSPEC_FRES)));
e0 = gen_reg_rtx (mode); /* Each iteration but the last calculates x_(i+1) = x_i * (2 - d * x_i). */
rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - d * x0 */ if (passes > 1) {
y1 = gen_reg_rtx (mode); /* e0 = 1. - d * x0 */
rs6000_emit_madd (y1, e0, x0, x0); /* y1 = x0 + e0 * x0 */ e0 = gen_reg_rtx (mode);
rs6000_emit_nmsub (e0, d, x0, one);
e1 = gen_reg_rtx (mode); /* x1 = x0 + e0 * x0 */
emit_insn (gen_mul (e1, e0, e0)); /* e1 = e0 * e0 */ x1 = gen_reg_rtx (mode);
rs6000_emit_madd (x1, e0, x0, x0);
y2 = gen_reg_rtx (mode); for (i = 0, xprev = x1, eprev = e0; i < passes - 2;
rs6000_emit_madd (y2, e1, y1, y1); /* y2 = y1 + e1 * y1 */ ++i, xprev = xnext, eprev = enext) {
/* enext = eprev * eprev */
enext = gen_reg_rtx (mode);
emit_insn (gen_mul (enext, eprev, eprev));
e2 = gen_reg_rtx (mode); /* xnext = xprev + enext * xprev */
emit_insn (gen_mul (e2, e1, e1)); /* e2 = e1 * e1 */ xnext = gen_reg_rtx (mode);
rs6000_emit_madd (xnext, enext, xprev, xprev);
}
y3 = gen_reg_rtx (mode); } else
rs6000_emit_madd (y3, e2, y2, y2); /* y3 = y2 + e2 * y2 */ xprev = x0;
u0 = gen_reg_rtx (mode); /* The last iteration calculates x_(i+1) = n * x_i * (2 - d * x_i). */
emit_insn (gen_mul (u0, n, y3)); /* u0 = n * y3 */
v0 = gen_reg_rtx (mode); /* u = n * xprev */
rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - d * u0 */ u = gen_reg_rtx (mode);
emit_insn (gen_mul (u, n, xprev));
rs6000_emit_madd (dst, v0, y3, u0); /* dst = u0 + v0 * y3 */ /* v = n - (d * u) */
} v = gen_reg_rtx (mode);
rs6000_emit_nmsub (v, d, u, n);
/* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P, /* dst = (v * xprev) + u */
add a reg_note saying that this was a division. Support both scalar and rs6000_emit_madd (dst, v, xprev, u);
vector divide. Assumes no trapping math and finite arguments. */
void
rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
{
enum machine_mode mode = GET_MODE (dst);
if (RS6000_RECIP_HIGH_PRECISION_P (mode))
rs6000_emit_swdiv_high_precision (dst, n, d);
else
rs6000_emit_swdiv_low_precision (dst, n, d);
if (note_p) if (note_p)
add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d)); add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d));
...@@ -27028,7 +26998,16 @@ rs6000_emit_swrsqrt (rtx dst, rtx src) ...@@ -27028,7 +26998,16 @@ rs6000_emit_swrsqrt (rtx dst, rtx src)
enum machine_mode mode = GET_MODE (src); enum machine_mode mode = GET_MODE (src);
rtx x0 = gen_reg_rtx (mode); rtx x0 = gen_reg_rtx (mode);
rtx y = gen_reg_rtx (mode); rtx y = gen_reg_rtx (mode);
int passes = (TARGET_RECIP_PRECISION) ? 2 : 3;
/* Low precision estimates guarantee 5 bits of accuracy. High
precision estimates guarantee 14 bits of accuracy. SFmode
requires 23 bits of accuracy. DFmode requires 52 bits of
accuracy. Each pass at least doubles the accuracy, leading
to the following. */
int passes = (TARGET_RECIP_PRECISION) ? 1 : 3;
if (mode == DFmode || mode == V2DFmode)
passes++;
REAL_VALUE_TYPE dconst3_2; REAL_VALUE_TYPE dconst3_2;
int i; int i;
rtx halfthree; rtx halfthree;
......
...@@ -601,9 +601,6 @@ extern unsigned char rs6000_recip_bits[]; ...@@ -601,9 +601,6 @@ extern unsigned char rs6000_recip_bits[];
#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \ #define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \
(rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE) (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE)
#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \
((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION)
/* The default CPU for TARGET_OPTION_OVERRIDE. */ /* The default CPU for TARGET_OPTION_OVERRIDE. */
#define OPTION_TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT #define OPTION_TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT
......
2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
PR target/56843
* gcc.target/powerpc/recip-1.c: Modify expected output.
* gcc.target/powerpc/recip-3.c: Likewise.
* gcc.target/powerpc/recip-4.c: Likewise.
* gcc.target/powerpc/recip-5.c: Add expected output for iterations.
2013-04-05 Greta Yorsh <Greta.Yorsh@arm.com> 2013-04-05 Greta Yorsh <Greta.Yorsh@arm.com>
* gcc.target/arm/peep-ldrd-1.c: New test. * gcc.target/arm/peep-ldrd-1.c: New test.
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */ /* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */
/* { dg-final { scan-assembler-times "frsqrte" 2 } } */ /* { dg-final { scan-assembler-times "frsqrte" 2 } } */
/* { dg-final { scan-assembler-times "fmsub" 2 } } */ /* { dg-final { scan-assembler-times "fmsub" 2 } } */
/* { dg-final { scan-assembler-times "fmul" 8 } } */ /* { dg-final { scan-assembler-times "fmul" 6 } } */
/* { dg-final { scan-assembler-times "fnmsub" 4 } } */ /* { dg-final { scan-assembler-times "fnmsub" 3 } } */
double double
rsqrt_d (double a) rsqrt_d (double a)
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
/* { dg-final { scan-assembler-times "xsnmsub.dp\|fnmsub\ " 2 } } */ /* { dg-final { scan-assembler-times "xsnmsub.dp\|fnmsub\ " 2 } } */
/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */ /* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
/* { dg-final { scan-assembler-times "fmsubs" 1 } } */ /* { dg-final { scan-assembler-times "fmsubs" 1 } } */
/* { dg-final { scan-assembler-times "fmuls" 4 } } */ /* { dg-final { scan-assembler-times "fmuls" 2 } } */
/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */ /* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
double double
rsqrt_d (double a) rsqrt_d (double a)
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */ /* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */
/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */ /* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */
/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */ /* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */
/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */ /* { dg-final { scan-assembler-times "xvmulsp" 2 } } */
/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */ /* { dg-final { scan-assembler-times "xvnmsub.sp" 1 } } */
#define SIZE 1024 #define SIZE 1024
......
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
/* { dg-final { scan-assembler-times "xvresp" 5 } } */ /* { dg-final { scan-assembler-times "xvresp" 5 } } */
/* { dg-final { scan-assembler-times "xsredp" 2 } } */ /* { dg-final { scan-assembler-times "xsredp" 2 } } */
/* { dg-final { scan-assembler-times "fres" 2 } } */ /* { dg-final { scan-assembler-times "fres" 2 } } */
/* { dg-final { scan-assembler-times "fmuls" 2 } } */
/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
/* { dg-final { scan-assembler-times "xsmuldp" 2 } } */
/* { dg-final { scan-assembler-times "xsnmsub.dp" 4 } } */
/* { dg-final { scan-assembler-times "xvmulsp" 7 } } */
/* { dg-final { scan-assembler-times "xvnmsub.sp" 5 } } */
/* { dg-final { scan-assembler-times "xvmuldp" 6 } } */
/* { dg-final { scan-assembler-times "xvnmsub.dp" 8 } } */
#include <altivec.h> #include <altivec.h>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment