Commit f3a83111 by Steve Ellcey Committed by Steve Ellcey

ia64-protos.h (ia64_dconst_0_5): New.

	* config/ia64/ia64-protos.h (ia64_dconst_0_5): New.
	(ia64_dconst_0_375): New.
	* config/ia64/ia64.c (ia64_override_options): Remove
	-minline-sqrt-min-latency warning.
	(ia64_dconst_0_5_rtx, ia64_dconst_0_5): New.
	(ia64_dconst_0_375_rtx, ia64_dconst_0_375): New.
	* config/ia64/ia64.md (*sqrt_approx): Remove.
	(sqrtsf2): Remove #if 0.
	(sqrtsf2_internal_thr): Rewrite and move to div.md.
	(sqrtdf2): Remove assert.
	(sqrtdf2_internal_thr): Rewrite and move to div.md.
	(sqrtxf2): Remove #if 0.
	(sqrtxf2_internal_thr): Rewrite and move to div.md.
	* config/ia64/div.md (sqrt_approx_rf): New.
	(sqrtsf2_internal_thr): New implementation.
	(sqrtsf2_internal_lat): New.
	(sqrtdf2_internal_thr): New implementation.
	(sqrtxf2_internal): New implementation.

From-SVN: r147713
parent 1ffc7157
2009-05-19 Steve Ellcey <sje@cup.hp.com>
* config/ia64/ia64-protos.h (ia64_dconst_0_5): New.
(ia64_dconst_0_375): New.
* config/ia64/ia64.c (ia64_override_options): Remove
-minline-sqrt-min-latency warning.
(ia64_dconst_0_5_rtx, ia64_dconst_0_5): New.
(ia64_dconst_0_375_rtx, ia64_dconst_0_375): New.
* config/ia64/ia64.md (*sqrt_approx): Remove.
(sqrtsf2): Remove #if 0.
(sqrtsf2_internal_thr): Rewrite and move to div.md.
(sqrtdf2): Remove assert.
(sqrtdf2_internal_thr): Rewrite and move to div.md.
(sqrtxf2): Remove #if 0.
(sqrtxf2_internal_thr): Rewrite and move to div.md.
* config/ia64/div.md (sqrt_approx_rf): New.
(sqrtsf2_internal_thr): New implementation.
(sqrtsf2_internal_lat): New.
(sqrtdf2_internal_thr): New implementation.
(sqrtxf2_internal): New implementation.
2009-05-19 Francois-Xavier Coudert <fxcoudert@gmail.com>
Hans-Peter Nilsson <hp@axis.com>
......
......@@ -518,3 +518,281 @@
emit_insn (gen_truncrfxf2 (operands[0], q_res));
DONE;
})
;; SQRT operations
;; Reciprocal square root approximation in RFmode, emitted as one frsqrta
;; instruction.
;;   operand 0: FP register that receives the approximation.
;;   operand 1: input value; must satisfy fr_reg_or_fp01_operand.
;;   operand 2: BImode register written together with operand 0.  The sqrt
;;              expanders in this file pass it as the condition of every
;;              refinement step that follows.
;;   operand 3: constant integer printed after ".s" in the mnemonic
;;              (presumably the FP status field selector -- confirm).
;; The insn is marked as not predicable.
(define_insn "sqrt_approx_rf"
[(set (match_operand:RF 0 "fr_register_operand" "=f")
(unspec:RF [(match_operand:RF 1 "fr_reg_or_fp01_operand" "fG")]
UNSPEC_FR_SQRT_RECIP_APPROX_RES))
(set (match_operand:BI 2 "register_operand" "=c")
(unspec:BI [(match_dup 1)] UNSPEC_FR_SQRT_RECIP_APPROX))
(use (match_operand:SI 3 "const_int_operand" ""))]
""
"frsqrta.s%3 %0, %2 = %F1"
[(set_attr "itanium_class" "fmisc")
(set_attr "predicable" "no")])
;; Inline SFmode square root, "thr" variant.  Expands into the
;; sqrt_approx_rf approximation followed by conditional RFmode
;; multiply/add refinement steps, then truncates the result to SFmode.
;;   operand 0: SFmode destination register.
;;   operand 1: SFmode source register.
(define_expand "sqrtsf2_internal_thr"
[(set (match_operand:SF 0 "fr_register_operand" "")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "")))]
"TARGET_INLINE_SQRT"
{
/* One fresh RFmode pseudo per intermediate value of the sequence below. */
rtx y = gen_reg_rtx (RFmode);
rtx b = gen_reg_rtx (RFmode);
rtx g = gen_reg_rtx (RFmode);
rtx e = gen_reg_rtx (RFmode);
rtx s = gen_reg_rtx (RFmode);
rtx f = gen_reg_rtx (RFmode);
rtx y1 = gen_reg_rtx (RFmode);
rtx g1 = gen_reg_rtx (RFmode);
rtx h = gen_reg_rtx (RFmode);
rtx d = gen_reg_rtx (RFmode);
rtx g2 = gen_reg_rtx (RFmode);
/* Written by the approximation insn and passed to every later step. */
rtx cond = gen_reg_rtx (BImode);
rtx zero = CONST0_RTX (RFmode);
rtx one = CONST1_RTX (RFmode);
/* 0.5 and 0.375 start as DFmode constants, are moved into DFmode
   registers and are then extended to RFmode. */
rtx c1 = ia64_dconst_0_5();
rtx c2 = ia64_dconst_0_375();
rtx reg_df_c1 = gen_reg_rtx (DFmode);
rtx reg_df_c2 = gen_reg_rtx (DFmode);
rtx reg_rf_c1 = gen_reg_rtx (RFmode);
rtx reg_rf_c2 = gen_reg_rtx (RFmode);
/* Status operand values: status0 is used by the approximation and the
   final step, status1 by the intermediate steps. */
rtx status0 = CONST0_RTX (SImode);
rtx status1 = CONST1_RTX (SImode);
/* Truncation operand values: trunc_sgl is passed on the steps commented
   as rounding to single, trunc_off on all others. */
rtx trunc_sgl = CONST0_RTX (SImode);
rtx trunc_off = CONST2_RTX (SImode);
/* Put needed constants into registers. */
emit_insn (gen_movdf (reg_df_c1, c1));
emit_insn (gen_movdf (reg_df_c2, c2));
emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
emit_insn (gen_extenddfrf2 (reg_rf_c2, reg_df_c2));
/* Empty conversion to put input into RFmode. */
emit_insn (gen_extendsfrf2 (b, operands[1]));
/* y = approximation of 1 / sqrt (b); this insn also sets cond. */
emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
/* g = b * y */
emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
/* e = 1 - (g * y) */
emit_insn (gen_m2subrf4_cond (e, cond, one, g, y, zero, status1, trunc_off));
/* s = 0.5 + (0.375 * e) */
emit_insn (gen_m2addrf4_cond (s, cond, reg_rf_c1, reg_rf_c2, e, zero, status1, trunc_off));
/* f = y * e */
emit_insn (gen_mulrf3_cond (f, cond, y, e, zero, status1, trunc_off));
/* y1 = y + (f * s) */
emit_insn (gen_m2addrf4_cond (y1, cond, y, f, s, zero, status1, trunc_off));
/* g1 = single (b * y1) */
emit_insn (gen_mulrf3_cond (g1, cond, b, y1, zero, status1, trunc_sgl));
/* h = 0.5 * y1 */
emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y1, zero, status1, trunc_off));
/* d = b - g1 * g1 */
emit_insn (gen_m2subrf4_cond (d, cond, b, g1, g1, zero, status1, trunc_off));
/* g2 = single(g1 + (d * h)).  Unlike the earlier steps this one passes y
   instead of zero and uses status0; presumably y is the value kept when
   cond is false -- confirm against m2addrf4_cond. */
emit_insn (gen_m2addrf4_cond (g2, cond, g1, d, h, y, status0, trunc_sgl));
/* Conversion back into SFmode. */
emit_insn (gen_truncrfsf2 (operands[0], g2));
DONE;
})
;; Inline SFmode square root, "lat" variant (the one the sqrtsf2 expander
;; associates with INL_MIN_LAT).  Expands into the sqrt_approx_rf
;; approximation followed by conditional RFmode multiply/add refinement
;; steps, then truncates the result to SFmode.
;;   operand 0: SFmode destination register.
;;   operand 1: SFmode source register.
(define_expand "sqrtsf2_internal_lat"
[(set (match_operand:SF 0 "fr_register_operand" "")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "")))]
"TARGET_INLINE_SQRT"
{
/* One fresh RFmode pseudo per intermediate value of the sequence below. */
rtx y = gen_reg_rtx (RFmode);
rtx b = gen_reg_rtx (RFmode);
rtx g = gen_reg_rtx (RFmode);
rtx g1 = gen_reg_rtx (RFmode);
rtx g2 = gen_reg_rtx (RFmode);
rtx e = gen_reg_rtx (RFmode);
rtx s = gen_reg_rtx (RFmode);
rtx f = gen_reg_rtx (RFmode);
rtx f1 = gen_reg_rtx (RFmode);
rtx h = gen_reg_rtx (RFmode);
rtx h1 = gen_reg_rtx (RFmode);
rtx d = gen_reg_rtx (RFmode);
/* Written by the approximation insn and passed to every later step. */
rtx cond = gen_reg_rtx (BImode);
rtx zero = CONST0_RTX (RFmode);
rtx one = CONST1_RTX (RFmode);
/* 0.5 and 0.375 start as DFmode constants, are moved into DFmode
   registers and are then extended to RFmode. */
rtx c1 = ia64_dconst_0_5();
rtx c2 = ia64_dconst_0_375();
rtx reg_df_c1 = gen_reg_rtx (DFmode);
rtx reg_df_c2 = gen_reg_rtx (DFmode);
rtx reg_rf_c1 = gen_reg_rtx (RFmode);
rtx reg_rf_c2 = gen_reg_rtx (RFmode);
/* Status operand values: status0 is used by the approximation and the
   final step, status1 by the intermediate steps. */
rtx status0 = CONST0_RTX (SImode);
rtx status1 = CONST1_RTX (SImode);
/* Truncation operand values: trunc_sgl is passed on the steps commented
   as rounding to single, trunc_off on all others. */
rtx trunc_sgl = CONST0_RTX (SImode);
rtx trunc_off = CONST2_RTX (SImode);
/* Put needed constants into registers. */
emit_insn (gen_movdf (reg_df_c1, c1));
emit_insn (gen_movdf (reg_df_c2, c2));
emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
emit_insn (gen_extenddfrf2 (reg_rf_c2, reg_df_c2));
/* Empty conversion to put input into RFmode. */
emit_insn (gen_extendsfrf2 (b, operands[1]));
/* y = approximation of 1 / sqrt (b); this insn also sets cond. */
emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
/* g = b * y */
emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
/* e = 1 - (g * y) */
emit_insn (gen_m2subrf4_cond (e, cond, one, g, y, zero, status1, trunc_off));
/* h = 0.5 * y */
emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
/* s = 0.5 + (0.375 * e) */
emit_insn (gen_m2addrf4_cond (s, cond, reg_rf_c1, reg_rf_c2, e, zero, status1, trunc_off));
/* f = e * g */
emit_insn (gen_mulrf3_cond (f, cond, e, g, zero, status1, trunc_off));
/* g1 = single (g + (f * s)) */
emit_insn (gen_m2addrf4_cond (g1, cond, g, f, s, zero, status1, trunc_sgl));
/* f1 = e * h */
emit_insn (gen_mulrf3_cond (f1, cond, e, h, zero, status1, trunc_off));
/* d = b - g1 * g1 */
emit_insn (gen_m2subrf4_cond (d, cond, b, g1, g1, zero, status1, trunc_off));
/* h1 = h + (f1 * s) */
emit_insn (gen_m2addrf4_cond (h1, cond, h, f1, s, zero, status1, trunc_off));
/* g2 = single(g1 + (d * h1)).  Unlike the earlier steps this one passes y
   instead of zero and uses status0; presumably y is the value kept when
   cond is false -- confirm against m2addrf4_cond. */
emit_insn (gen_m2addrf4_cond (g2, cond, g1, d, h1, y, status0, trunc_sgl));
/* Conversion back into SFmode. */
emit_insn (gen_truncrfsf2 (operands[0], g2));
DONE;
})
;; Inline DFmode square root.  Expands into the sqrt_approx_rf
;; approximation followed by conditional RFmode multiply/add refinement
;; steps, then truncates the result to DFmode.
;;   operand 0: DFmode destination register.
;;   operand 1: DFmode source register.
(define_expand "sqrtdf2_internal_thr"
  [(set (match_operand:DF 0 "fr_register_operand" "")
        (sqrt:DF (match_operand:DF 1 "fr_register_operand" "")))]
  "TARGET_INLINE_SQRT"
{
  /* One fresh RFmode pseudo per intermediate value of the sequence.  */
  rtx y = gen_reg_rtx (RFmode);
  rtx b = gen_reg_rtx (RFmode);
  rtx g = gen_reg_rtx (RFmode);
  rtx g1 = gen_reg_rtx (RFmode);
  rtx g2 = gen_reg_rtx (RFmode);
  rtx g3 = gen_reg_rtx (RFmode);
  rtx g4 = gen_reg_rtx (RFmode);
  rtx r = gen_reg_rtx (RFmode);
  rtx r1 = gen_reg_rtx (RFmode);
  rtx h = gen_reg_rtx (RFmode);
  rtx h1 = gen_reg_rtx (RFmode);
  rtx h2 = gen_reg_rtx (RFmode);
  rtx d = gen_reg_rtx (RFmode);
  rtx d1 = gen_reg_rtx (RFmode);
  /* Written by the approximation insn and passed to every later step.  */
  rtx cond = gen_reg_rtx (BImode);
  rtx zero = CONST0_RTX (RFmode);
  /* 0.5 starts as a DFmode constant, is moved into a DFmode register and
     is then extended to RFmode.  */
  rtx c1 = ia64_dconst_0_5();
  rtx reg_df_c1 = gen_reg_rtx (DFmode);
  rtx reg_rf_c1 = gen_reg_rtx (RFmode);
  /* Status operand values: status0 for the approximation and the final
     step, status1 for the intermediate steps.  */
  rtx status0 = CONST0_RTX (SImode);
  rtx status1 = CONST1_RTX (SImode);
  /* Truncation operand values: trunc_dbl on the final step that produces
     the DFmode result, trunc_off on all others.  */
  rtx trunc_dbl = CONST1_RTX (SImode);
  rtx trunc_off = CONST2_RTX (SImode);

  /* Put needed constants into registers.  */
  emit_insn (gen_movdf (reg_df_c1, c1));
  emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
  /* Empty conversion to put input into RFmode.  */
  emit_insn (gen_extenddfrf2 (b, operands[1]));
  /* y = approximation of 1 / sqrt (b); this insn also sets cond.  */
  emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
  /* g = b * y */
  emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
  /* h = 0.5 * y */
  emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
  /* r = 0.5 - (g * h) */
  emit_insn (gen_m2subrf4_cond (r, cond, reg_rf_c1, g, h, zero, status1, trunc_off));
  /* g1 = g + (g * r) */
  emit_insn (gen_m2addrf4_cond (g1, cond, g, g, r, zero, status1, trunc_off));
  /* h1 = h + (h * r) */
  emit_insn (gen_m2addrf4_cond (h1, cond, h, h, r, zero, status1, trunc_off));
  /* r1 = 0.5 - (g1 * h1) */
  emit_insn (gen_m2subrf4_cond (r1, cond, reg_rf_c1, g1, h1, zero, status1, trunc_off));
  /* g2 = g1 + (g1 * r1) */
  emit_insn (gen_m2addrf4_cond (g2, cond, g1, g1, r1, zero, status1, trunc_off));
  /* h2 = h1 + (h1 * r1) */
  emit_insn (gen_m2addrf4_cond (h2, cond, h1, h1, r1, zero, status1, trunc_off));
  /* d = b - (g2 * g2) */
  emit_insn (gen_m2subrf4_cond (d, cond, b, g2, g2, zero, status1, trunc_off));
  /* g3 = g2 + (d * h2) */
  emit_insn (gen_m2addrf4_cond (g3, cond, g2, d, h2, zero, status1, trunc_off));
  /* d1 = b - (g3 * g3) */
  emit_insn (gen_m2subrf4_cond (d1, cond, b, g3, g3, zero, status1, trunc_off));
  /* g4 = double (g3 + (d1 * h2)).  This step produces the final result, so
     it uses status0 like the final step of sqrtsf2_internal_thr and
     sqrtsf2_internal_lat; it previously passed status1.  It also passes y
     instead of zero (presumably the value kept when cond is false --
     confirm against m2addrf4_cond).  */
  emit_insn (gen_m2addrf4_cond (g4, cond, g3, d1, h2, y, status0, trunc_dbl));
  /* Conversion back into DFmode.  */
  emit_insn (gen_truncrfdf2 (operands[0], g4));
  DONE;
})
;; Inline XFmode square root.  Expands into the sqrt_approx_rf
;; approximation followed by conditional RFmode multiply/add refinement
;; steps, then truncates the result to XFmode.
;;   operand 0: XFmode destination register.
;;   operand 1: XFmode source register.
(define_expand "sqrtxf2_internal"
  [(set (match_operand:XF 0 "fr_register_operand" "")
        (sqrt:XF (match_operand:XF 1 "fr_register_operand" "")))]
  "TARGET_INLINE_SQRT"
{
  /* One fresh RFmode pseudo per intermediate value of the sequence.  */
  rtx y = gen_reg_rtx (RFmode);
  rtx b = gen_reg_rtx (RFmode);
  rtx g = gen_reg_rtx (RFmode);
  rtx g1 = gen_reg_rtx (RFmode);
  rtx g2 = gen_reg_rtx (RFmode);
  rtx g3 = gen_reg_rtx (RFmode);
  rtx g4 = gen_reg_rtx (RFmode);
  rtx e = gen_reg_rtx (RFmode);
  rtx e1 = gen_reg_rtx (RFmode);
  rtx e2 = gen_reg_rtx (RFmode);
  rtx h = gen_reg_rtx (RFmode);
  rtx h1 = gen_reg_rtx (RFmode);
  rtx h2 = gen_reg_rtx (RFmode);
  rtx h3 = gen_reg_rtx (RFmode);
  rtx d = gen_reg_rtx (RFmode);
  rtx d1 = gen_reg_rtx (RFmode);
  /* Written by the approximation insn and passed to every later step.  */
  rtx cond = gen_reg_rtx (BImode);
  rtx zero = CONST0_RTX (RFmode);
  /* 0.5 starts as a DFmode constant, is moved into a DFmode register and
     is then extended to RFmode.  */
  rtx c1 = ia64_dconst_0_5();
  rtx reg_df_c1 = gen_reg_rtx (DFmode);
  rtx reg_rf_c1 = gen_reg_rtx (RFmode);
  /* Status operand values: status0 for the approximation and the final
     step, status1 for the intermediate steps.  */
  rtx status0 = CONST0_RTX (SImode);
  rtx status1 = CONST1_RTX (SImode);
  /* Truncation operand value passed on every step of this sequence.  */
  rtx trunc_off = CONST2_RTX (SImode);

  /* Put needed constants into registers.  */
  emit_insn (gen_movdf (reg_df_c1, c1));
  emit_insn (gen_extenddfrf2 (reg_rf_c1, reg_df_c1));
  /* Empty conversion to put input into RFmode.  */
  emit_insn (gen_extendxfrf2 (b, operands[1]));
  /* y = approximation of 1 / sqrt (b); this insn also sets cond.  */
  emit_insn (gen_sqrt_approx_rf (y, b, cond, status0));
  /* g = b * y */
  emit_insn (gen_mulrf3_cond (g, cond, b, y, zero, status1, trunc_off));
  /* h = 0.5 * y */
  emit_insn (gen_mulrf3_cond (h, cond, reg_rf_c1, y, zero, status1, trunc_off));
  /* e = 0.5 - (g * h) */
  emit_insn (gen_m2subrf4_cond (e, cond, reg_rf_c1, g, h, zero, status1, trunc_off));
  /* g1 = g + (g * e) */
  emit_insn (gen_m2addrf4_cond (g1, cond, g, g, e, zero, status1, trunc_off));
  /* h1 = h + (h * e) */
  emit_insn (gen_m2addrf4_cond (h1, cond, h, h, e, zero, status1, trunc_off));
  /* e1 = 0.5 - (g1 * h1) */
  emit_insn (gen_m2subrf4_cond (e1, cond, reg_rf_c1, g1, h1, zero, status1, trunc_off));
  /* g2 = g1 + (g1 * e1) */
  emit_insn (gen_m2addrf4_cond (g2, cond, g1, g1, e1, zero, status1, trunc_off));
  /* h2 = h1 + (h1 * e1) */
  emit_insn (gen_m2addrf4_cond (h2, cond, h1, h1, e1, zero, status1, trunc_off));
  /* d = b - (g2 * g2) */
  emit_insn (gen_m2subrf4_cond (d, cond, b, g2, g2, zero, status1, trunc_off));
  /* e2 = 0.5 - (g2 * h2) */
  emit_insn (gen_m2subrf4_cond (e2, cond, reg_rf_c1, g2, h2, zero, status1, trunc_off));
  /* g3 = g2 + (d * h2) */
  emit_insn (gen_m2addrf4_cond (g3, cond, g2, d, h2, zero, status1, trunc_off));
  /* h3 = h2 + (e2 * h2) */
  emit_insn (gen_m2addrf4_cond (h3, cond, h2, e2, h2, zero, status1, trunc_off));
  /* d1 = b - (g3 * g3) */
  emit_insn (gen_m2subrf4_cond (d1, cond, b, g3, g3, zero, status1, trunc_off));
  /* g4 = g3 + (d1 * h3).  This step produces the final result, so it uses
     status0 like the final step of sqrtsf2_internal_thr and
     sqrtsf2_internal_lat; it previously passed status1.  It also passes y
     instead of zero (presumably the value kept when cond is false --
     confirm against m2addrf4_cond).  */
  emit_insn (gen_m2addrf4_cond (g4, cond, g3, d1, h3, y, status0, trunc_off));
  /* Conversion back into XFmode.  */
  emit_insn (gen_truncrfxf2 (operands[0], g4));
  DONE;
})
......@@ -102,3 +102,6 @@ extern void ia64_profile_hook (int);
extern void ia64_optimization_options (int, int);
extern void ia64_init_expanders (void);
/* Return the cached DFmode CONST_DOUBLE rtx for 0.5. */
extern rtx ia64_dconst_0_5 (void);
/* Return the cached DFmode CONST_DOUBLE rtx for 0.375. */
extern rtx ia64_dconst_0_375 (void);
......@@ -5280,12 +5280,6 @@ ia64_override_options (void)
if (TARGET_AUTO_PIC)
target_flags |= MASK_CONST_GP;
if (TARGET_INLINE_SQRT == INL_MIN_LAT)
{
warning (0, "not yet implemented: latency-optimized inline square root");
TARGET_INLINE_SQRT = INL_MAX_THR;
}
ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload;
flag_schedule_insns_after_reload = 0;
......@@ -10571,4 +10565,33 @@ ia64_c_mode_for_suffix (char suffix)
return VOIDmode;
}
/* Cache for the DFmode constant 0.5; filled in on first request.  */
static GTY(()) rtx ia64_dconst_0_5_rtx;

/* Return the DFmode CONST_DOUBLE for 0.5, building it from the string
   "0.5" the first time it is asked for and reusing it afterwards.  */
rtx
ia64_dconst_0_5 (void)
{
  REAL_VALUE_TYPE half;

  if (ia64_dconst_0_5_rtx)
    return ia64_dconst_0_5_rtx;

  real_from_string (&half, "0.5");
  ia64_dconst_0_5_rtx = const_double_from_real_value (half, DFmode);
  return ia64_dconst_0_5_rtx;
}
/* Cache for the DFmode constant 0.375; filled in on first request.  */
static GTY(()) rtx ia64_dconst_0_375_rtx;

/* Return the DFmode CONST_DOUBLE for 0.375, building it from the string
   "0.375" the first time it is asked for and reusing it afterwards.  */
rtx
ia64_dconst_0_375 (void)
{
  REAL_VALUE_TYPE three_eighths;

  if (ia64_dconst_0_375_rtx)
    return ia64_dconst_0_375_rtx;

  real_from_string (&three_eighths, "0.375");
  ia64_dconst_0_375_rtx = const_double_from_real_value (three_eighths, DFmode);
  return ia64_dconst_0_375_rtx;
}
#include "gt-ia64.h"
......@@ -3161,21 +3161,6 @@
DONE;
})
;; Inline square root.
(define_insn "*sqrt_approx"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(div:XF (const_int 1)
(unspec:XF [(match_operand:XF 2 "fr_reg_or_fp01_operand" "fG")]
UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
(set (match_operand:BI 1 "register_operand" "=c")
(unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
(use (match_operand:SI 3 "const_int_operand" "")) ]
""
"frsqrta.s%3 %0, %1 = %2"
[(set_attr "itanium_class" "fmisc")
(set_attr "predicable" "no")])
(define_insn "setf_exp_xf"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(unspec:XF [(match_operand:DI 1 "register_operand" "r")]
......@@ -3184,134 +3169,23 @@
"setf.exp %0 = %1"
[(set_attr "itanium_class" "frfr")])
;; Inline square root.
;; Entry point for inline SFmode square root.  Picks the latency-optimized
;; expander when TARGET_INLINE_SQRT is INL_MIN_LAT and the throughput
;; expander otherwise.
(define_expand "sqrtsf2"
  [(set (match_operand:SF 0 "fr_register_operand" "=&f")
        (sqrt:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")))]
  "TARGET_INLINE_SQRT"
{
  rtx insn;

  /* sqrtsf2_internal_lat is defined, so dispatch to it instead of
     compiling the branch out and asserting it is never requested.  */
  if (TARGET_INLINE_SQRT == INL_MIN_LAT)
    insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
  else
    insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
  emit_insn (insn);
  DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtsf2_internal_thr"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_reg_or_fp01_operand" "fG")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT == INL_MAX_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(unspec:XF [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; d = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 5)
(minus:XF (match_dup 3)
(mult:XF (match_dup 7) (match_dup 4))))
(use (const_int 1))]))
;; Step 5
;; d' = d + 1/2 * d in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 5))
(match_dup 5)))
(use (const_int 1))]))
;; Step 6
;; e = d + d * d' in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 3))
(match_dup 5)))
(use (const_int 1))]))
;; Step 7
;; S1 = S0 + e * S0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 7))))
(use (const_int 1))]))
;; Step 8
;; H1 = H0 + e * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; d1 = a - S1 * S1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(minus:XF (match_dup 8)
(mult:XF (match_dup 7) (match_dup 7))))
(use (const_int 1))]))
;; Step 10
;; S = S1 + d1 * H1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
;; :: 64-bit floating point arithmetic
......@@ -3546,144 +3420,12 @@
if (TARGET_INLINE_SQRT == INL_MIN_LAT)
insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
else
#else
gcc_assert (TARGET_INLINE_SQRT != INL_MIN_LAT);
#endif
insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtdf2_internal_thr"
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
(sqrt:DF (match_operand:DF 1 "fr_reg_or_fp01_operand" "fG")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT == INL_MAX_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f10
(set (match_dup 5)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(unspec:XF [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; G0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; r0 = 1/2 - G0 * H0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(minus:XF (match_dup 5)
(mult:XF (match_dup 7) (match_dup 3))))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + r0 * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 6
;; G1 = G0 + r0 * G0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 7
;; r1 = 1/2 - G1 * H1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(minus:XF (match_dup 5)
(mult:XF (match_dup 7) (match_dup 3))))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + r1 * H1 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 9
;; G2 = G1 + r1 * G1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 10
;; d2 = a - G2 * G2 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(minus:XF (match_dup 8)
(mult:XF (match_dup 7) (match_dup 7))))
(use (const_int 1))]))
;; Step 11
;; G3 = G2 + d2 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7)))
(use (const_int 1))]))
;; Step 12
;; d3 = a - G3 * G3 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(minus:XF (match_dup 8)
(mult:XF (match_dup 7) (match_dup 7))))
(use (const_int 1))]))
;; Step 13
;; S = G3 + d3 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:DF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
;; :: 80-bit floating point arithmetic
......@@ -4056,163 +3798,11 @@
"TARGET_INLINE_SQRT"
{
rtx insn;
#if 0
if (TARGET_INLINE_SQRT == INL_MIN_LAT)
insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
else
#else
gcc_assert (TARGET_INLINE_SQRT != INL_MIN_LAT);
#endif
insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
insn = gen_sqrtxf2_internal (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtxf2_internal_thr"
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
(sqrt:XF (match_operand:XF 1 "fr_reg_or_fp01_operand" "fG")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register f11 in optimization guide
(clobber (match_scratch:XF 6 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 7 "=c"))]
"TARGET_INLINE_SQRT == INL_MAX_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8. The Intel manual mistakenly specifies f10.
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 8)
(div:XF (const_int 1)
(unspec:XF [(match_dup 9)]
UNSPEC_FR_SQRT_RECIP_APPROX_RES)))
(set (match_dup 7)
(unspec:BI [(match_dup 9)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 9) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 4
;; d0 = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(minus:XF (match_dup 3)
(mult:XF (match_dup 8) (match_dup 4))))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + d0 * H0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 6
;; S1 = S0 + d0 * S0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 7
;; d1 = 1/2 - S1 * H1 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(minus:XF (match_dup 3)
(mult:XF (match_dup 8) (match_dup 4))))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + d1 * H1 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; S2 = S1 + d1 * S1 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 10
;; d2 = 1/2 - S2 * H2 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(minus:XF (match_dup 3)
(mult:XF (match_dup 8) (match_dup 4))))
(use (const_int 1))]))
;; Step 11
;; e2 = a - S2 * S2 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(minus:XF (match_dup 9)
(mult:XF (match_dup 8) (match_dup 8))))
(use (const_int 1))]))
;; Step 12
;; S3 = S2 + e2 * H2 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 1))]))
;; Step 13
;; H3 = H2 + d2 * H2 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 14
;; e3 = a - S3 * S3 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(minus:XF (match_dup 9)
(mult:XF (match_dup 8) (match_dup 8))))
(use (const_int 1))]))
;; Step 15
;; S = S3 + e3 * H3 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 0)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[10] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment