PR target/44218, improve -mrecip on powerpc

From-SVN: r160199

PR target/44218, improve -mrecip on powerpc
From-SVN: r160199
92902797 · Michael Meissner · Michael Meissner · 6c07d08b · 92902797 · 92902797
Commit 92902797 authored Jun 03, 2010 by Michael Meissner Committed by Michael Meissner Jun 03, 2010
25 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2010-06-02  Michael Meissner  <meissner@linux.vnet.ibm.com>
+	PR target/44218
+	* doc/invoke.texi (RS/6000 and PowerPC Options): Delete obsolete
+	-mswdiv option.  Add -mrecip, -mrecip=<xxx>, -mrecip-precision
+	options.
+	* doc/extend.texi (powerpc builtins): Document vec_recip,
+	vec_rsqrt, vec_rsqrte altivec/vsx builtins.
+	* config/rs6000/rs6000-protos.h (rs6000_emit_swdiv): New
+	function.
+	(rs6000_emit_swrsqrt): Ditto.
+	(rs6000_emit_swdivsf): Delete.
+	(rs6000_emit_swdivdf): Ditto.
+	(rs6000_emit_swrsqrtsf): Ditto.
+	* config/rs6000/rs6000.c (rs6000_recip_bits): New global to
+	describe the reciprocal estimate support for each type.
+	(recip_options): Map -mrecip=<opt> into option bits.
+	(gen_2arg_fn_t): New typedef for binary rtx gen function.
+	(rs6000_debug_reg_global): If -mdebug=reg, print the state of the
+	reciprocal estimate instructions.
+	(rs6000_init_hard_regno_mode_ok): Key ws constraint off of the
+	debug -mvsx-scalar-memory switch instead of -mvsx-scalar-double.
+	Set up rs6000_recip_bits based on the -mrecip* options.  Print the
+	cost information if -mdebug=cost or -mdebug=reg.
+	(rs6000_override_options): Set -mrecip-precision for power6, and
+	power7 machines.  If -mvsx or -mdfp, enable various options that
+	came in previous instruction set ISAs, unless the option was
+	explicitly disabled by the command line option.  Parse
+	-mrecip=<opt> options.
+	(rs6000_builtin_vectorized_function): Add support for vectorizing
+	the reciprocal estimate builtins and expansions.
+	(rs6000_handle_option): Add -mrecip, -mrecip=<opt> support.
+	(bdesc_2arg): Add reciprocal estimate builtins.
+	(bdesc_1arg): Add reciprocal square root estimate builtins.
+	(rs6000_expand_builtin): Rewrite to use a switch statement,
+	instead of multiple if/then/elses.  Add reciprocal estimate
+	builtins.
+	(rs6000_init_builtins): Create declarations for reciprocal
+	estimate builtins.
+	(rs6000_preferred_reload_class): Simplify VSX preferences, if scalar
+	sized, prefer traditional floating point registers, if integer
+	vector types, prefer altivec registers.  Don't actually look at
+	the memory address any more.
+	(rs6000_builtin_reciprocal): Add new builtin reciprocal estimate
+	builtins.
+	(rs6000_load_constant_and_splat): New helper function to load up
+	the constant for reciprocal estimate instructions.
+	(rs6000_emit_madd): New helper function for generating
+	multiply/add type instructions, based on the current switches.
+	(rs6000_emit_msub): Ditto.
+	(rs6000_emit_mnsub): Ditto.
+	(rs6000_emit_swdiv_high_precision): Replace rs6000_emit_swdivsf to
+	replace a divide with a reciprocal estimate and fixup, adding
+	support for machines with high precision and vectors.
+	(rs6000_emit_swdiv_low_precision): Rewrite rs6000_emit_swdivdf for
+	low precision machines.
+	(rs6000_emit_swdiv): New common function to be called to replace a
+	division with reciprocal estimate and fixup.
+	(rs6000_emit_swrsqrt): Replace rs6000_emit_swrsqrtsf.  Add support
+	for double and vector types.  Add support for high precision
+	machines.
+	* config/rs6000/rs6000.h (TARGET_FRES): New macro to say whether
+	the reciprocal estimate instructions can be generated.
+	(TARGET_FRE): Ditto.
+	(TARGET_FRSQRTES): Ditto.
+	(TARGET_FRSQRTE): Ditto.
+	(RS6000_RECIP_*): New macros for reciprocal estimate support.
+	* config/rs6000/vector.md (rsqrte<mode>2): New insn for reciprocal
+	square root estimate on vectors.
+	(re<mode>2): New insn for reciprocal division estimate on vectors.
+	* config/rs6000/rs6000-builtin.def (ALTIVEC_BUILTIN_VRSQRTFP):
+	New builtin.
+	(ALTIVEC_BUILTIN_VRECIPFP): Ditto.
+	(ALTIVEC_BUILTIN_VEC_RECIP): Ditto.
+	(ALTIVEC_BUILTIN_VEC_RSQRT): Ditto.
+	(VSX_BUILTIN_VEC_RSQRT_V4SF): Ditto.
+	(VSX_BUILTIN_VEC_RSQRT_V2DF): Ditto.
+	(RS6000_BUILTIN_RSQRT): Ditto.
+	(ALTIVEC_BUILTIN_VEC_RSQRTE): Denote that the builtin is a
+	floating point builtin.
+	* config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
+	macros __RECIP__, __RECIPF__, __RSQRTE__, __RSQRTEF__,
+	__RECIP_PRECISION__ based on the command line switches.
+	(altivec_overloaded_builtins): Add reciprocal estimate builtins.
+	* config/rs6000/rs6000.opt (-mrecip): Document the added support
+	for replacing division instructions with reciprocal estimate and
+	fixup.
+	(-mrecip=<opt>): New option.
+	(-mrecip-precision): Ditto.
+	* config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete.
+	(vsx_rsqrte<mode>2): Use UNSPEC_RSQRT not UNSPEC_VSX_RSQRTE.
+	(vsx_copysignsf3): If -mvsx, use double precision cpsign on single
+	precision scalar.
+	* config/rs6000/altivec.md (UNSPEC_RSQRTEFP): Delete.
+	(UNSPEC_VREFP): Ditto.
+	(altivec_vnmsubfp*): Make altivec nmsub mirror the scalar and VSX
+	counterparts with regard to support of -mno-fused-madd and
+	-ffast-math.
+	(altivec_vrsqrtefp): Use common UNSPEC to allow scalar/vector
+	reciprocal estimate instructions to be generated.
+	(altivec_vrefp): Ditto.
+	* config/rs6000/rs6000.md (RECIPF): New iterator for reciprocal
+	estimate support.
+	(rreg): New mode attribute for reciprocal estimate support.
+	(recip<mode>3): New insn for division using reciprocal estimate
+	and fixup builtins.
+	(divide define_split): New define_split to convert floating point
+	division to use reciprocal estimate if the user used the
+	appropriate options and the split is run when we can add new
+	pseudo registers for the fixup.
+	(rsqrt<mode>2): New insn for reciprocal square root support.
+	(recipsf3): Move into recip<mode>3.
+	(recipdf3): Ditto.
+	(fres): Use TARGET_FRES.
+	(rsqrtsf2): Move into rsqrt<mode>2.
+	(rsqrtsf_internal1): Use TARGET_FRSQRTES.
+	(copysignsf3): Add support for VSX.
+	(fred): Use TARGET_FRE.
+	(fred_fpr): Ditto.
+	(rsqrtdf_internal1): New function for frsqrte instruction.
+	* config/rs6000/altivec.h (vec_recipdiv): Define new vector
+	builtin.
+	(vec_rsqrt): Ditto.
 2010-06-03  Richard Guenther  <rguenther@suse.de>
 	PR middle-end/44291

--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -163,6 +163,8 @@
 #define vec_vpkshus __builtin_vec_vpkshus
 #define vec_re __builtin_vec_re
 #define vec_round __builtin_vec_round
+#define vec_recipdiv __builtin_vec_recipdiv
+#define vec_rsqrt __builtin_vec_rsqrt
 #define vec_rsqrte __builtin_vec_rsqrte
 #define vec_vsubfp __builtin_vec_vsubfp
 #define vec_subc __builtin_vec_subc

--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -75,9 +75,7 @@
   (UNSPEC_VCTSXS       154)
   (UNSPEC_VLOGEFP      155)
   (UNSPEC_VEXPTEFP     156)
-   (UNSPEC_VRSQRTEFP    157)
+   ;; 157-162 deleted
-   (UNSPEC_VREFP        158)
-   ;; 159-162 deleted
   (UNSPEC_VLSDOI       163)
   (UNSPEC_VUPKHSB      167)
   (UNSPEC_VUPKHPX      168)
@@ -141,10 +139,11 @@
   (UNSPEC_VPERMHI	321)
   (UNSPEC_INTERHI      322)
   (UNSPEC_INTERLO      323)
-   (UNSPEC_VUPKHS_V4SF   324)
+   (UNSPEC_VUPKHS_V4SF  324)
-   (UNSPEC_VUPKLS_V4SF   325)
+   (UNSPEC_VUPKLS_V4SF  325)
-   (UNSPEC_VUPKHU_V4SF   326)
+   (UNSPEC_VUPKHU_V4SF  326)
-   (UNSPEC_VUPKLU_V4SF   327)
+   (UNSPEC_VUPKLU_V4SF  327)
+   (UNSPEC_VNMSUBFP	328)
 ])
 (define_constants
@@ -628,11 +627,64 @@
 }")
 ;; Fused multiply subtract 
-(define_insn "altivec_vnmsubfp"
+;; Expander for the AltiVec vnmsubfp builtin.  It emits no RTL of its own;
+;; it only picks which of altivec_vnmsubfp_1, altivec_vnmsubfp_2 or
+;; altivec_vnmsubfp_3 to generate.  Each test below must stay identical to
+;; the condition of the insn it selects, otherwise the emitted insn would
+;; not be recognized.
+(define_expand "altivec_vnmsubfp"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V4SF 1 "register_operand" "")
+   (match_operand:V4SF 2 "register_operand" "")
+   (match_operand:V4SF 3 "register_operand" "")]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
+{
+  if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (SFmode))
+    {
+       /* Fused multiply/add allowed and signed zeros honored: use the
+	  (neg (minus (mult op1 op2) op3)) form.  */
+       emit_insn (gen_altivec_vnmsubfp_1 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+  else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (SFmode))
+    {
+       /* Fused multiply/add allowed, signed zeros not honored: use the
+	  (minus op3 (mult op1 op2)) form.  The test is on SFmode, the
+	  element mode of V4SF, to match the altivec_vnmsubfp_2 condition.  */
+       emit_insn (gen_altivec_vnmsubfp_2 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+  else
+    {
+       /* No fused multiply/add: emit the UNSPEC_VNMSUBFP form.  */
+       emit_insn (gen_altivec_vnmsubfp_3 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+})
+(define_insn "altivec_vnmsubfp_1"
  [(set (match_operand:V4SF 0 "register_operand" "=v")
-	(neg:V4SF (minus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "v")
+	(neg:V4SF
-			       (match_operand:V4SF 2 "register_operand" "v"))
+	 (minus:V4SF
-	  	    (match_operand:V4SF 3 "register_operand" "v"))))]
+	  (mult:V4SF
+	   (match_operand:V4SF 1 "register_operand" "v")
+	   (match_operand:V4SF 2 "register_operand" "v"))
+	  (match_operand:V4SF 3 "register_operand" "v"))))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+   && HONOR_SIGNED_ZEROS (SFmode)"
+  "vnmsubfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
+(define_insn "altivec_vnmsubfp_2"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(minus:V4SF
+	 (match_operand:V4SF 3 "register_operand" "v")
+	 (mult:V4SF
+	  (match_operand:V4SF 1 "register_operand" "v")
+	  (match_operand:V4SF 2 "register_operand" "v"))))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+   && !HONOR_SIGNED_ZEROS (SFmode)"
+  "vnmsubfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
+(define_insn "altivec_vnmsubfp_3"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
+		       (match_operand:V4SF 2 "register_operand" "v")
+		       (match_operand:V4SF 3 "register_operand" "v")]
+		      UNSPEC_VNMSUBFP))]
  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
  "vnmsubfp %0,%1,%2,%3"
  [(set_attr "type" "vecfloat")])
@@ -1444,19 +1496,19 @@
  "vexptefp %0,%1"
  [(set_attr "type" "vecfloat")])
-(define_insn "altivec_vrsqrtefp"
+(define_insn "*altivec_vrsqrtefp"
  [(set (match_operand:V4SF 0 "register_operand" "=v")
        (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
-		     UNSPEC_VRSQRTEFP))]
+		     UNSPEC_RSQRT))]
-  "TARGET_ALTIVEC"
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
  "vrsqrtefp %0,%1"
  [(set_attr "type" "vecfloat")])
 (define_insn "altivec_vrefp"
  [(set (match_operand:V4SF 0 "register_operand" "=v")
        (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
-		     UNSPEC_VREFP))]
+		     UNSPEC_FRES))]
-  "TARGET_ALTIVEC"
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
  "vrefp %0,%1"
  [(set_attr "type" "vecfloat")])

--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -159,6 +159,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VRFIZ,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLB,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLH,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLW,			RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTFP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTEFP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLB,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLH,			RS6000_BTC_CONST)
@@ -269,6 +270,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V8HI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V16QI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V4SF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_COPYSIGN_V4SF,		RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRECIPFP,		RS6000_BTC_FP_PURE)
 /* Altivec overloaded builtins.  */
 /* For now, don't set the classification for overloaded functions.
@@ -351,10 +353,12 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKS,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKSU,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PERM,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RE,			RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RECIP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RINT,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_ROUND,		RS6000_BTC_MISC)
-RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE,		RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRT,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SEL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SLD,			RS6000_BTC_MISC)
@@ -959,6 +963,10 @@ RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DI,		RS6000_BTC_CONST)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V4SF,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V2DF,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V4SF,			RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V2DF,			RS6000_BTC_FP_PURE)
 /* VSX overloaded builtins, add the overloaded functions not present in
   Altivec.  */
@@ -991,4 +999,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD,			RS6000_BTC_CONST)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIP,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIPF,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF,			RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(RS6000_BUILTIN_RSQRT,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI,			RS6000_BTC_CONST)
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -362,6 +362,16 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
      builtin_define ("__builtin_vsx_xvnmsubasp=__builtin_vsx_xvnmsubsp");
      builtin_define ("__builtin_vsx_xvnmsubmsp=__builtin_vsx_xvnmsubsp");
    }
+  if (RS6000_RECIP_HAVE_RE_P (DFmode))
+    builtin_define ("__RECIP__");
+  if (RS6000_RECIP_HAVE_RE_P (SFmode))
+    builtin_define ("__RECIPF__");
+  if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode))
+    builtin_define ("__RSQRTE__");
+  if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode))
+    builtin_define ("__RSQRTEF__");
+  if (TARGET_RECIP_PRECISION)
+    builtin_define ("__RECIP_PRECISION__");
  /* Tell users they can use __builtin_bswap{16,64}.  */
  builtin_define ("__HAVE_BSWAP__");
@@ -479,10 +489,22 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
    RS6000_BTI_void, RS6000_BTI_bool_V16QI, 0, 0 },
  { ALTIVEC_BUILTIN_VEC_RE, ALTIVEC_BUILTIN_VREFP,
    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RE, VSX_BUILTIN_XVREDP,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
  { ALTIVEC_BUILTIN_VEC_ROUND, ALTIVEC_BUILTIN_VRFIN,
    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RECIP, ALTIVEC_BUILTIN_VRECIPFP,
+    RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 },
+  { ALTIVEC_BUILTIN_VEC_RECIP, VSX_BUILTIN_RECIP_V2DF,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRT, ALTIVEC_BUILTIN_VRSQRTFP,
+    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRT, VSX_BUILTIN_VEC_RSQRT_V2DF,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
  { ALTIVEC_BUILTIN_VEC_RSQRTE, ALTIVEC_BUILTIN_VRSQRTEFP,
    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRTE, VSX_BUILTIN_XVRSQRTEDP,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
  { ALTIVEC_BUILTIN_VEC_TRUNC, ALTIVEC_BUILTIN_VRFIZ,
    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
  { ALTIVEC_BUILTIN_VEC_TRUNC, VSX_BUILTIN_XVRDPIZ,

--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -106,9 +106,8 @@ extern void rs6000_split_compare_and_swap (rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_expand_compare_and_swapqhi (rtx, rtx, rtx, rtx);
 extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx);
-extern void rs6000_emit_swdivsf (rtx, rtx, rtx);
+extern void rs6000_emit_swdiv (rtx, rtx, rtx, bool);
-extern void rs6000_emit_swdivdf (rtx, rtx, rtx);
+extern void rs6000_emit_swrsqrt (rtx, rtx);
-extern void rs6000_emit_swrsqrtsf (rtx, rtx);
 extern void output_toc (FILE *, rtx, int, enum machine_mode);
 extern rtx rs6000_longcall_ref (rtx);
 extern void rs6000_fatal_bad_address (rtx);

--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -543,6 +543,46 @@ extern int rs6000_vector_align[];
 /* E500 processors only support plain "sync", not lwsync.  */
 #define TARGET_NO_LWSYNC TARGET_E500
+/* Which machine supports the various reciprocal estimate instructions.  */
+#define TARGET_FRES	(TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT \
+			 && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+#define TARGET_FRE	(TARGET_HARD_FLOAT && TARGET_FPRS \
+			 && TARGET_DOUBLE_FLOAT \
+			 && (TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)))
+#define TARGET_FRSQRTES	(TARGET_HARD_FLOAT && TARGET_POPCNTB \
+			 && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+#define TARGET_FRSQRTE	(TARGET_HARD_FLOAT && TARGET_FPRS \
+			 && TARGET_DOUBLE_FLOAT \
+			 && (TARGET_PPC_GFXOPT || VECTOR_UNIT_VSX_P (DFmode)))
+/* Whether the various reciprocal divide/square root estimate instructions
+   exist, and whether we should automatically generate code for the instruction
+   by default.  */
+#define RS6000_RECIP_MASK_HAVE_RE	0x1	/* have RE instruction.  */
+#define RS6000_RECIP_MASK_AUTO_RE	0x2	/* generate RE by default.  */
+#define RS6000_RECIP_MASK_HAVE_RSQRTE	0x4	/* have RSQRTE instruction.  */
+#define RS6000_RECIP_MASK_AUTO_RSQRTE	0x8	/* gen. RSQRTE by default.  */
+extern unsigned char rs6000_recip_bits[];
+#define RS6000_RECIP_HAVE_RE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RE)
+#define RS6000_RECIP_AUTO_RE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RE)
+#define RS6000_RECIP_HAVE_RSQRTE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RSQRTE)
+#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE)
+/* Nonzero unconditionally for SFmode and V4SFmode, and for any other MODE
+   only when TARGET_RECIP_PRECISION (the -mrecip-precision mask declared in
+   rs6000.opt) is set.  */
+#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \
+  ((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION)
 /* Sometimes certain combinations of command options do not make sense
   on a particular target machine.  You can define a macro
   `OVERRIDE_OPTIONS' to take account of this.  This macro, if

--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -220,6 +220,9 @@
 ; These modes do not fit in integer registers in 32-bit mode.
 (define_mode_iterator DIFD [DI DF DD])
+;; Iterator for reciprocal estimate instructions
+(define_mode_iterator RECIPF [SF DF V4SF V2DF])
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")])
@@ -240,6 +243,11 @@
 (define_mode_attr mptrsize [(SI "si")
 			    (DI "di")])
+(define_mode_attr rreg [(SF   "f")
+			(DF   "Ws")
+			(V4SF "Wf")
+			(V2DF "Wd")])
 ;; Start with fixed-point load and store insns.  Here we put only the more
 ;; complex forms.  Basic data transfer is done later.
@@ -5563,6 +5571,45 @@
  [(set_attr "type" "var_delayed_compare,delayed_compare,var_delayed_compare,delayed_compare")
   (set_attr "length" "4,4,8,8")])
+;; Builtins to replace a division to generate FRE reciprocal estimate
+;; instructions and the necessary fixup instructions
+(define_expand "recip<mode>3"
+  [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+   (match_operand:RECIPF 1 "gpc_reg_operand" "")
+   (match_operand:RECIPF 2 "gpc_reg_operand" "")]
+  "RS6000_RECIP_HAVE_RE_P (<MODE>mode)"
+{
+   rs6000_emit_swdiv (operands[0], operands[1], operands[2], false);
+   DONE;
+})
+;; Split to create division from FRE/FRES/etc. and fixup instead of the normal
+;; hardware division.  This is only done before register allocation and with
+;; -ffast-math.  This must appear before the divsf3/divdf3 insns.
+(define_split
+  [(set (match_operand:RECIPF 0 "gpc_reg_operand" "")
+	(div:RECIPF (match_operand 1 "gpc_reg_operand" "")
+		    (match_operand 2 "gpc_reg_operand" "")))]
+  "RS6000_RECIP_AUTO_RE_P (<MODE>mode)
+   && can_create_pseudo_p () && optimize_insn_for_speed_p ()
+   && flag_finite_math_only && !flag_trapping_math && flag_reciprocal_math"
+  [(const_int 0)]
+{
+  rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
+  DONE;
+})
+;; Builtins to replace 1/sqrt(x) with instructions using RSQRTE and the
+;; appropriate fixup.
+(define_expand "rsqrt<mode>2"
+  [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+   (match_operand:RECIPF 1 "gpc_reg_operand" "")]
+  "RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)"
+{
+  /* The enabling condition uses RS6000_RECIP_HAVE_RSQRTE_P, the predicate
+     macro that rs6000.h defines for the RS6000_RECIP_MASK_HAVE_RSQRTE bit.
+     All of the RTL for this pattern is produced by rs6000_emit_swrsqrt.  */
+  rs6000_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
 (define_split
  [(set (match_operand:CC 3 "cc_reg_not_micro_cr0_operand" "")
 	(compare:CC (ashiftrt:SI (match_operand:SI 1 "gpc_reg_operand" "")
@@ -5766,22 +5813,10 @@
  "{fd|fdiv} %0,%1,%2"
  [(set_attr "type" "ddiv")])
-(define_expand "recipsf3"
-  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
-	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
-		    (match_operand:SF 2 "gpc_reg_operand" "f")]
-		   UNSPEC_FRES))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
-   && flag_finite_math_only && !flag_trapping_math"
-{
-   rs6000_emit_swdivsf (operands[0], operands[1], operands[2]);
-   DONE;
-})
 (define_insn "fres"
  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
-  "TARGET_PPC_GFXOPT && flag_finite_math_only"
+  "TARGET_FRES"
  "fres %0,%1"
  [(set_attr "type" "fp")])
@@ -5931,23 +5966,12 @@
  "fsqrt %0,%1"
  [(set_attr "type" "dsqrt")])
-(define_expand "rsqrtsf2"
+(define_insn "*rsqrtsf_internal1"
  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
 		   UNSPEC_RSQRT))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
+  "TARGET_FRSQRTES"
-   && flag_finite_math_only && !flag_trapping_math"
+  "frsqrtes %0,%1"
-{
-  rs6000_emit_swrsqrtsf (operands[0], operands[1]);
-  DONE;
-})
-(define_insn "*rsqrt_internal1"
-  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
-	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
-		   UNSPEC_RSQRT))]
-  "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT"
-  "frsqrte %0,%1"
  [(set_attr "type" "fp")])
 (define_expand "copysignsf3"
@@ -5960,9 +5984,18 @@
 	                     (match_dup 5))
 			 (match_dup 3)
 			 (match_dup 4)))]
-  "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
+  "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
-   && !HONOR_NANS (SFmode) && !HONOR_SIGNED_ZEROS (SFmode)"
+   && ((TARGET_PPC_GFXOPT
+        && !HONOR_NANS (SFmode)
+        && !HONOR_SIGNED_ZEROS (SFmode))
+       || VECTOR_UNIT_VSX_P (DFmode))"
  {
+     if (VECTOR_UNIT_VSX_P (DFmode))
+       {
+	 emit_insn (gen_vsx_copysignsf3 (operands[0], operands[1], operands[2],
+					 CONST0_RTX (SFmode)));
+	 DONE;
+       }
     operands[3] = gen_reg_rtx (SFmode);
     operands[4] = gen_reg_rtx (SFmode);
     operands[5] = CONST0_RTX (SFmode);
@@ -6222,31 +6255,21 @@
  "{fd|fdiv} %0,%1,%2"
  [(set_attr "type" "ddiv")])
-(define_expand "recipdf3"
-  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
-	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")
-		    (match_operand:DF 2 "gpc_reg_operand" "d")]
-		   UNSPEC_FRES))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_POPCNTB && !optimize_size
-   && flag_finite_math_only && !flag_trapping_math"
-{
-   rs6000_emit_swdivdf (operands[0], operands[1], operands[2]);
-   DONE;
-})
-(define_expand "fred"
-  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
-	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")] UNSPEC_FRES))]
-  "(TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)) && flag_finite_math_only"
-  "")
 (define_insn "*fred_fpr"
  [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
 	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
-  "TARGET_POPCNTB && flag_finite_math_only && !VECTOR_UNIT_VSX_P (DFmode)"
+  "TARGET_FRE && !VECTOR_UNIT_VSX_P (DFmode)"
  "fre %0,%1"
  [(set_attr "type" "fp")])
+(define_insn "*rsqrtdf_internal1"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
+	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")]
+		   UNSPEC_RSQRT))]
+  "TARGET_FRSQRTE && !VECTOR_UNIT_VSX_P (DFmode)"
+  "frsqrte %0,%1"
+  [(set_attr "type" "fp")])
 (define_insn "*fmadddf4_fpr"
  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
 	(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")

--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -195,8 +195,16 @@ Target Report Var(TARGET_XL_COMPAT)
 Conform more closely to IBM XLC semantics
 mrecip
-Target Report Var(TARGET_RECIP)
+Target Report
-Generate software reciprocal sqrt for better throughput
+Generate software reciprocal divide and square root for better throughput.
+mrecip=
+Target Report RejectNegative Joined
+Generate software reciprocal divide and square root for better throughput.
+mrecip-precision
+Target Report Mask(RECIP_PRECISION)
+Assume that the reciprocal estimate instructions provide more accuracy.
 mno-fp-in-toc
 Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC)

--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -267,6 +267,20 @@
  "VECTOR_UNIT_VSX_P (<MODE>mode)"
  "")
+(define_expand "rsqrte<mode>2"
+  [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+        (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "")]
+		      UNSPEC_RSQRT))]
+  "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+  "")
+;; Vector reciprocal estimate.  Uses UNSPEC_FRES, the same unspec code as the
+;; scalar fres pattern and altivec_vrefp.  Operand constraints are left empty,
+;; as in rsqrte<mode>2, since this is an expander.
+(define_expand "re<mode>2"
+  [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+	(unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "")]
+		      UNSPEC_FRES))]
+  "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+  "")
 (define_expand "ftrunc<mode>2"
  [(set (match_operand:VEC_F 0 "vfloat_operand" "")
  	(fix:VEC_F (match_operand:VEC_F 1 "vfloat_operand" "")))]

--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -195,7 +195,7 @@
   (UNSPEC_VSX_MSUB		511)
   (UNSPEC_VSX_NMADD		512)
   (UNSPEC_VSX_NMSUB		513)
-   (UNSPEC_VSX_RSQRTE		514)
+   ;; 514 deleted
   (UNSPEC_VSX_TDIV		515)
   (UNSPEC_VSX_TSQRT		516)
   (UNSPEC_VSX_XXPERMDI		517)
@@ -446,10 +446,10 @@
  [(set_attr "type" "<VStype_sqrt>")
   (set_attr "fp_type" "<VSfptype_sqrt>")])
-(define_insn "vsx_rsqrte<mode>2"
+(define_insn "*vsx_rsqrte<mode>2"
  [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa")
 	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")]
-		      UNSPEC_VSX_RSQRTE))]
+		      UNSPEC_RSQRT))]
  "VECTOR_UNIT_VSX_P (<MODE>mode)"
  "x<VSv>rsqrte<VSs> %x0,%x1"
  [(set_attr "type" "<VStype_simple>")
@@ -862,6 +862,20 @@
  [(set_attr "type" "<VStype_simple>")
   (set_attr "fp_type" "<VSfptype_simple>")])
+;; Special version of copysign for single precision that knows internally
+;; scalar single values are kept as double
+(define_insn "vsx_copysignsf3"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+	(if_then_else:SF
+	 (ge:SF (match_operand:SF 2 "vsx_register_operand" "f")
+		(match_operand:SF 3 "zero_constant" "j"))
+	 (abs:SF (match_operand:SF 1 "vsx_register_operand" "f"))
+	 (neg:SF (abs:SF (match_dup 1)))))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "xscpsgndp %x0,%x2,%x1"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_addsub_d")])
 ;; For the conversions, limit the register class for the integer value to be
 ;; the fprs because we don't want to add the altivec registers to movdi/movsi.
 ;; For the unsigned tests, there isn't a generic double -> unsigned conversion

--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10994,6 +10994,10 @@ vector unsigned char vec_vrlb (vector unsigned char,
 vector float vec_round (vector float);
+vector float vec_recipdiv (vector float, vector float);
+vector float vec_rsqrt (vector float);
 vector float vec_rsqrte (vector float);
 vector float vec_sel (vector float, vector float, vector bool int);
@@ -11922,8 +11926,10 @@ vector double vec_or (vector bool long, vector double);
 vector double vec_perm (vector double,
                        vector double,
                        vector unsigned char);
-vector float vec_rint (vector float);
 vector double vec_rint (vector double);
+vector double vec_recipdiv (vector double, vector double);
+vector double vec_rsqrt (vector double);
+vector double vec_rsqrte (vector double);
 vector double vec_sel (vector double, vector double, vector bool long);
 vector double vec_sel (vector double, vector double, vector unsigned long);
 vector double vec_sub (vector double, vector double);
@@ -11964,10 +11970,20 @@ GCC provides a few other builtins on Powerpc to access certain instructions:
 float __builtin_recipdivf (float, float);
 float __builtin_rsqrtf (float);
 double __builtin_recipdiv (double, double);
+double __builtin_rsqrt (double);
 long __builtin_bpermd (long, long);
 int __builtin_bswap16 (int);
 @end smallexample
+The @code{vec_rsqrt}, @code{__builtin_rsqrt}, and
+@code{__builtin_rsqrtf} functions generate multiple instructions to
+implement the reciprocal sqrt functionality using reciprocal sqrt
+estimate instructions.
+The @code{__builtin_recipdiv} and @code{__builtin_recipdivf}
+functions generate multiple instructions to implement division using
+the reciprocal estimate instructions.
 @node RX Built-in Functions
 @subsection RX Built-in Functions
 GCC supports some of the RX instructions which cannot be expressed in

--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -783,7 +783,8 @@ See RS/6000 and PowerPC Options.
 -mfloat-gprs=yes  -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol
 -mprototype  -mno-prototype @gol
 -msim  -mmvme  -mads  -myellowknife  -memb  -msdata @gol
-msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread}
+-msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread @gol
+-mrecip -mrecip=@var{opt} -mno-recip -mrecip-precision -mno-recip-precision}
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu@gol
@@ -14975,17 +14976,6 @@ values for @var{cpu_type} are used for @option{-mtune} as for
 architecture, registers, and mnemonics set by @option{-mcpu}, but the
 scheduling parameters set by @option{-mtune}.
-@item -mswdiv
-@itemx -mno-swdiv
-@opindex mswdiv
-@opindex mno-swdiv
-Generate code to compute division as reciprocal estimate and iterative
-refinement, creating opportunities for increased throughput.  This
-feature requires: optional PowerPC Graphics instruction set for single
-precision and FRE instruction for double precision, assuming divides
-cannot generate user-visible traps, and the domain values not include
-Infinities, denormals or zero denominator.
 @item -maltivec
 @itemx -mno-altivec
 @opindex maltivec
@@ -15641,6 +15631,52 @@ sequence.
 Adds support for multithreading with the @dfn{pthreads} library.
 This option sets flags for both the preprocessor and linker.
+@item -mrecip
+@itemx -mno-recip
+@opindex mrecip
+This option will enable GCC to use the reciprocal estimate and
+reciprocal square root estimate instructions with additional
+Newton-Raphson steps to increase precision instead of doing a divide or
+square root and divide for floating point arguments.  You should use
+the @option{-ffast-math} option when using @option{-mrecip} (or at
+least @option{-funsafe-math-optimizations},
+@option{-finite-math-only}, @option{-freciprocal-math} and
+@option{-fno-trapping-math}).  Note that while the throughput of the
+sequence is generally higher than the throughput of the non-reciprocal
+instruction, the precision of the sequence can be decreased by up to 2
+ulp (i.e.@: the inverse of 1.0 equals 0.99999994) for reciprocal square
+roots.
+@item -mrecip=@var{opt}
+@opindex mrecip=opt
+This option controls which reciprocal estimate instructions
+may be used.  @var{opt} is a comma-separated list of options, which may
+be preceded by a @code{!} to invert the option:
+@code{all}: enable all estimate instructions;
+@code{default}: enable the default instructions, equivalent to @option{-mrecip};
+@code{none}: disable all estimate instructions, equivalent to @option{-mno-recip};
+@code{div}: enable the reciprocal approximation instructions for both single and double precision;
+@code{divf}: enable the single precision reciprocal approximation instructions;
+@code{divd}: enable the double precision reciprocal approximation instructions;
+@code{rsqrt}: enable the reciprocal square root approximation instructions for both single and double precision;
+@code{rsqrtf}: enable the single precision reciprocal square root approximation instructions;
+@code{rsqrtd}: enable the double precision reciprocal square root approximation instructions.
+So for example, @option{-mrecip=all,!rsqrtd} would enable
+all of the reciprocal estimate instructions, except for the
+@code{FRSQRTE}, @code{XSRSQRTEDP}, and @code{XVRSQRTEDP} instructions
+which handle the double precision reciprocal square root calculations.
+@item -mrecip-precision
+@itemx -mno-recip-precision
+@opindex mrecip-precision
+Assume (do not assume) that the reciprocal estimate instructions
+provide higher precision estimates than is mandated by the PowerPC
+ABI.  Selecting @option{-mcpu=power6} or @option{-mcpu=power7}
+automatically selects @option{-mrecip-precision}.  The double
+precision square root estimate instructions are not generated by
+default on low precision machines, since they do not provide an
+estimate that converges after three steps.
 @end table
 @node RX Options

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2010-06-02  Michael Meissner  <meissner@linux.vnet.ibm.com>
+	PR target/44218
+	* gcc.target/powerpc/recip-1.c: New test for -mrecip support.
+	* gcc.target/powerpc/recip-2.c: Ditto.
+	* gcc.target/powerpc/recip-3.c: Ditto.
+	* gcc.target/powerpc/recip-4.c: Ditto.
+	* gcc.target/powerpc/recip-5.c: Ditto.
+	* gcc.target/powerpc/recip-6.c: Ditto.
+	* gcc.target/powerpc/recip-7.c: Ditto.
+	* gcc.target/powerpc/recip-test.h: Ditto.
+	* gcc.target/powerpc/recip-test2.h: Ditto.
 2010-06-02  H.J. Lu  <hongjiu.lu@intel.com>
 	* g++.dg/torture/pr44295.C (size_t): Use __SIZE_TYPE__.

--- a/gcc/testsuite/gcc.target/powerpc/recip-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-1.c
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */
+/* { dg-final { scan-assembler-times "frsqrte" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub" 2 } } */
+/* { dg-final { scan-assembler-times "fmul" 8 } } */
+/* { dg-final { scan-assembler-times "fnmsub" 4 } } */
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
--- a/gcc/testsuite/gcc.target/powerpc/recip-2.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-2.c
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power5" } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 6 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 3 } } */
+/* { dg-final { scan-assembler-times "fsqrt" 1 } } */
+/* power5 frsqrte is not accurate enough, and should not be generated by
+   default for -mrecip.  */
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
--- a/gcc/testsuite/gcc.target/powerpc/recip-3.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-3.c
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power7" } */
+/* { dg-final { scan-assembler-times "xsrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xsnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 4 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
--- a/gcc/testsuite/gcc.target/powerpc/recip-4.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-4.c
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */
+#define SIZE 1024
+extern double a_d[SIZE] __attribute__((__aligned__(32)));
+extern double b_d[SIZE] __attribute__((__aligned__(32)));
+void
+vectorize_rsqrt_d (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    a_d[i] = 1.0 / __builtin_sqrt (b_d[i]);
+}
+extern float a_f[SIZE] __attribute__((__aligned__(32)));
+extern float b_f[SIZE] __attribute__((__aligned__(32)));
+void
+vectorize_rsqrt_f (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    a_f[i] = 1.0f / __builtin_sqrtf (b_f[i]);
+}
--- a/gcc/testsuite/gcc.target/powerpc/recip-5.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-5.c
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip=all -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvredp" 4 } } */
+/* { dg-final { scan-assembler-times "xvresp" 5 } } */
+/* { dg-final { scan-assembler-times "xsredp" 2 } } */
+/* { dg-final { scan-assembler-times "fres" 2 } } */
+#include <altivec.h>
+float f_recip (float a, float b) { return __builtin_recipdivf (a, b); }
+double d_recip (double a, double b) { return __builtin_recipdiv (a, b); }
+float f_div (float a, float b) { return a / b; }
+double d_div (double a, double b) { return a / b; }
+#define SIZE 1024
+double d_a[SIZE] __attribute__((__aligned__(32)));
+double d_b[SIZE] __attribute__((__aligned__(32)));
+double d_c[SIZE] __attribute__((__aligned__(32)));
+float f_a[SIZE] __attribute__((__aligned__(32)));
+float f_b[SIZE] __attribute__((__aligned__(32)));
+float f_c[SIZE] __attribute__((__aligned__(32)));
+void vec_f_recip (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = __builtin_recipdivf (f_b[i], f_c[i]);
+}
+void vec_d_recip (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = __builtin_recipdiv (d_b[i], d_c[i]);
+}
+void vec_f_div (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / f_c[i];
+}
+void vec_f_div2 (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / 2.0f;
+}
+void vec_f_div53 (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / 53.0f;
+}
+void vec_d_div (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / d_c[i];
+}
+void vec_d_div2 (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / 2.0;
+}
+void vec_d_div53 (void)
+{
+  int i;
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / 53.0;
+}
+vector float v4sf_recip1 (vector float a, vector float b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip2 (vector float a, vector float b) { return __builtin_altivec_vrecipdivfp (a, b); }
+vector double v2df_recip1 (vector double a, vector double b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip3 (vector float a, vector float b) { return __builtin_vsx_xvrecipdivsp (a, b); }
+vector double v2df_recip2 (vector double a, vector double b) { return __builtin_vsx_xvrecipdivdp (a, b); }
--- a/gcc/testsuite/gcc.target/powerpc/recip-6.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-6.c
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-mcpu=power7 -O3 -ftree-vectorize -ffast-math -mrecip=all -mrecip-precision" } */
+/* Check reciprocal estimate functions for accuracy.  */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include "recip-test.h"
--- a/gcc/testsuite/gcc.target/powerpc/recip-7.c
+++ b/gcc/testsuite/gcc.target/powerpc/recip-7.c
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target ppc_recip_hw } */
+/* { dg-options "-O3 -ftree-vectorize -ffast-math -mrecip -mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb" } */
+/* Check reciprocal estimate functions for accuracy.  */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+#include "recip-test.h"
--- a/gcc/testsuite/gcc.target/powerpc/recip-test.h
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test.h
+/* Check reciprocal estimate functions for accuracy.  */
+#ifdef _ARCH_PPC64
+typedef unsigned long uns64_t;
+#define UNUM64(x) x ## L
+#else
+typedef unsigned long long uns64_t;
+#define UNUM64(x) x ## LL
+#endif
+typedef unsigned int uns32_t;
+#define TNAME2(x) #x
+#define TNAME(x) TNAME2(x)
+/*
+ * Float functions.
+ */
+#define TYPE float
+#define NAME(PREFIX) PREFIX ## _float
+#define UNS_TYPE uns32_t
+#define UNS_ABS __builtin_abs
+#define EXP_SIZE 8
+#define MAN_SIZE 23
+#define FABS __builtin_fabsf
+#define FMAX __builtin_fmaxf
+#define FMIN __builtin_fminf
+#define SQRT __builtin_sqrtf
+#define RMIN 1.0e-10
+#define RMAX 1.0e+10
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdivs"
+#define ASMSQRT "fsqrts"
+#define INIT_DIV							\
+{									\
+  { 0x4fffffff },	/* 8589934080 */				\
+  { 0x4effffff },	/* 2147483520 */				\
+  { 0x40ffffff },	/* 7.99999952316284 */				\
+  { 0x3fffffff },	/* 1.99999988079071 */				\
+  { 0x417fffff },	/* 15.9999990463257 */				\
+  { 0x42ffffff },	/* 127.999992370605 */				\
+  { 0x3dffffff },	/* 0.124999992549419 */				\
+  { 0x3effffff },	/* 0.499999970197678 */				\
+}
+#define INIT_RSQRT							\
+{									\
+  { 0x457ffffe },	/* 4096 - small amount */			\
+  { 0x4c7fffff },	/* 6.71089e+07 */				\
+  { 0x3d7fffff },	/* 0.0625 - small amount */			\
+  { 0x307ffffe },	/* 9.31322e-10 */				\
+  { 0x4c7ffffe },	/* 6.71089e+07 */				\
+  { 0x397ffffe },	/* 0.000244141 */				\
+  { 0x2e7fffff },	/* 5.82077e-11 */				\
+  { 0x2f7fffff },	/* 2.32831e-10 */				\
+}
+#include "recip-test2.h"
+/*
+ * Double functions.
+ */
+#undef TYPE
+#undef NAME
+#undef UNS_TYPE
+#undef UNS_ABS
+#undef EXP_SIZE
+#undef MAN_SIZE
+#undef FABS
+#undef FMAX
+#undef FMIN
+#undef SQRT
+#undef RMIN
+#undef RMAX
+#undef BDIV
+#undef BRSQRT
+#undef ASMDIV
+#undef ASMSQRT
+#undef INIT_DIV
+#undef INIT_RSQRT
+#define TYPE double
+#define NAME(PREFIX) PREFIX ## _double
+#define UNS_TYPE uns64_t
+#define UNS_ABS __builtin_imaxabs
+#define EXP_SIZE 11
+#define MAN_SIZE 52
+#define FABS __builtin_fabs
+#define FMAX __builtin_fmax
+#define FMIN __builtin_fmin
+#define SQRT __builtin_sqrt
+#define RMIN 1.0e-100
+#define RMAX 1.0e+100
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdiv"
+#define ASMSQRT "fsqrt"
+#define INIT_DIV							\
+{									\
+  { UNUM64 (0x2b57be53f2a2f3a0) },	/* 6.78462e-100 */		\
+  { UNUM64 (0x2b35f8e8ea553e52) },	/* 1.56963e-100 */		\
+  { UNUM64 (0x2b5b9d861d2fe4fb) },	/* 7.89099e-100 */		\
+  { UNUM64 (0x2b45dc44a084e682) },	/* 3.12327e-100 */		\
+  { UNUM64 (0x2b424ce16945d777) },	/* 2.61463e-100 */		\
+  { UNUM64 (0x2b20b5023d496b50) },	/* 5.96749e-101 */		\
+  { UNUM64 (0x2b61170547f57caa) },	/* 9.76678e-100 */		\
+  { UNUM64 (0x2b543b9d498aac37) },	/* 5.78148e-100 */		\
+}
+#define INIT_RSQRT							\
+{									\
+  { UNUM64 (0x2b616f2d8cbbc646) },	/* 9.96359e-100 */		\
+  { UNUM64 (0x2b5c4db2da0a011d) },	/* 8.08764e-100 */		\
+  { UNUM64 (0x2b55a82d5735b262) },	/* 6.1884e-100 */		\
+  { UNUM64 (0x2b50b52908258cb8) },	/* 4.77416e-100 */		\
+  { UNUM64 (0x2b363989a4fb29af) },	/* 1.58766e-100 */		\
+  { UNUM64 (0x2b508b9f6f4180a9) },	/* 4.7278e-100 */		\
+  { UNUM64 (0x2b4f7a1d48accb40) },	/* 4.49723e-100 */		\
+  { UNUM64 (0x2b1146a37372a81f) },	/* 3.08534e-101 */		\
+  { UNUM64 (0x2b33f876a8c48050) },	/* 1.42663e-100 */		\
+}
+#include "recip-test2.h"
+int
+main (int argc __attribute__((__unused__)),
+      char *argv[] __attribute__((__unused__)))
+{
+  srand48 (1);
+  run_float ();
+#ifdef VERBOSE
+  printf ("\n");
+#endif
+  run_double ();
+  if (error_count_float != 0 || error_count_double != 0)
+    abort ();
+  return 0;
+}
--- a/gcc/testsuite/gcc.target/powerpc/recip-test2.h
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test2.h
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -992,6 +992,30 @@ proc check_vmx_hw_available { } {
    }]
 }
+proc check_ppc_recip_hw_available { } {
+    return [check_cached_effective_target ppc_recip_hw_available {
+	# Some simulators may not support FRE/FRES/FRSQRTE/FRSQRTES
+	# For now, disable on Darwin
+	if { [istarget powerpc-*-eabi] || [istarget powerpc*-*-eabispe] || [istarget *-*-darwin*]} {
+	    expr 0
+	} else {
+	    set options "-mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb"
+	    check_runtime_nocache ppc_recip_hw_available {
+		volatile double d_recip, d_rsqrt, d_four = 4.0;
+		volatile float f_recip, f_rsqrt, f_four = 4.0f;
+		int main()
+		{
+		  asm volatile ("fres %0,%1" : "=f" (f_recip) : "f" (f_four));
+		  asm volatile ("fre %0,%1" : "=d" (d_recip) : "d" (d_four));
+		  asm volatile ("frsqrtes %0,%1" : "=f" (f_rsqrt) : "f" (f_four));
+		  asm volatile ("frsqrte %0,%1" : "=f" (d_rsqrt) : "d" (d_four));
+		  return 0;
+		}
+	    } $options
+	}
+    }]
+}
 # Return 1 if the target supports executing AltiVec and Cell PPU
 # instructions, 0 otherwise.  Cache the result.
@@ -2972,6 +2996,8 @@ proc is-effective-target { arg } {
    } else {
 	switch $arg {
 	  "vmx_hw"         { set selected [check_vmx_hw_available] }
+	  "vsx_hw"         { set selected [check_vsx_hw_available] }
+	  "ppc_recip_hw"   { set selected [check_ppc_recip_hw_available] }
 	  "named_sections" { set selected [check_named_sections_available] }
 	  "gc_sections"    { set selected [check_gc_sections_available] }
 	  "cxa_atexit"     { set selected [check_cxa_atexit_available] }
@@ -2991,6 +3017,8 @@ proc is-effective-target-keyword { arg } {
 	# These have different names for their check_* procs.
 	switch $arg {
 	  "vmx_hw"         { return 1 }
+	  "vsx_hw"         { return 1 }
+	  "ppc_recip_hw"   { return 1 }
 	  "named_sections" { return 1 }
 	  "gc_sections"    { return 1 }
 	  "cxa_atexit"     { return 1 }