PR target/44218, improve -mrecip on powerpc

From-SVN: r160199
author: Michael Meissner <meissner@linux.vnet.ibm.com> 2010-06-03 00:06:12 +0000
committer: Michael Meissner <meissner@gcc.gnu.org> 2010-06-03 00:06:12 +0000
commit: 92902797041a42ac500f7dc9639df8a680e0b691 (patch)
tree: d55e7fa0ae623e1c748075d3f81edeb35fb123fb /gcc
parent: 6c07d08b90b124d8d3be8015726caf799e2e2a13 (diff)
download: gcc-92902797041a42ac500f7dc9639df8a680e0b691.tar.gz
25 files changed, 2112 insertions, 454 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 59008a5ab40..ecfdab1044d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,139 @@
+2010-06-02  Michael Meissner  <meissner@linux.vnet.ibm.com>
+
+	PR target/44218
+	* doc/invoke.texi (RS/6000 and PowerPC Options): Delete obsolete
+	-mswdiv option.  Add -mrecip, -mrecip=<xxx>, -mrecip-precision
+	options.
+
+	* doc/extend.texi (powerpc builtins): Document vec_recip,
+	vec_rsqrt, vec_rsqrte altivec/vsx builtins.
+
+	* config/rs6000/rs60000-protos.h (rs6000_emit_swdiv): New
+	function.
+	(rs6000_emit_swrsqrt): Ditto.
+	(rs6000_emit_swdivsf): Delete.
+	(rs6000_emit_swdivdf): Ditto.
+	(rs6000_emit_swrsqrtsf): Ditto.
+
+	* config/rs6000/rs6000.c (rs6000_recip_bits): New global to
+	describe the reciprocal estimate support for each type.
+	(recip_options): Map -mrecip=<opt> into option bits.
+	(gen_2arg_fn_t): New typedef for binary rtx gen function.
+	(rs6000_debug_reg_global): If -mdebug=reg, print the state of the
+	reciprocal estimate instructions.
+	(rs6000_init_hard_regno_mode_ok): Key ws constraint off of the
+	debug -mvsx-scalar-memory switch instead of -mvsx-scalar-double.
+	Set up rs6000_recip_bits based on the -mrecip* options.  Print the
+	cost information if -mdebug=cost or -mdebug=reg.
+	(rs6000_override_options): Set -mrecip-precision for power6, and
+	power7 machines.  If -mvsx or -mdfp, enable various options that
+	came in previous instruction set ISAs, unless the option was
+	explicitly disabled by the command line option.  Parse
+	-mrecip=<opt> options.
+	(rs6000_builtin_vectorized_function): Add support for vectorizing
+	the reciprocal estimate builtins and expansions.
+	(rs6000_handle_option): Add -mrecip, -mrecip=<opt> support.
+	(bdesc_2arg): Add reciprocal estimate builtins.
+	(bdesc_1arg): Add reciprocal square root estimate builtins.
+	(rs6000_expand_builtin): Rewrite to use a switch statement,
+	instead of multiple if/then/elses.  Add reciprocal estimate
+	builtins.
+	(rs6000_init_builtins): Create declarations for reciprocal
+	estimate builtins.
+	(rs6000_preferred_reload_class): Simplify VSX preferences, if scalar
+	sized, prefer traditional floating point registers, if integer
+	vector types, prefer altivec registers.  Don't actually look at
+	the memory address any more.
+	(rs6000_builtin_reciprocal): Add new builtin reciprocal estimate
+	builtins.
+	(rs6000_load_constant_and_splat): New helper function to load up
+	the constant for reciprocal estimate instructions.
+	(rs6000_emit_madd): New helper function for generating
+	multiply/add type instructions, based on the current switches.
+	(rs6000_emit_msub): Ditto.
+	(rs6000_emit_mnsub): Ditto.
+	(rs6000_emit_swdiv_high_precision): Replace rs6000_emit_swdivsf to
+	replace a divide with a reciprocal estimate and fixup, adding
+	support for machines with high precision and vectors.
+	(rs6000_emit_swdiv_low_precision): Rewrite rs6000_emit_swdivdf for
+	low precision machines.
+	(rs6000_emit_swdiv): New common function to be called to replace a
+	division with reciprocal estimate and fixup.
+	(rs6000_emit_swrsqrt): Replace rs6000_emit_swrsqrtsf.  Add support
+	for double and vector types.  Add support for high precision
+	machines.
+
+	* config/rs6000/rs6000.h (TARGET_FRES): New macro to say whether
+	the reciprocal estimate instructions can be generated.
+	(TARGET_FRE): Ditto.
+	(TARGET_FRSQRTES): Ditto.
+	(TARGET_FRSQRTE): Ditto.
+	(RS6000_RECIP_*): New macros for reciprocal estimate support.
+
+	* config/rs6000/vector.md (rsqrte<mode>2): New insn for reciprocal
+	square root estimate on vectors.
+	(re<mode>2): New insn for reciprocal division estimate on vectors.
+
+	* config/rs6000/rs6000-buitlins.def (ALTIVEC_BUILTIN_VRSQRTFP):
+	New builtin.
+	(ALTIVEC_BUILTIN_VRECIPFP): Ditto.
+	(ALTIVEC_BUITLIN_VEC_RE): Ditto.
+	(ALTIVEC_BUILTIN_VEC_RSQRT): Ditto.
+	(VSX_BUILTIN_RSQRT_V4SF): Ditto.
+	(VSX_BUITLIN_RSQRT_V2DF): Ditto.
+	(RS6000_BUILTIN_RSQRT): Ditto.
+	(ALTIVEC_BUILTIN_VEC_RSQRTE): Denote that the builtin is a
+	floating point builtin.
+
+	* config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define
+	macros __RECIP__, __RECIPF__, __RSQRTE__, __RSQRTEF__,
+	__RECIP_PRECISION__ based on the command line switches.
+	(altivec_overloaded_builtins): Add reciprocal estimate builtins.
+
+	* config/rs6000/rs6000.opt (-mrecip): Document add support for
+	replacing division instructions with reciprocal estimate and
+	fixup.
+	(-mrecip=<opt>): New option.
+	(-mrecip-precision): Ditto.
+
+	* config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete.
+	(vsx_rsqrte<mode>2): Use UNSPEC_RSQRT not UNSPEC_VSX_RSQRTE.
+	(vsx_copysignsf3): If -mvsx, use double precision cpsign on single
+	precision scalar.
+
+	* config/rs6000/altivec.md (UNSPEC_RSQRTEFP): Delete.
+	(UNSPEC_VREFP): Ditto.
+	(altivec_vnmsubfp*): Make altivec nmsub mirror the scalar and VSX
+	conterparts with regard to support of -mno-fused-madd and
+	-ffast-math.
+	(altivec_vrsqrtefp): Use common UNSPEC to allow scalar/vector
+	reciprocal estimate instructions to be generated.
+	(altivec_vrefp): Ditto.
+
+	* config/rs6000/rs6000.md (RECIPF): New iterator for reciprocal
+	estimate support.
+	(rreg): New mode attribute for reciprocal estimate support.
+	(recip<mode>3): New insn for division using reciprocal estimate
+	and fixup builtins.
+	(divide define_split): New define_split to convert floating point
+	division to use reciprocal estimate if the user used the
+	appropriate options and the split is run when we can add new
+	pseudo registers for the fixup.
+	(rsqrt<mode>2): New insn for reciprocal square root support.
+	(recipsf3): Move into recip<mode>3.
+	(recipdf3): Ditto.
+	(fres): Use TARGET_FRES.
+	(rsqrtsf2): Move into rsqrt<mode>2.
+	(rsqrtsf_internal1): Use TARGET_FRSQRTSES.
+	(copysignsf3): Add support for VSX.
+	(fred): Use TARGET_FRE.
+	(fred_fpr): Ditto.
+	(rsqrtdf_internal1): New function for frsqrte instruciton.
+
+	* config/rs6000/altivec.h (vec_recipdiv): Define new vector
+	builtin.
+	(vec_rsqrt): Ditto.
+
 2010-06-03  Richard Guenther  <rguenther@suse.de>
 
 	PR middle-end/44291
diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h
index bc4f30f7cb2..5f4510adc30 100644
--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -163,6 +163,8 @@
 #define vec_vpkshus __builtin_vec_vpkshus
 #define vec_re __builtin_vec_re
 #define vec_round __builtin_vec_round
+#define vec_recipdiv __builtin_vec_recipdiv
+#define vec_rsqrt __builtin_vec_rsqrt
 #define vec_rsqrte __builtin_vec_rsqrte
 #define vec_vsubfp __builtin_vec_vsubfp
 #define vec_subc __builtin_vec_subc
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 6fbb7cdcdac..7bf3c660312 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -75,9 +75,7 @@
    (UNSPEC_VCTSXS       154)
    (UNSPEC_VLOGEFP      155)
    (UNSPEC_VEXPTEFP     156)
-   (UNSPEC_VRSQRTEFP    157)
-   (UNSPEC_VREFP        158)
-   ;; 159-162 deleted
+   ;; 157-162 deleted
    (UNSPEC_VLSDOI       163)
    (UNSPEC_VUPKHSB      167)
    (UNSPEC_VUPKHPX      168)
@@ -141,10 +139,11 @@
    (UNSPEC_VPERMHI	321)
    (UNSPEC_INTERHI      322)
    (UNSPEC_INTERLO      323)
-   (UNSPEC_VUPKHS_V4SF   324)
-   (UNSPEC_VUPKLS_V4SF   325)
-   (UNSPEC_VUPKHU_V4SF   326)
-   (UNSPEC_VUPKLU_V4SF   327)
+   (UNSPEC_VUPKHS_V4SF  324)
+   (UNSPEC_VUPKLS_V4SF  325)
+   (UNSPEC_VUPKHU_V4SF  326)
+   (UNSPEC_VUPKLU_V4SF  327)
+   (UNSPEC_VNMSUBFP	328)
 ])
 
 (define_constants
@@ -628,11 +627,64 @@
 }")
 
 ;; Fused multiply subtract 
-(define_insn "altivec_vnmsubfp"
+(define_expand "altivec_vnmsubfp"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V4SF 1 "register_operand" "")
+   (match_operand:V4SF 2 "register_operand" "")
+   (match_operand:V4SF 3 "register_operand" "")]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
+{
+  if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (SFmode))
+    {
+       emit_insn (gen_altivec_vnmsubfp_1 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+  else if (TARGET_FUSED_MADD && !HONOR_SIGNED_ZEROS (DFmode))
+    {
+       emit_insn (gen_altivec_vnmsubfp_2 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+  else
+    {
+       emit_insn (gen_altivec_vnmsubfp_3 (operands[0], operands[1],
+					  operands[2], operands[3]));
+       DONE;
+    }
+})
+
+(define_insn "altivec_vnmsubfp_1"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
-	(neg:V4SF (minus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "v")
-			       (match_operand:V4SF 2 "register_operand" "v"))
-	  	    (match_operand:V4SF 3 "register_operand" "v"))))]
+	(neg:V4SF
+	 (minus:V4SF
+	  (mult:V4SF
+	   (match_operand:V4SF 1 "register_operand" "v")
+	   (match_operand:V4SF 2 "register_operand" "v"))
+	  (match_operand:V4SF 3 "register_operand" "v"))))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+   && HONOR_SIGNED_ZEROS (SFmode)"
+  "vnmsubfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "altivec_vnmsubfp_2"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(minus:V4SF
+	 (match_operand:V4SF 3 "register_operand" "v")
+	 (mult:V4SF
+	  (match_operand:V4SF 1 "register_operand" "v")
+	  (match_operand:V4SF 2 "register_operand" "v"))))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD
+   && !HONOR_SIGNED_ZEROS (SFmode)"
+  "vnmsubfp %0,%1,%2,%3"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "altivec_vnmsubfp_3"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
+		       (match_operand:V4SF 2 "register_operand" "v")
+		       (match_operand:V4SF 3 "register_operand" "v")]
+		      UNSPEC_VNMSUBFP))]
   "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
   "vnmsubfp %0,%1,%2,%3"
   [(set_attr "type" "vecfloat")])
@@ -1444,19 +1496,19 @@
   "vexptefp %0,%1"
   [(set_attr "type" "vecfloat")])
 
-(define_insn "altivec_vrsqrtefp"
+(define_insn "*altivec_vrsqrtefp"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
         (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
-		     UNSPEC_VRSQRTEFP))]
-  "TARGET_ALTIVEC"
+		     UNSPEC_RSQRT))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
   "vrsqrtefp %0,%1"
   [(set_attr "type" "vecfloat")])
 
 (define_insn "altivec_vrefp"
   [(set (match_operand:V4SF 0 "register_operand" "=v")
         (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")]
-		     UNSPEC_VREFP))]
-  "TARGET_ALTIVEC"
+		     UNSPEC_FRES))]
+  "VECTOR_UNIT_ALTIVEC_P (V4SFmode)"
   "vrefp %0,%1"
   [(set_attr "type" "vecfloat")])
 
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index 7c5619a8e14..9f45a72e2c0 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -159,6 +159,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VRFIZ,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLB,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLH,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLW,			RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTFP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTEFP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLB,			RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLH,			RS6000_BTC_CONST)
@@ -269,6 +270,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V8HI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V16QI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V4SF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_COPYSIGN_V4SF,		RS6000_BTC_CONST)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VRECIPFP,		RS6000_BTC_FP_PURE)
 
 /* Altivec overloaded builtins.  */
 /* For now, don't set the classification for overloaded functions.
@@ -351,10 +353,12 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKS,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKSU,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PERM,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RE,			RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RECIP,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RINT,		RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_ROUND,		RS6000_BTC_MISC)
-RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE,		RS6000_BTC_MISC)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRT,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE,		RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SEL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SL,			RS6000_BTC_MISC)
 RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SLD,			RS6000_BTC_MISC)
@@ -959,6 +963,10 @@ RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DI,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DF,		RS6000_BTC_CONST)
 RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DI,		RS6000_BTC_CONST)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V4SF,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V2DF,		RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V4SF,			RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(VSX_BUILTIN_RECIP_V2DF,			RS6000_BTC_FP_PURE)
 
 /* VSX overloaded builtins, add the overloaded functions not present in
    Altivec.  */
@@ -991,4 +999,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD,			RS6000_BTC_CONST)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIP,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RECIPF,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF,			RS6000_BTC_FP_PURE)
+RS6000_BUILTIN(RS6000_BUILTIN_RSQRT,			RS6000_BTC_FP_PURE)
 RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI,			RS6000_BTC_CONST)
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index ac11336aee9..7a197c1fbcc 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -362,6 +362,16 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
       builtin_define ("__builtin_vsx_xvnmsubasp=__builtin_vsx_xvnmsubsp");
       builtin_define ("__builtin_vsx_xvnmsubmsp=__builtin_vsx_xvnmsubsp");
     }
+  if (RS6000_RECIP_HAVE_RE_P (DFmode))
+    builtin_define ("__RECIP__");
+  if (RS6000_RECIP_HAVE_RE_P (SFmode))
+    builtin_define ("__RECIPF__");
+  if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode))
+    builtin_define ("__RSQRTE__");
+  if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode))
+    builtin_define ("__RSQRTEF__");
+  if (TARGET_RECIP_PRECISION)
+    builtin_define ("__RECIP_PRECISION__");
 
   /* Tell users they can use __builtin_bswap{16,64}.  */
   builtin_define ("__HAVE_BSWAP__");
@@ -479,10 +489,22 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = {
     RS6000_BTI_void, RS6000_BTI_bool_V16QI, 0, 0 },
   { ALTIVEC_BUILTIN_VEC_RE, ALTIVEC_BUILTIN_VREFP,
     RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RE, VSX_BUILTIN_XVREDP,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
   { ALTIVEC_BUILTIN_VEC_ROUND, ALTIVEC_BUILTIN_VRFIN,
     RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RECIP, ALTIVEC_BUILTIN_VRECIPFP,
+    RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 },
+  { ALTIVEC_BUILTIN_VEC_RECIP, VSX_BUILTIN_RECIP_V2DF,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRT, ALTIVEC_BUILTIN_VRSQRTFP,
+    RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRT, VSX_BUILTIN_VEC_RSQRT_V2DF,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
   { ALTIVEC_BUILTIN_VEC_RSQRTE, ALTIVEC_BUILTIN_VRSQRTEFP,
     RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
+  { ALTIVEC_BUILTIN_VEC_RSQRTE, VSX_BUILTIN_XVRSQRTEDP,
+    RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 },
   { ALTIVEC_BUILTIN_VEC_TRUNC, ALTIVEC_BUILTIN_VRFIZ,
     RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 },
   { ALTIVEC_BUILTIN_VEC_TRUNC, VSX_BUILTIN_XVRDPIZ,
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 43ed634495b..3f022862332 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -106,9 +106,8 @@ extern void rs6000_split_compare_and_swap (rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_expand_compare_and_swapqhi (rtx, rtx, rtx, rtx);
 extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx);
 extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx);
-extern void rs6000_emit_swdivsf (rtx, rtx, rtx);
-extern void rs6000_emit_swdivdf (rtx, rtx, rtx);
-extern void rs6000_emit_swrsqrtsf (rtx, rtx);
+extern void rs6000_emit_swdiv (rtx, rtx, rtx, bool);
+extern void rs6000_emit_swrsqrt (rtx, rtx);
 extern void output_toc (FILE *, rtx, int, enum machine_mode);
 extern rtx rs6000_longcall_ref (rtx);
 extern void rs6000_fatal_bad_address (rtx);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a7434ca5257..9bfaf54c2a2 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -316,6 +316,61 @@ int rs6000_vector_align[NUM_MACHINE_MODES];
 
 /* Map selected modes to types for builtins.  */
 static GTY(()) tree builtin_mode_to_type[MAX_MACHINE_MODE][2];
+
+/* What modes to automatically generate reciprocal divide estimate (fre) and
+   reciprocal sqrt (frsqrte) for.  */
+unsigned char rs6000_recip_bits[MAX_MACHINE_MODE];
+
+/* Masks to determine which reciprocal esitmate instructions to generate
+   automatically.  */
+enum rs6000_recip_mask {
+  RECIP_SF_DIV		= 0x001,	/* Use divide estimate */
+  RECIP_DF_DIV		= 0x002,
+  RECIP_V4SF_DIV	= 0x004,
+  RECIP_V2DF_DIV	= 0x008,
+
+  RECIP_SF_RSQRT	= 0x010,	/* Use reciprocal sqrt estimate.  */
+  RECIP_DF_RSQRT	= 0x020,
+  RECIP_V4SF_RSQRT	= 0x040,
+  RECIP_V2DF_RSQRT	= 0x080,
+
+  /* Various combination of flags for -mrecip=xxx.  */
+  RECIP_NONE		= 0,
+  RECIP_ALL		= (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV
+			   | RECIP_V2DF_DIV | RECIP_SF_RSQRT | RECIP_DF_RSQRT
+			   | RECIP_V4SF_RSQRT | RECIP_V2DF_RSQRT),
+
+  RECIP_HIGH_PRECISION	= RECIP_ALL,
+
+  /* On low precision machines like the power5, don't enable double precision
+     reciprocal square root estimate, since it isn't accurate enough.  */
+  RECIP_LOW_PRECISION	= (RECIP_ALL & ~(RECIP_DF_RSQRT | RECIP_V2DF_RSQRT))
+};
+
+static unsigned int rs6000_recip_control;
+static const char *rs6000_recip_name;
+
+/* -mrecip options.  */
+static struct
+{
+  const char *string;		/* option name */
+  unsigned int mask;		/* mask bits to set */
+} recip_options[] = {
+  { "all",	 RECIP_ALL },
+  { "none",	 RECIP_NONE },
+  { "div",	 (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV
+		  | RECIP_V2DF_DIV) },
+  { "divf",	 (RECIP_SF_DIV | RECIP_V4SF_DIV) },
+  { "divd",	 (RECIP_DF_DIV | RECIP_V2DF_DIV) },
+  { "rsqrt",	 (RECIP_SF_RSQRT | RECIP_DF_RSQRT | RECIP_V4SF_RSQRT
+		  | RECIP_V2DF_RSQRT) },
+  { "rsqrtf",	 (RECIP_SF_RSQRT | RECIP_V4SF_RSQRT) },
+  { "rsqrtd",	 (RECIP_DF_RSQRT | RECIP_V2DF_RSQRT) },
+};
+
+/* 2 argument gen function typedef.  */
+typedef rtx (*gen_2arg_fn_t) (rtx, rtx, rtx);
+
 
 /* Target cpu costs.  */
 
@@ -1807,6 +1862,27 @@ rs6000_debug_reg_global (void)
   if (nl)
     fputs (nl, stderr);
 
+  if (rs6000_recip_control)
+    {
+      fprintf (stderr, "\nReciprocal mask = 0x%x\n", rs6000_recip_control);
+
+      for (m = 0; m < NUM_MACHINE_MODES; ++m)
+	if (rs6000_recip_bits[m])
+	  {
+	    fprintf (stderr,
+		     "Reciprocal estimate mode: %-5s divide: %s rsqrt: %s\n",
+		     GET_MODE_NAME (m),
+		     (RS6000_RECIP_AUTO_RE_P (m)
+		      ? "auto"
+		      : (RS6000_RECIP_HAVE_RE_P (m) ? "have" : "none")),
+		     (RS6000_RECIP_AUTO_RSQRTE_P (m)
+		      ? "auto"
+		      : (RS6000_RECIP_HAVE_RSQRTE_P (m) ? "have" : "none")));
+	  }
+
+      fputs ("\n", stderr);
+    }
+
   switch (rs6000_sched_costly_dep)
     {
     case max_dep_latency:
@@ -2014,8 +2090,9 @@ rs6000_init_hard_regno_mode_ok (void)
       rs6000_constraints[RS6000_CONSTRAINT_wa] = VSX_REGS;
       rs6000_constraints[RS6000_CONSTRAINT_wf] = VSX_REGS;
       rs6000_constraints[RS6000_CONSTRAINT_wd] = VSX_REGS;
-      if (TARGET_VSX_SCALAR_DOUBLE)
-	rs6000_constraints[RS6000_CONSTRAINT_ws] = VSX_REGS;
+      rs6000_constraints[RS6000_CONSTRAINT_ws] = (TARGET_VSX_SCALAR_MEMORY
+						  ? VSX_REGS
+						  : FLOAT_REGS);
     }
 
   if (TARGET_ALTIVEC)
@@ -2093,8 +2170,111 @@ rs6000_init_hard_regno_mode_ok (void)
   if (TARGET_E500_DOUBLE)
     rs6000_class_max_nregs[DFmode][GENERAL_REGS] = 1;
 
+  /* Calculate which modes to automatically generate code to use a the
+     reciprocal divide and square root instructions.  In the future, possibly
+     automatically generate the instructions even if the user did not specify
+     -mrecip.  The older machines double precision reciprocal sqrt estimate is
+     not accurate enough.  */
+  memset (rs6000_recip_bits, 0, sizeof (rs6000_recip_bits));
+  if (TARGET_FRES)
+    rs6000_recip_bits[SFmode] = RS6000_RECIP_MASK_HAVE_RE;
+  if (TARGET_FRE)
+    rs6000_recip_bits[DFmode] = RS6000_RECIP_MASK_HAVE_RE;
+  if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode))
+    rs6000_recip_bits[V4SFmode] = RS6000_RECIP_MASK_HAVE_RE;
+  if (VECTOR_UNIT_VSX_P (V2DFmode))
+    rs6000_recip_bits[V2DFmode] = RS6000_RECIP_MASK_HAVE_RE;
+
+  if (TARGET_FRSQRTES)
+    rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+  if (TARGET_FRSQRTE)
+    rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+  if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode))
+    rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+  if (VECTOR_UNIT_VSX_P (V2DFmode))
+    rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE;
+
+  if (rs6000_recip_control)
+    {
+      if (!TARGET_FUSED_MADD)
+	warning (0, "-mrecip requires -mfused-madd");
+      if (!flag_finite_math_only)
+	warning (0, "-mrecip requires -ffinite-math or -ffast-math");
+      if (flag_trapping_math)
+	warning (0, "-mrecip requires -fno-trapping-math or -ffast-math");
+      if (!flag_reciprocal_math)
+	warning (0, "-mrecip requires -freciprocal-math or -ffast-math");
+      if (TARGET_FUSED_MADD && flag_finite_math_only && !flag_trapping_math
+	  && flag_reciprocal_math)
+	{
+	  if (RS6000_RECIP_HAVE_RE_P (SFmode)
+	      && (rs6000_recip_control & RECIP_SF_DIV) != 0)
+	    rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+	  if (RS6000_RECIP_HAVE_RE_P (DFmode)
+	      && (rs6000_recip_control & RECIP_DF_DIV) != 0)
+	    rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+	  if (RS6000_RECIP_HAVE_RE_P (V4SFmode)
+	      && (rs6000_recip_control & RECIP_V4SF_DIV) != 0)
+	    rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+	  if (RS6000_RECIP_HAVE_RE_P (V2DFmode)
+	      && (rs6000_recip_control & RECIP_V2DF_DIV) != 0)
+	    rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RE;
+
+	  if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode)
+	      && (rs6000_recip_control & RECIP_SF_RSQRT) != 0)
+	    rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+	  if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode)
+	      && (rs6000_recip_control & RECIP_DF_RSQRT) != 0)
+	    rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+	  if (RS6000_RECIP_HAVE_RSQRTE_P (V4SFmode)
+	      && (rs6000_recip_control & RECIP_V4SF_RSQRT) != 0)
+	    rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+
+	  if (RS6000_RECIP_HAVE_RSQRTE_P (V2DFmode)
+	      && (rs6000_recip_control & RECIP_V2DF_RSQRT) != 0)
+	    rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE;
+	}
+    }
+
   if (TARGET_DEBUG_REG)
     rs6000_debug_reg_global ();
+
+  if (TARGET_DEBUG_COST || TARGET_DEBUG_REG)
+    fprintf (stderr,
+	     "SImode variable mult cost       = %d\n"
+	     "SImode constant mult cost       = %d\n"
+	     "SImode short constant mult cost = %d\n"
+	     "DImode multipliciation cost     = %d\n"
+	     "SImode division cost            = %d\n"
+	     "DImode division cost            = %d\n"
+	     "Simple fp operation cost        = %d\n"
+	     "DFmode multiplication cost      = %d\n"
+	     "SFmode division cost            = %d\n"
+	     "DFmode division cost            = %d\n"
+	     "cache line size                 = %d\n"
+	     "l1 cache size                   = %d\n"
+	     "l2 cache size                   = %d\n"
+	     "simultaneous prefetches         = %d\n"
+	     "\n",
+	     rs6000_cost->mulsi,
+	     rs6000_cost->mulsi_const,
+	     rs6000_cost->mulsi_const9,
+	     rs6000_cost->muldi,
+	     rs6000_cost->divsi,
+	     rs6000_cost->divdi,
+	     rs6000_cost->fp,
+	     rs6000_cost->dmul,
+	     rs6000_cost->sdiv,
+	     rs6000_cost->ddiv,
+	     rs6000_cost->cache_line_size,
+	     rs6000_cost->l1_cache_size,
+	     rs6000_cost->l2_cache_size,
+	     rs6000_cost->simultaneous_prefetches);
 }
 
 #if TARGET_MACHO
@@ -2271,15 +2451,16 @@ rs6000_override_options (const char *default_cpu)
 	  | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND},
  	 {"power6", PROCESSOR_POWER6,
 	  POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT
-	  | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP},
+	  | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP
+	  | MASK_RECIP_PRECISION},
 	 {"power6x", PROCESSOR_POWER6,
 	  POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT
 	  | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP
-	  | MASK_MFPGPR},
+	  | MASK_MFPGPR | MASK_RECIP_PRECISION},
 	 {"power7", PROCESSOR_POWER7,
 	  POWERPC_7400_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_MFCRF
 	  | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP | MASK_POPCNTD
-	  | MASK_VSX},		/* Don't add MASK_ISEL by default */
+	  | MASK_VSX| MASK_RECIP_PRECISION},	/* Don't add MASK_ISEL by default */
 	 {"powerpc", PROCESSOR_POWERPC, POWERPC_BASE_MASK},
 	 {"powerpc64", PROCESSOR_POWERPC64,
 	  POWERPC_BASE_MASK | MASK_PPC_GFXOPT | MASK_POWERPC64},
@@ -2307,7 +2488,24 @@ rs6000_override_options (const char *default_cpu)
 		     | MASK_PPC_GFXOPT | MASK_POWERPC64 | MASK_ALTIVEC
 		     | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_MULHW
 		     | MASK_DLMZB | MASK_CMPB | MASK_MFPGPR | MASK_DFP
-		     | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE)
+		     | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE
+		     | MASK_RECIP_PRECISION)
+  };
+
+  /* Masks for instructions set at various powerpc ISAs.  */
+  enum {
+    ISA_2_1_MASKS = MASK_MFCRF,
+    ISA_2_2_MASKS = (ISA_2_1_MASKS | MASK_POPCNTB | MASK_FPRND),
+
+    /* For ISA 2.05, do not add MFPGPR, since it isn't in ISA 2.06, and
+       don't add ALTIVEC, since in general it isn't a win on power6.  */
+    ISA_2_5_MASKS = (ISA_2_2_MASKS | MASK_CMPB | MASK_RECIP_PRECISION
+		     | MASK_DFP),
+
+    /* For ISA 2.06, don't add ISEL, since in general it isn't a win, but
+       altivec is a win so enable it.  */
+    ISA_2_6_MASKS = (ISA_2_5_MASKS | MASK_ALTIVEC | MASK_POPCNTD
+		     | MASK_VSX | MASK_RECIP_PRECISION)
   };
 
   /* Numerous experiment shows that IRA based loop pressure
@@ -2449,10 +2647,17 @@ rs6000_override_options (const char *default_cpu)
 	  warning (0, msg);
 	  target_flags &= ~ MASK_VSX;
 	}
-      else if (TARGET_VSX && !TARGET_ALTIVEC)
-	target_flags |= MASK_ALTIVEC;
     }
 
+  /* For the newer switches (vsx, dfp, etc.) set some of the older options,
+     unless the user explicitly used the -mno-<option> to disable the code.  */
+  if (TARGET_VSX)
+    target_flags |= (ISA_2_6_MASKS & (target_flags_explicit & ~ISA_2_6_MASKS));
+  else if (TARGET_DFP)
+    target_flags |= (ISA_2_5_MASKS & (target_flags_explicit & ~ISA_2_5_MASKS));
+  else if (TARGET_ALTIVEC)
+    target_flags |= (MASK_PPC_GFXOPT & (target_flags_explicit & ~MASK_PPC_GFXOPT));
+
   /* Set debug flags */
   if (rs6000_debug_name)
     {
@@ -2875,6 +3080,52 @@ rs6000_override_options (const char *default_cpu)
      the DERAT mispredict penalty.  */
     TARGET_AVOID_XFORM = (rs6000_cpu == PROCESSOR_POWER6 && TARGET_CMPB);
 
+  /* Set the -mrecip options.  */
+  if (rs6000_recip_name)
+    {
+      char *p = ASTRDUP (rs6000_recip_name);
+      char *q;
+      unsigned int mask, i;
+      bool invert;
+
+      while ((q = strtok (p, ",")) != NULL)
+	{
+	  p = NULL;
+	  if (*q == '!')
+	    {
+	      invert = true;
+	      q++;
+	    }
+	  else
+	    invert = false;
+
+	  if (!strcmp (q, "default"))
+	    mask = ((TARGET_RECIP_PRECISION)
+		    ? RECIP_HIGH_PRECISION : RECIP_LOW_PRECISION);
+	  else
+	    {
+	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+		if (!strcmp (q, recip_options[i].string))
+		  {
+		    mask = recip_options[i].mask;
+		    break;
+		  }
+
+	      if (i == ARRAY_SIZE (recip_options))
+		{
+		  error ("Unknown option for -mrecip=%s", q);
+		  invert = false;
+		  mask = 0;
+		}
+	    }
+
+	  if (invert)
+	    rs6000_recip_control &= ~mask;
+	  else
+	    rs6000_recip_control |= mask;
+	}
+    }
+
   rs6000_init_hard_regno_mode_ok ();
 }
 
@@ -3191,12 +3442,10 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out,
 {
   enum machine_mode in_mode, out_mode;
   int in_n, out_n;
-  enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
 
   if (TREE_CODE (type_out) != VECTOR_TYPE
       || TREE_CODE (type_in) != VECTOR_TYPE
-      || !TARGET_VECTORIZE_BUILTINS
-      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
+      || !TARGET_VECTORIZE_BUILTINS)
     return NULL_TREE;
 
   out_mode = TYPE_MODE (TREE_TYPE (type_out));
@@ -3204,111 +3453,151 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out,
   in_mode = TYPE_MODE (TREE_TYPE (type_in));
   in_n = TYPE_VECTOR_SUBPARTS (type_in);
 
-  switch (fn)
+  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
     {
-    case BUILT_IN_COPYSIGN:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP];
-      break;
-    case BUILT_IN_COPYSIGNF:
-      if (out_mode != SFmode || out_n != 4
-	  || in_mode != SFmode || in_n != 4)
-	break;
-      if (VECTOR_UNIT_VSX_P (V4SFmode))
-	return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP];
-      if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
-	return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF];
-      break;
-    case BUILT_IN_SQRT:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP];
-      break;
-    case BUILT_IN_SQRTF:
-      if (VECTOR_UNIT_VSX_P (V4SFmode)
-	  && out_mode == SFmode && out_n == 4
-	  && in_mode == SFmode && in_n == 4)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP];
-      break;
-    case BUILT_IN_CEIL:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP];
-      break;
-    case BUILT_IN_CEILF:
-      if (out_mode != SFmode || out_n != 4
-	  || in_mode != SFmode || in_n != 4)
-	break;
-      if (VECTOR_UNIT_VSX_P (V4SFmode))
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP];
-      if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
-	return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP];
-      break;
-    case BUILT_IN_FLOOR:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM];
-      break;
-    case BUILT_IN_FLOORF:
-      if (out_mode != SFmode || out_n != 4
-	  || in_mode != SFmode || in_n != 4)
-	break;
-      if (VECTOR_UNIT_VSX_P (V4SFmode))
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM];
-      if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
-	return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
-      break;
-    case BUILT_IN_TRUNC:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ];
-      break;
-    case BUILT_IN_TRUNCF:
-      if (out_mode != SFmode || out_n != 4
-	  || in_mode != SFmode || in_n != 4)
-	break;
-      if (VECTOR_UNIT_VSX_P (V4SFmode))
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ];
-      if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
-	return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ];
-      break;
-    case BUILT_IN_NEARBYINT:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && flag_unsafe_math_optimizations
-	  && out_mode == DFmode && out_n == 2
-	  && in_mode == DFmode && in_n == 2)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI];
-      break;
-    case BUILT_IN_NEARBYINTF:
-      if (VECTOR_UNIT_VSX_P (V4SFmode)
-	  && flag_unsafe_math_optimizations
-	  && out_mode == SFmode && out_n == 4
-	  && in_mode == SFmode && in_n == 4)
-	return rs6000_builtin_decls[VSX_BUILTIN_XVRSPI];
-      break;
-    case BUILT_IN_RINT:
-      if (VECTOR_UNIT_VSX_P (V2DFmode)
-	  && !flag_trapping_math
-          && out_mode == DFmode && out_n == 2
-          && in_mode == DFmode && in_n == 2)
-        return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC];
-      break;
-    case BUILT_IN_RINTF:
-      if (VECTOR_UNIT_VSX_P (V4SFmode)
-	  && !flag_trapping_math
-          && out_mode == SFmode && out_n == 4
-          && in_mode == SFmode && in_n == 4)
-        return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC];
-      break;
-    default:
-      break;
+      enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
+      switch (fn)
+	{
+	case BUILT_IN_COPYSIGN:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP];
+	  break;
+	case BUILT_IN_COPYSIGNF:
+	  if (out_mode != SFmode || out_n != 4
+	      || in_mode != SFmode || in_n != 4)
+	    break;
+	  if (VECTOR_UNIT_VSX_P (V4SFmode))
+	    return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP];
+	  if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF];
+	  break;
+	case BUILT_IN_SQRT:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP];
+	  break;
+	case BUILT_IN_SQRTF:
+	  if (VECTOR_UNIT_VSX_P (V4SFmode)
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP];
+	  break;
+	case BUILT_IN_CEIL:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP];
+	  break;
+	case BUILT_IN_CEILF:
+	  if (out_mode != SFmode || out_n != 4
+	      || in_mode != SFmode || in_n != 4)
+	    break;
+	  if (VECTOR_UNIT_VSX_P (V4SFmode))
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP];
+	  if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP];
+	  break;
+	case BUILT_IN_FLOOR:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM];
+	  break;
+	case BUILT_IN_FLOORF:
+	  if (out_mode != SFmode || out_n != 4
+	      || in_mode != SFmode || in_n != 4)
+	    break;
+	  if (VECTOR_UNIT_VSX_P (V4SFmode))
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM];
+	  if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
+	  break;
+	case BUILT_IN_TRUNC:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ];
+	  break;
+	case BUILT_IN_TRUNCF:
+	  if (out_mode != SFmode || out_n != 4
+	      || in_mode != SFmode || in_n != 4)
+	    break;
+	  if (VECTOR_UNIT_VSX_P (V4SFmode))
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ];
+	  if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ];
+	  break;
+	case BUILT_IN_NEARBYINT:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && flag_unsafe_math_optimizations
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI];
+	  break;
+	case BUILT_IN_NEARBYINTF:
+	  if (VECTOR_UNIT_VSX_P (V4SFmode)
+	      && flag_unsafe_math_optimizations
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRSPI];
+	  break;
+	case BUILT_IN_RINT:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && !flag_trapping_math
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC];
+	  break;
+	case BUILT_IN_RINTF:
+	  if (VECTOR_UNIT_VSX_P (V4SFmode)
+	      && !flag_trapping_math
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC];
+	  break;
+	default:
+	  break;
+	}
     }
+
+  else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+    {
+      enum rs6000_builtins fn
+	= (enum rs6000_builtins)DECL_FUNCTION_CODE (fndecl);
+      switch (fn)
+	{
+	case RS6000_BUILTIN_RSQRTF:
+	  if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRSQRTFP];
+	  break;
+	case RS6000_BUILTIN_RSQRT:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF];
+	  break;
+	case RS6000_BUILTIN_RECIPF:
+	  if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)
+	      && out_mode == SFmode && out_n == 4
+	      && in_mode == SFmode && in_n == 4)
+	    return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRECIPFP];
+	  break;
+	case RS6000_BUILTIN_RECIP:
+	  if (VECTOR_UNIT_VSX_P (V2DFmode)
+	      && out_mode == DFmode && out_n == 2
+	      && in_mode == DFmode && in_n == 2)
+	    return rs6000_builtin_decls[VSX_BUILTIN_RECIP_V2DF];
+	  break;
+	default:
+	  break;
+	}
+    }
+
   return NULL_TREE;
 }
 
@@ -3668,6 +3957,13 @@ rs6000_handle_option (size_t code, const char *arg, int value)
         target_flags_explicit |= MASK_SOFT_FLOAT;
         rs6000_single_float = rs6000_double_float = 0;
       }
+
+    case OPT_mrecip:
+      rs6000_recip_name = (value) ? "default" : "none";
+      break;
+
+    case OPT_mrecip_:
+      rs6000_recip_name = arg;
       break;
     }
   return true;
@@ -8865,6 +9161,7 @@ static struct builtin_description bdesc_2arg[] =
   { MASK_ALTIVEC, CODE_FOR_altivec_vpkshus, "__builtin_altivec_vpkshus", ALTIVEC_BUILTIN_VPKSHUS },
   { MASK_ALTIVEC, CODE_FOR_altivec_vpkuwus, "__builtin_altivec_vpkuwus", ALTIVEC_BUILTIN_VPKUWUS },
   { MASK_ALTIVEC, CODE_FOR_altivec_vpkswus, "__builtin_altivec_vpkswus", ALTIVEC_BUILTIN_VPKSWUS },
+  { MASK_ALTIVEC, CODE_FOR_recipv4sf3, "__builtin_altivec_vrecipdivfp", ALTIVEC_BUILTIN_VRECIPFP },
   { MASK_ALTIVEC, CODE_FOR_vrotlv16qi3, "__builtin_altivec_vrlb", ALTIVEC_BUILTIN_VRLB },
   { MASK_ALTIVEC, CODE_FOR_vrotlv8hi3, "__builtin_altivec_vrlh", ALTIVEC_BUILTIN_VRLH },
   { MASK_ALTIVEC, CODE_FOR_vrotlv4si3, "__builtin_altivec_vrlw", ALTIVEC_BUILTIN_VRLW },
@@ -8907,6 +9204,7 @@ static struct builtin_description bdesc_2arg[] =
   { MASK_VSX, CODE_FOR_subv2df3, "__builtin_vsx_xvsubdp", VSX_BUILTIN_XVSUBDP },
   { MASK_VSX, CODE_FOR_mulv2df3, "__builtin_vsx_xvmuldp", VSX_BUILTIN_XVMULDP },
   { MASK_VSX, CODE_FOR_divv2df3, "__builtin_vsx_xvdivdp", VSX_BUILTIN_XVDIVDP },
+  { MASK_VSX, CODE_FOR_recipv2df3, "__builtin_vsx_xvrecipdivdp", VSX_BUILTIN_RECIP_V2DF },
   { MASK_VSX, CODE_FOR_sminv2df3, "__builtin_vsx_xvmindp", VSX_BUILTIN_XVMINDP },
   { MASK_VSX, CODE_FOR_smaxv2df3, "__builtin_vsx_xvmaxdp", VSX_BUILTIN_XVMAXDP },
   { MASK_VSX, CODE_FOR_vsx_tdivv2df3_fe, "__builtin_vsx_xvtdivdp_fe", VSX_BUILTIN_XVTDIVDP_FE },
@@ -8919,6 +9217,7 @@ static struct builtin_description bdesc_2arg[] =
   { MASK_VSX, CODE_FOR_subv4sf3, "__builtin_vsx_xvsubsp", VSX_BUILTIN_XVSUBSP },
   { MASK_VSX, CODE_FOR_mulv4sf3, "__builtin_vsx_xvmulsp", VSX_BUILTIN_XVMULSP },
   { MASK_VSX, CODE_FOR_divv4sf3, "__builtin_vsx_xvdivsp", VSX_BUILTIN_XVDIVSP },
+  { MASK_VSX, CODE_FOR_recipv4sf3, "__builtin_vsx_xvrecipdivsp", VSX_BUILTIN_RECIP_V4SF },
   { MASK_VSX, CODE_FOR_sminv4sf3, "__builtin_vsx_xvminsp", VSX_BUILTIN_XVMINSP },
   { MASK_VSX, CODE_FOR_smaxv4sf3, "__builtin_vsx_xvmaxsp", VSX_BUILTIN_XVMAXSP },
   { MASK_VSX, CODE_FOR_vsx_tdivv4sf3_fe, "__builtin_vsx_xvtdivsp_fe", VSX_BUILTIN_XVTDIVSP_FE },
@@ -9035,6 +9334,7 @@ static struct builtin_description bdesc_2arg[] =
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_packsu", ALTIVEC_BUILTIN_VEC_PACKSU },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkswus", ALTIVEC_BUILTIN_VEC_VPKSWUS },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkshus", ALTIVEC_BUILTIN_VEC_VPKSHUS },
+  { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_recipdiv", ALTIVEC_BUILTIN_VEC_RECIP },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rl", ALTIVEC_BUILTIN_VEC_RL },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlw", ALTIVEC_BUILTIN_VEC_VRLW },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlh", ALTIVEC_BUILTIN_VEC_VRLH },
@@ -9364,12 +9664,13 @@ static struct builtin_description bdesc_1arg[] =
 {
   { MASK_ALTIVEC, CODE_FOR_altivec_vexptefp, "__builtin_altivec_vexptefp", ALTIVEC_BUILTIN_VEXPTEFP },
   { MASK_ALTIVEC, CODE_FOR_altivec_vlogefp, "__builtin_altivec_vlogefp", ALTIVEC_BUILTIN_VLOGEFP },
-  { MASK_ALTIVEC, CODE_FOR_altivec_vrefp, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP },
+  { MASK_ALTIVEC, CODE_FOR_rev4sf2, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP },
   { MASK_ALTIVEC, CODE_FOR_vector_floorv4sf2, "__builtin_altivec_vrfim", ALTIVEC_BUILTIN_VRFIM },
   { MASK_ALTIVEC, CODE_FOR_altivec_vrfin, "__builtin_altivec_vrfin", ALTIVEC_BUILTIN_VRFIN },
   { MASK_ALTIVEC, CODE_FOR_vector_ceilv4sf2, "__builtin_altivec_vrfip", ALTIVEC_BUILTIN_VRFIP },
   { MASK_ALTIVEC, CODE_FOR_vector_btruncv4sf2, "__builtin_altivec_vrfiz", ALTIVEC_BUILTIN_VRFIZ },
-  { MASK_ALTIVEC, CODE_FOR_altivec_vrsqrtefp, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP },
+  { MASK_ALTIVEC, CODE_FOR_rsqrtv4sf2, "__builtin_altivec_vrsqrtfp", ALTIVEC_BUILTIN_VRSQRTFP },
+  { MASK_ALTIVEC, CODE_FOR_rsqrtev4sf2, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP },
   { MASK_ALTIVEC, CODE_FOR_altivec_vspltisb, "__builtin_altivec_vspltisb", ALTIVEC_BUILTIN_VSPLTISB },
   { MASK_ALTIVEC, CODE_FOR_altivec_vspltish, "__builtin_altivec_vspltish", ALTIVEC_BUILTIN_VSPLTISH },
   { MASK_ALTIVEC, CODE_FOR_altivec_vspltisw, "__builtin_altivec_vspltisw", ALTIVEC_BUILTIN_VSPLTISW },
@@ -9382,14 +9683,16 @@ static struct builtin_description bdesc_1arg[] =
 
   { MASK_VSX, CODE_FOR_negv2df2, "__builtin_vsx_xvnegdp", VSX_BUILTIN_XVNEGDP },
   { MASK_VSX, CODE_FOR_sqrtv2df2, "__builtin_vsx_xvsqrtdp", VSX_BUILTIN_XVSQRTDP },
-  { MASK_VSX, CODE_FOR_vsx_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP },
+  { MASK_VSX, CODE_FOR_rsqrtv2df2, "__builtin_vsx_xvrsqrtdp", VSX_BUILTIN_VEC_RSQRT_V2DF },
+  { MASK_VSX, CODE_FOR_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP },
   { MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fe, "__builtin_vsx_xvtsqrtdp_fe", VSX_BUILTIN_XVTSQRTDP_FE },
   { MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fg, "__builtin_vsx_xvtsqrtdp_fg", VSX_BUILTIN_XVTSQRTDP_FG },
   { MASK_VSX, CODE_FOR_vsx_frev2df2, "__builtin_vsx_xvredp", VSX_BUILTIN_XVREDP },
 
   { MASK_VSX, CODE_FOR_negv4sf2, "__builtin_vsx_xvnegsp", VSX_BUILTIN_XVNEGSP },
   { MASK_VSX, CODE_FOR_sqrtv4sf2, "__builtin_vsx_xvsqrtsp", VSX_BUILTIN_XVSQRTSP },
-  { MASK_VSX, CODE_FOR_vsx_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP },
+  { MASK_VSX, CODE_FOR_rsqrtv4sf2, "__builtin_vsx_xvrsqrtsp", VSX_BUILTIN_VEC_RSQRT_V4SF },
+  { MASK_VSX, CODE_FOR_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP },
   { MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fe, "__builtin_vsx_xvtsqrtsp_fe", VSX_BUILTIN_XVTSQRTSP_FE },
   { MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fg, "__builtin_vsx_xvtsqrtsp_fg", VSX_BUILTIN_XVTSQRTSP_FG },
   { MASK_VSX, CODE_FOR_vsx_frev4sf2, "__builtin_vsx_xvresp", VSX_BUILTIN_XVRESP },
@@ -9448,6 +9751,7 @@ static struct builtin_description bdesc_1arg[] =
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_mtvscr", ALTIVEC_BUILTIN_VEC_MTVSCR },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_re", ALTIVEC_BUILTIN_VEC_RE },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_round", ALTIVEC_BUILTIN_VEC_ROUND },
+  { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrt", ALTIVEC_BUILTIN_VEC_RSQRT },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrte", ALTIVEC_BUILTIN_VEC_RSQRTE },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_trunc", ALTIVEC_BUILTIN_VEC_TRUNC },
   { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_unpackh", ALTIVEC_BUILTIN_VEC_UNPACKH },
@@ -10963,73 +11267,83 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
   rtx ret;
   bool success;
 
-  if (fcode == RS6000_BUILTIN_RECIP)
+  switch (fcode)
+    {
+    case RS6000_BUILTIN_RECIP:
       return rs6000_expand_binop_builtin (CODE_FOR_recipdf3, exp, target);
 
-  if (fcode == RS6000_BUILTIN_RECIPF)
+    case RS6000_BUILTIN_RECIPF:
       return rs6000_expand_binop_builtin (CODE_FOR_recipsf3, exp, target);
 
-  if (fcode == RS6000_BUILTIN_RSQRTF)
+    case RS6000_BUILTIN_RSQRTF:
       return rs6000_expand_unop_builtin (CODE_FOR_rsqrtsf2, exp, target);
 
-  if (fcode == RS6000_BUILTIN_BSWAP_HI)
-    return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target);
-
-  if (fcode == POWER7_BUILTIN_BPERMD)
-    return rs6000_expand_binop_builtin (((TARGET_64BIT)
-					 ? CODE_FOR_bpermd_di
-					 : CODE_FOR_bpermd_si), exp, target);
+    case RS6000_BUILTIN_RSQRT:
+      return rs6000_expand_unop_builtin (CODE_FOR_rsqrtdf2, exp, target);
 
-  if (fcode == ALTIVEC_BUILTIN_MASK_FOR_LOAD
-      || fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
-    {
-      int icode = (int) CODE_FOR_altivec_lvsr;
-      enum machine_mode tmode = insn_data[icode].operand[0].mode;
-      enum machine_mode mode = insn_data[icode].operand[1].mode;
-      tree arg;
-      rtx op, addr, pat;
+    case RS6000_BUILTIN_BSWAP_HI:
+      return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target);
 
-      gcc_assert (TARGET_ALTIVEC);
+    case POWER7_BUILTIN_BPERMD:
+      return rs6000_expand_binop_builtin (((TARGET_64BIT)
+					   ? CODE_FOR_bpermd_di
+					   : CODE_FOR_bpermd_si), exp, target);
 
-      arg = CALL_EXPR_ARG (exp, 0);
-      gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE);
-      op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
-      addr = memory_address (mode, op);
-      if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
-	op = addr;
-      else
-	{
-	  /* For the load case need to negate the address.  */
-	  op = gen_reg_rtx (GET_MODE (addr));
-	  emit_insn (gen_rtx_SET (VOIDmode, op,
-			 gen_rtx_NEG (GET_MODE (addr), addr)));
-	}
-      op = gen_rtx_MEM (mode, op);
+    case ALTIVEC_BUILTIN_MASK_FOR_LOAD:
+    case ALTIVEC_BUILTIN_MASK_FOR_STORE:
+      {
+	int icode = (int) CODE_FOR_altivec_lvsr;
+	enum machine_mode tmode = insn_data[icode].operand[0].mode;
+	enum machine_mode mode = insn_data[icode].operand[1].mode;
+	tree arg;
+	rtx op, addr, pat;
+
+	gcc_assert (TARGET_ALTIVEC);
+
+	arg = CALL_EXPR_ARG (exp, 0);
+	gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE);
+	op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
+	addr = memory_address (mode, op);
+	if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE)
+	  op = addr;
+	else
+	  {
+	    /* For the load case need to negate the address.  */
+	    op = gen_reg_rtx (GET_MODE (addr));
+	    emit_insn (gen_rtx_SET (VOIDmode, op,
+				    gen_rtx_NEG (GET_MODE (addr), addr)));
+	  }
+	op = gen_rtx_MEM (mode, op);
 
-      if (target == 0
-	  || GET_MODE (target) != tmode
-	  || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
-	target = gen_reg_rtx (tmode);
+	if (target == 0
+	    || GET_MODE (target) != tmode
+	    || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
+	  target = gen_reg_rtx (tmode);
 
-      /*pat = gen_altivec_lvsr (target, op);*/
-      pat = GEN_FCN (icode) (target, op);
-      if (!pat)
-	return 0;
-      emit_insn (pat);
+	/*pat = gen_altivec_lvsr (target, op);*/
+	pat = GEN_FCN (icode) (target, op);
+	if (!pat)
+	  return 0;
+	emit_insn (pat);
 
-      return target;
-    }
+	return target;
+      }
 
+    case ALTIVEC_BUILTIN_VCFUX:
+    case ALTIVEC_BUILTIN_VCFSX:
+    case ALTIVEC_BUILTIN_VCTUXS:
+    case ALTIVEC_BUILTIN_VCTSXS:
   /* FIXME: There's got to be a nicer way to handle this case than
      constructing a new CALL_EXPR.  */
-  if (fcode == ALTIVEC_BUILTIN_VCFUX
-      || fcode == ALTIVEC_BUILTIN_VCFSX
-      || fcode == ALTIVEC_BUILTIN_VCTUXS
-      || fcode == ALTIVEC_BUILTIN_VCTSXS)
-    {
       if (call_expr_nargs (exp) == 1)
-	exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp),
-			       2, CALL_EXPR_ARG (exp, 0), integer_zero_node);
+	{
+	  exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp),
+				 2, CALL_EXPR_ARG (exp, 0), integer_zero_node);
+	}
+      break;
+
+    default:
+      break;
     }
 
   if (TARGET_ALTIVEC)
@@ -11081,6 +11395,7 @@ static void
 rs6000_init_builtins (void)
 {
   tree tdecl;
+  tree ftype;
 
   V2SI_type_node = build_vector_type (intSI_type_node, 2);
   V2SF_type_node = build_vector_type (float_type_node, 2);
@@ -11270,29 +11585,38 @@ rs6000_init_builtins (void)
     altivec_init_builtins ();
   if (TARGET_ALTIVEC || TARGET_SPE || TARGET_PAIRED_FLOAT || TARGET_VSX)
     rs6000_common_init_builtins ();
-  if (TARGET_PPC_GFXOPT)
+  if (TARGET_FRE)
+    {
+      ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode,
+				     RS6000_BUILTIN_RECIP,
+				     "__builtin_recipdiv");
+      def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype,
+		   RS6000_BUILTIN_RECIP);
+    }
+  if (TARGET_FRES)
     {
-      tree ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode,
-					  RS6000_BUILTIN_RECIPF,
-					  "__builtin_recipdivf");
+      ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode,
+				     RS6000_BUILTIN_RECIPF,
+				     "__builtin_recipdivf");
       def_builtin (MASK_PPC_GFXOPT, "__builtin_recipdivf", ftype,
 		   RS6000_BUILTIN_RECIPF);
-
+    }
+  if (TARGET_FRSQRTE)
+    {
+      ftype = builtin_function_type (DFmode, DFmode, VOIDmode, VOIDmode,
+				     RS6000_BUILTIN_RSQRT,
+				     "__builtin_rsqrt");
+      def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrt", ftype,
+		   RS6000_BUILTIN_RSQRT);
+    }
+  if (TARGET_FRSQRTES)
+    {
       ftype = builtin_function_type (SFmode, SFmode, VOIDmode, VOIDmode,
 				     RS6000_BUILTIN_RSQRTF,
 				     "__builtin_rsqrtf");
       def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrtf", ftype,
 		   RS6000_BUILTIN_RSQRTF);
     }
-  if (TARGET_POPCNTB)
-    {
-      tree ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode,
-					  RS6000_BUILTIN_RECIP,
-					  "__builtin_recipdiv");
-      def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype,
-		   RS6000_BUILTIN_RECIP);
-
-    }
   if (TARGET_POPCNTD)
     {
       enum machine_mode mode = (TARGET_64BIT) ? DImode : SImode;
@@ -13800,30 +14124,16 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass)
   if (GET_MODE_CLASS (mode) == MODE_INT && rclass == NON_SPECIAL_REGS)
     return GENERAL_REGS;
 
-  /* For VSX, prefer the traditional registers for DF if the address is of the
-     form reg+offset because we can use the non-VSX loads.  Prefer the Altivec
-     registers if Altivec is handling the vector operations (i.e. V16QI, V8HI,
-     and V4SI).  */
-  if (rclass == VSX_REGS && VECTOR_MEM_VSX_P (mode))
+  /* For VSX, prefer the traditional registers for 64-bit values because we can
+     use the non-VSX loads.  Prefer the Altivec registers if Altivec is
+     handling the vector operations (i.e. V16QI, V8HI, and V4SI), or if we
+     prefer Altivec loads..  */
+  if (rclass == VSX_REGS)
     {
-      if (mode == DFmode && GET_CODE (x) == MEM)
-	{
-	  rtx addr = XEXP (x, 0);
-
-	  if (legitimate_indirect_address_p (addr, false))	/* reg */
-	    return VSX_REGS;
-
-	  if (legitimate_indexed_address_p (addr, false))	/* reg+reg */
-	    return VSX_REGS;
+      if (GET_MODE_SIZE (mode) <= 8)
+	return FLOAT_REGS;
 
-	  if (GET_CODE (addr) == PRE_MODIFY
-	      && legitimate_indexed_address_p (XEXP (addr, 0), false))
-	    return VSX_REGS;
-
-	  return FLOAT_REGS;
-	}
-
-      if (VECTOR_UNIT_ALTIVEC_P (mode))
+      if (VECTOR_UNIT_ALTIVEC_P (mode) || VECTOR_MEM_ALTIVEC_P (mode))
 	return ALTIVEC_REGS;
 
       return rclass;
@@ -25110,17 +25420,41 @@ static tree
 rs6000_builtin_reciprocal (unsigned int fn, bool md_fn,
 			   bool sqrt ATTRIBUTE_UNUSED)
 {
-  if (! (TARGET_RECIP && TARGET_PPC_GFXOPT && !optimize_size
-	 && flag_finite_math_only && !flag_trapping_math
-	 && flag_unsafe_math_optimizations))
+  if (optimize_insn_for_size_p ())
     return NULL_TREE;
 
   if (md_fn)
-    return NULL_TREE;
+    switch (fn)
+      {
+      case VSX_BUILTIN_XVSQRTDP:
+	if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode))
+	  return NULL_TREE;
+
+	return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF];
+
+      case VSX_BUILTIN_XVSQRTSP:
+	if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode))
+	  return NULL_TREE;
+
+	return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V4SF];
+
+      default:
+	return NULL_TREE;
+      }
+
   else
     switch (fn)
       {
+      case BUILT_IN_SQRT:
+	if (!RS6000_RECIP_AUTO_RSQRTE_P (DFmode))
+	  return NULL_TREE;
+
+	return rs6000_builtin_decls[RS6000_BUILTIN_RSQRT];
+
       case BUILT_IN_SQRTF:
+	if (!RS6000_RECIP_AUTO_RSQRTE_P (SFmode))
+	  return NULL_TREE;
+
 	return rs6000_builtin_decls[RS6000_BUILTIN_RSQRTF];
 
       default:
@@ -25128,192 +25462,300 @@ rs6000_builtin_reciprocal (unsigned int fn, bool md_fn,
       }
 }
 
-/* Newton-Raphson approximation of single-precision floating point divide n/d.
-   Assumes no trapping math and finite arguments.  */
+/* Load up a constant.  If the mode is a vector mode, splat the value across
+   all of the vector elements.  */
 
-void
-rs6000_emit_swdivsf (rtx dst, rtx n, rtx d)
+static rtx
+rs6000_load_constant_and_splat (enum machine_mode mode, REAL_VALUE_TYPE dconst)
+{
+  rtx reg;
+
+  if (mode == SFmode || mode == DFmode)
+    {
+      rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, mode);
+      reg = force_reg (mode, d);
+    }
+  else if (mode == V4SFmode)
+    {
+      rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, SFmode);
+      rtvec v = gen_rtvec (4, d, d, d, d);
+      reg = gen_reg_rtx (mode);
+      rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v));
+    }
+  else if (mode == V2DFmode)
+    {
+      rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, DFmode);
+      rtvec v = gen_rtvec (2, d, d);
+      reg = gen_reg_rtx (mode);
+      rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v));
+    }
+  else
+    gcc_unreachable ();
+
+  return reg;
+}
+
+/* Generate a FMADD instruction:
+	dst = (m1 * m2) + a
+
+   generating different RTL based on the fused multiply/add switch.  */
+
+static void
+rs6000_emit_madd (rtx dst, rtx m1, rtx m2, rtx a)
+{
+  enum machine_mode mode = GET_MODE (dst);
+
+  if (!TARGET_FUSED_MADD)
+    {
+      /* For the simple ops, use the generator function, rather than assuming
+	 that the RTL is standard.  */
+      enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+      enum insn_code acode = optab_handler (add_optab, mode)->insn_code;
+      gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+      gen_2arg_fn_t gen_add = (gen_2arg_fn_t) GEN_FCN (acode);
+      rtx mreg = gen_reg_rtx (mode);
+
+      gcc_assert (mcode != CODE_FOR_nothing && acode != CODE_FOR_nothing);
+      emit_insn (gen_mul (mreg, m1, m2));
+      emit_insn (gen_add (dst, mreg, a));
+    }
+
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, dst,
+			    gen_rtx_PLUS (mode,
+					  gen_rtx_MULT (mode, m1, m2),
+					  a)));
+}
+
+/* Generate a FMSUB instruction:
+	dst = (m1 * m2) - a
+
+   generating different RTL based on the fused multiply/add switch.  */
+
+static void
+rs6000_emit_msub (rtx dst, rtx m1, rtx m2, rtx a)
+{
+  enum machine_mode mode = GET_MODE (dst);
+
+  if (!TARGET_FUSED_MADD
+      || (mode == V4SFmode && VECTOR_UNIT_ALTIVEC_P (V4SFmode)))
+    {
+      /* For the simple ops, use the generator function, rather than assuming
+	 that the RTL is standard.  */
+      enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+      enum insn_code scode = optab_handler (add_optab, mode)->insn_code;
+      gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+      gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode);
+      rtx mreg = gen_reg_rtx (mode);
+
+      gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing);
+      emit_insn (gen_mul (mreg, m1, m2));
+      emit_insn (gen_sub (dst, mreg, a));
+    }
+
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, dst,
+			    gen_rtx_MINUS (mode,
+					   gen_rtx_MULT (mode, m1, m2),
+					   a)));
+}
+
+/* Generate a FNMSUB instruction:
+	dst = - ((m1 * m2) - a)
+
+   Which is equivalent to (except in the prescence of -0.0):
+	dst = a - (m1 * m2)
+
+   generating different RTL based on the fast-math and fused multiply/add
+   switches.  */
+
+static void
+rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a)
 {
-  rtx x0, e0, e1, y1, u0, v0, one;
+  enum machine_mode mode = GET_MODE (dst);
+
+  if (!TARGET_FUSED_MADD)
+    {
+      /* For the simple ops, use the generator function, rather than assuming
+	 that the RTL is standard.  */
+      enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code;
+      enum insn_code scode = optab_handler (sub_optab, mode)->insn_code;
+      gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode);
+      gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode);
+      rtx mreg = gen_reg_rtx (mode);
 
-  x0 = gen_reg_rtx (SFmode);
-  e0 = gen_reg_rtx (SFmode);
-  e1 = gen_reg_rtx (SFmode);
-  y1 = gen_reg_rtx (SFmode);
-  u0 = gen_reg_rtx (SFmode);
-  v0 = gen_reg_rtx (SFmode);
-  one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode));
+      gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing);
+      emit_insn (gen_mul (mreg, m1, m2));
+      emit_insn (gen_sub (dst, a, mreg));
+    }
+
+  else
+    {
+      rtx m = gen_rtx_MULT (mode, m1, m2);
+
+      if (!HONOR_SIGNED_ZEROS (mode))
+	emit_insn (gen_rtx_SET (VOIDmode, dst, gen_rtx_MINUS (mode, a, m)));
+
+      else
+	emit_insn (gen_rtx_SET (VOIDmode, dst,
+				gen_rtx_NEG (mode,
+					     gen_rtx_MINUS (mode, m, a))));
+    }
+}
+
+/* Newton-Raphson approximation of floating point divide with just 2 passes
+   (either single precision floating point, or newer machines with higher
+   accuracy estimates).  Support both scalar and vector divide.  Assumes no
+   trapping math and finite arguments.  */
+
+static void
+rs6000_emit_swdiv_high_precision (rtx dst, rtx n, rtx d)
+{
+  enum machine_mode mode = GET_MODE (dst);
+  rtx x0, e0, e1, y1, u0, v0;
+  enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+  gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
+  rtx one = rs6000_load_constant_and_splat (mode, dconst1);
+
+  gcc_assert (code != CODE_FOR_nothing);
 
   /* x0 = 1./d estimate */
+  x0 = gen_reg_rtx (mode);
   emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (SFmode, gen_rtvec (1, d),
+			  gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
 					  UNSPEC_FRES)));
-  /* e0 = 1. - d * x0 */
-  emit_insn (gen_rtx_SET (VOIDmode, e0,
-			  gen_rtx_MINUS (SFmode, one,
-					 gen_rtx_MULT (SFmode, d, x0))));
-  /* e1 = e0 + e0 * e0 */
-  emit_insn (gen_rtx_SET (VOIDmode, e1,
-			  gen_rtx_PLUS (SFmode,
-					gen_rtx_MULT (SFmode, e0, e0), e0)));
-  /* y1 = x0 + e1 * x0 */
-  emit_insn (gen_rtx_SET (VOIDmode, y1,
-			  gen_rtx_PLUS (SFmode,
-					gen_rtx_MULT (SFmode, e1, x0), x0)));
-  /* u0 = n * y1 */
-  emit_insn (gen_rtx_SET (VOIDmode, u0,
-			  gen_rtx_MULT (SFmode, n, y1)));
-  /* v0 = n - d * u0 */
-  emit_insn (gen_rtx_SET (VOIDmode, v0,
-			  gen_rtx_MINUS (SFmode, n,
-					 gen_rtx_MULT (SFmode, d, u0))));
-  /* dst = u0 + v0 * y1 */
-  emit_insn (gen_rtx_SET (VOIDmode, dst,
-			  gen_rtx_PLUS (SFmode,
-					gen_rtx_MULT (SFmode, v0, y1), u0)));
-}
-
-/* Newton-Raphson approximation of double-precision floating point divide n/d.
-   Assumes no trapping math and finite arguments.  */
 
-void
-rs6000_emit_swdivdf (rtx dst, rtx n, rtx d)
+  e0 = gen_reg_rtx (mode);
+  rs6000_emit_nmsub (e0, d, x0, one);		/* e0 = 1. - (d * x0) */
+
+  e1 = gen_reg_rtx (mode);
+  rs6000_emit_madd (e1, e0, e0, e0);		/* e1 = (e0 * e0) + e0 */
+
+  y1 = gen_reg_rtx (mode);
+  rs6000_emit_madd (y1, e1, x0, x0);		/* y1 = (e1 * x0) + x0 */
+
+  u0 = gen_reg_rtx (mode);
+  emit_insn (gen_mul (u0, n, y1));		/* u0 = n * y1 */
+
+  v0 = gen_reg_rtx (mode);
+  rs6000_emit_nmsub (v0, d, u0, n);		/* v0 = n - (d * u0) */
+
+  rs6000_emit_madd (dst, v0, y1, u0);		/* dst = (v0 * y1) + u0 */
+}
+
+/* Newton-Raphson approximation of floating point divide that has a low
+   precision estimate.  Assumes no trapping math and finite arguments.  */
+
+static void
+rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
 {
+  enum machine_mode mode = GET_MODE (dst);
   rtx x0, e0, e1, e2, y1, y2, y3, u0, v0, one;
+  enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+  gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
 
-  x0 = gen_reg_rtx (DFmode);
-  e0 = gen_reg_rtx (DFmode);
-  e1 = gen_reg_rtx (DFmode);
-  e2 = gen_reg_rtx (DFmode);
-  y1 = gen_reg_rtx (DFmode);
-  y2 = gen_reg_rtx (DFmode);
-  y3 = gen_reg_rtx (DFmode);
-  u0 = gen_reg_rtx (DFmode);
-  v0 = gen_reg_rtx (DFmode);
-  one = force_reg (DFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, DFmode));
+  gcc_assert (code != CODE_FOR_nothing);
+
+  one = rs6000_load_constant_and_splat (mode, dconst1);
 
   /* x0 = 1./d estimate */
+  x0 = gen_reg_rtx (mode);
   emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (DFmode, gen_rtvec (1, d),
+			  gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
 					  UNSPEC_FRES)));
-  /* e0 = 1. - d * x0 */
-  emit_insn (gen_rtx_SET (VOIDmode, e0,
-			  gen_rtx_MINUS (DFmode, one,
-					 gen_rtx_MULT (SFmode, d, x0))));
-  /* y1 = x0 + e0 * x0 */
-  emit_insn (gen_rtx_SET (VOIDmode, y1,
-			  gen_rtx_PLUS (DFmode,
-					gen_rtx_MULT (DFmode, e0, x0), x0)));
-  /* e1 = e0 * e0 */
-  emit_insn (gen_rtx_SET (VOIDmode, e1,
-			  gen_rtx_MULT (DFmode, e0, e0)));
-  /* y2 = y1 + e1 * y1 */
-  emit_insn (gen_rtx_SET (VOIDmode, y2,
-			  gen_rtx_PLUS (DFmode,
-					gen_rtx_MULT (DFmode, e1, y1), y1)));
-  /* e2 = e1 * e1 */
-  emit_insn (gen_rtx_SET (VOIDmode, e2,
-			  gen_rtx_MULT (DFmode, e1, e1)));
-  /* y3 = y2 + e2 * y2 */
-  emit_insn (gen_rtx_SET (VOIDmode, y3,
-			  gen_rtx_PLUS (DFmode,
-					gen_rtx_MULT (DFmode, e2, y2), y2)));
-  /* u0 = n * y3 */
-  emit_insn (gen_rtx_SET (VOIDmode, u0,
-			  gen_rtx_MULT (DFmode, n, y3)));
-  /* v0 = n - d * u0 */
-  emit_insn (gen_rtx_SET (VOIDmode, v0,
-			  gen_rtx_MINUS (DFmode, n,
-					 gen_rtx_MULT (DFmode, d, u0))));
-  /* dst = u0 + v0 * y3 */
-  emit_insn (gen_rtx_SET (VOIDmode, dst,
-			  gen_rtx_PLUS (DFmode,
-					gen_rtx_MULT (DFmode, v0, y3), u0)));
-}
-
-
-/* Newton-Raphson approximation of single-precision floating point rsqrt.
-   Assumes no trapping math and finite arguments.  */
+
+  e0 = gen_reg_rtx (mode);
+  rs6000_emit_nmsub (e0, d, x0, one);		/* e0 = 1. - d * x0 */
+
+  y1 = gen_reg_rtx (mode);
+  rs6000_emit_madd (y1, e0, x0, x0);		/* y1 = x0 + e0 * x0 */
+
+  e1 = gen_reg_rtx (mode);
+  emit_insn (gen_mul (e1, e0, e0));		/* e1 = e0 * e0 */
+
+  y2 = gen_reg_rtx (mode);
+  rs6000_emit_madd (y2, e1, y1, y1);		/* y2 = y1 + e1 * y1 */
+
+  e2 = gen_reg_rtx (mode);
+  emit_insn (gen_mul (e2, e1, e1));		/* e2 = e1 * e1 */
+
+  y3 = gen_reg_rtx (mode);
+  rs6000_emit_madd (y3, e2, y2, y2);		/* y3 = y2 + e2 * y2 */
+
+  u0 = gen_reg_rtx (mode);
+  emit_insn (gen_mul (u0, n, y3));		/* u0 = n * y3 */
+
+  v0 = gen_reg_rtx (mode);
+  rs6000_emit_nmsub (v0, d, u0, n);		/* v0 = n - d * u0 */
+
+  rs6000_emit_madd (dst, v0, y3, u0);		/* dst = u0 + v0 * y3 */
+}
+
+/* Newton-Raphson approximation of floating point divide DST = N/D.  If NOTE_P,
+   add a reg_note saying that this was a division.  Support both scalar and
+   vector divide.  Assumes no trapping math and finite arguments.  */
 
 void
-rs6000_emit_swrsqrtsf (rtx dst, rtx src)
-{
-  rtx x0, x1, x2, y1, u0, u1, u2, v0, v1, v2, t0,
-    half, one, halfthree, c1, cond, label;
-
-  x0 = gen_reg_rtx (SFmode);
-  x1 = gen_reg_rtx (SFmode);
-  x2 = gen_reg_rtx (SFmode);
-  y1 = gen_reg_rtx (SFmode);
-  u0 = gen_reg_rtx (SFmode);
-  u1 = gen_reg_rtx (SFmode);
-  u2 = gen_reg_rtx (SFmode);
-  v0 = gen_reg_rtx (SFmode);
-  v1 = gen_reg_rtx (SFmode);
-  v2 = gen_reg_rtx (SFmode);
-  t0 = gen_reg_rtx (SFmode);
-  halfthree = gen_reg_rtx (SFmode);
-  cond = gen_rtx_REG (CCFPmode, CR1_REGNO);
-  label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ());
+rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
+{
+  enum machine_mode mode = GET_MODE (dst);
 
-  /* check 0.0, 1.0, NaN, Inf by testing src * src = src */
-  emit_insn (gen_rtx_SET (VOIDmode, t0,
-			  gen_rtx_MULT (SFmode, src, src)));
+  if (RS6000_RECIP_HIGH_PRECISION_P (mode))
+    rs6000_emit_swdiv_high_precision (dst, n, d);
+  else
+    rs6000_emit_swdiv_low_precision (dst, n, d);
 
-  emit_insn (gen_rtx_SET (VOIDmode, cond,
-			  gen_rtx_COMPARE (CCFPmode, t0, src)));
-  c1 = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
-  emit_unlikely_jump (c1, label);
+  if (note_p)
+    add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d));
+}
 
-  half = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode));
-  one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode));
+/* Newton-Raphson approximation of single/double-precision floating point
+   rsqrt.  Assumes no trapping math and finite arguments.  */
+
+void
+rs6000_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  rtx x0 = gen_reg_rtx (mode);
+  rtx y = gen_reg_rtx (mode);
+  int passes = (TARGET_RECIP_PRECISION) ? 2 : 3;
+  REAL_VALUE_TYPE dconst3_2;
+  int i;
+  rtx halfthree;
+  enum insn_code code = optab_handler (smul_optab, mode)->insn_code;
+  gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
 
-  /* halfthree = 1.5 = 1.0 + 0.5 */
-  emit_insn (gen_rtx_SET (VOIDmode, halfthree,
-			  gen_rtx_PLUS (SFmode, one, half)));
+  gcc_assert (code != CODE_FOR_nothing);
+
+  /* Load up the constant 1.5 either as a scalar, or as a vector.  */
+  real_from_integer (&dconst3_2, VOIDmode, 3, 0, 0);
+  SET_REAL_EXP (&dconst3_2, REAL_EXP (&dconst3_2) - 1);
+
+  halfthree = rs6000_load_constant_and_splat (mode, dconst3_2);
 
   /* x0 = rsqrt estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
-			  gen_rtx_UNSPEC (SFmode, gen_rtvec (1, src),
+			  gen_rtx_UNSPEC (mode, gen_rtvec (1, src),
 					  UNSPEC_RSQRT)));
 
-  /* y1 = 0.5 * src = 1.5 * src - src -> fewer constants */
-  emit_insn (gen_rtx_SET (VOIDmode, y1,
-			  gen_rtx_MINUS (SFmode,
-					 gen_rtx_MULT (SFmode, src, halfthree),
-					 src)));
-
-  /* x1 = x0 * (1.5 - y1 * (x0 * x0)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u0,
-			  gen_rtx_MULT (SFmode, x0, x0)));
-  emit_insn (gen_rtx_SET (VOIDmode, v0,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u0))));
-  emit_insn (gen_rtx_SET (VOIDmode, x1,
-			  gen_rtx_MULT (SFmode, x0, v0)));
-
-  /* x2 = x1 * (1.5 - y1 * (x1 * x1)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u1,
-			  gen_rtx_MULT (SFmode, x1, x1)));
-  emit_insn (gen_rtx_SET (VOIDmode, v1,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u1))));
-  emit_insn (gen_rtx_SET (VOIDmode, x2,
-			  gen_rtx_MULT (SFmode, x1, v1)));
-
-  /* dst = x2 * (1.5 - y1 * (x2 * x2)) */
-  emit_insn (gen_rtx_SET (VOIDmode, u2,
-			  gen_rtx_MULT (SFmode, x2, x2)));
-  emit_insn (gen_rtx_SET (VOIDmode, v2,
-			  gen_rtx_MINUS (SFmode,
-					 halfthree,
-					 gen_rtx_MULT (SFmode, y1, u2))));
-  emit_insn (gen_rtx_SET (VOIDmode, dst,
-			  gen_rtx_MULT (SFmode, x2, v2)));
+  /* y = 0.5 * src = 1.5 * src - src -> fewer constants */
+  rs6000_emit_msub (y, src, halfthree, src);
 
-  emit_label (XEXP (label, 0));
+  for (i = 0; i < passes; i++)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx u = gen_reg_rtx (mode);
+      rtx v = gen_reg_rtx (mode);
+
+      /* x1 = x0 * (1.5 - y * (x0 * x0)) */
+      emit_insn (gen_mul (u, x0, x0));
+      rs6000_emit_nmsub (v, y, u, halfthree);
+      emit_insn (gen_mul (x1, x0, v));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+  return;
 }
 
 /* Emit popcount intrinsic on TARGET_POPCNTB (Power5) and TARGET_POPCNTD
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index 327673e160c..5a1d7eeed68 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -543,6 +543,46 @@ extern int rs6000_vector_align[];
 /* E500 processors only support plain "sync", not lwsync.  */
 #define TARGET_NO_LWSYNC TARGET_E500
 
+/* Which machine supports the various reciprocal estimate instructions.  */
+#define TARGET_FRES	(TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT \
+			 && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+
+#define TARGET_FRE	(TARGET_HARD_FLOAT && TARGET_FPRS \
+			 && TARGET_DOUBLE_FLOAT \
+			 && (TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)))
+
+#define TARGET_FRSQRTES	(TARGET_HARD_FLOAT && TARGET_POPCNTB \
+			 && TARGET_FPRS && TARGET_SINGLE_FLOAT)
+
+#define TARGET_FRSQRTE	(TARGET_HARD_FLOAT && TARGET_FPRS \
+			 && TARGET_DOUBLE_FLOAT \
+			 && (TARGET_PPC_GFXOPT || VECTOR_UNIT_VSX_P (DFmode)))
+
+/* Whether the various reciprocal divide/square root estimate instructions
+   exist, and whether we should automatically generate code for the instruction
+   by default.  */
+#define RS6000_RECIP_MASK_HAVE_RE	0x1	/* have RE instruction.  */
+#define RS6000_RECIP_MASK_AUTO_RE	0x2	/* generate RE by default.  */
+#define RS6000_RECIP_MASK_HAVE_RSQRTE	0x4	/* have RSQRTE instruction.  */
+#define RS6000_RECIP_MASK_AUTO_RSQRTE	0x8	/* gen. RSQRTE by default.  */
+
+extern unsigned char rs6000_recip_bits[];
+
+#define RS6000_RECIP_HAVE_RE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RE)
+
+#define RS6000_RECIP_AUTO_RE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RE)
+
+#define RS6000_RECIP_HAVE_RSQRTE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RSQRTE)
+
+#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \
+  (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE)
+
+#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \
+  ((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION)
+
 /* Sometimes certain combinations of command options do not make sense
    on a particular target machine.  You can define a macro
    `OVERRIDE_OPTIONS' to take account of this.  This macro, if
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index bcb66ec1479..8f7093a6735 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -220,6 +220,9 @@
 ; These modes do not fit in integer registers in 32-bit mode.
 (define_mode_iterator DIFD [DI DF DD])
 
+;; Iterator for reciprocal estimate instructions
+(define_mode_iterator RECIPF [SF DF V4SF V2DF])
+
 ; Various instructions that come in SI and DI forms.
 ; A generic w/d attribute, for things like cmpw/cmpd.
 (define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")])
@@ -240,6 +243,11 @@
 (define_mode_attr mptrsize [(SI "si")
 			    (DI "di")])
 
+(define_mode_attr rreg [(SF   "f")
+			(DF   "Ws")
+			(V4SF "Wf")
+			(V2DF "Wd")])
+
 
 ;; Start with fixed-point load and store insns.  Here we put only the more
 ;; complex forms.  Basic data transfer is done later.
@@ -5563,6 +5571,45 @@
   [(set_attr "type" "var_delayed_compare,delayed_compare,var_delayed_compare,delayed_compare")
    (set_attr "length" "4,4,8,8")])
 
+;; Builtins to replace a division to generate FRE reciprocal estimate
+;; instructions and the necessary fixup instructions
+(define_expand "recip<mode>3"
+  [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+   (match_operand:RECIPF 1 "gpc_reg_operand" "")
+   (match_operand:RECIPF 2 "gpc_reg_operand" "")]
+  "RS6000_RECIP_HAVE_RE_P (<MODE>mode)"
+{
+   rs6000_emit_swdiv (operands[0], operands[1], operands[2], false);
+   DONE;
+})
+
+;; Split to create division from FRE/FRES/etc. and fixup instead of the normal
+;; hardware division.  This is only done before register allocation and with
+;; -ffast-math.  This must appear before the divsf3/divdf3 insns.
+(define_split
+  [(set (match_operand:RECIPF 0 "gpc_reg_operand" "")
+	(div:RECIPF (match_operand 1 "gpc_reg_operand" "")
+		    (match_operand 2 "gpc_reg_operand" "")))]
+  "RS6000_RECIP_AUTO_RE_P (<MODE>mode)
+   && can_create_pseudo_p () && optimize_insn_for_speed_p ()
+   && flag_finite_math_only && !flag_trapping_math && flag_reciprocal_math"
+  [(const_int 0)]
+{
+  rs6000_emit_swdiv (operands[0], operands[1], operands[2], true);
+  DONE;
+})
+
+;; Builtins to replace 1/sqrt(x) with instructions using RSQRTE and the
+;; appropriate fixup.
+(define_expand "rsqrt<mode>2"
+  [(match_operand:RECIPF 0 "gpc_reg_operand" "")
+   (match_operand:RECIPF 1 "gpc_reg_operand" "")]
+  "RS6000_RECIP_HAVE_RSQRT_P (<MODE>mode)"
+{
+  rs6000_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_split
   [(set (match_operand:CC 3 "cc_reg_not_micro_cr0_operand" "")
 	(compare:CC (ashiftrt:SI (match_operand:SI 1 "gpc_reg_operand" "")
@@ -5766,22 +5813,10 @@
   "{fd|fdiv} %0,%1,%2"
   [(set_attr "type" "ddiv")])
 
-(define_expand "recipsf3"
-  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
-	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
-		    (match_operand:SF 2 "gpc_reg_operand" "f")]
-		   UNSPEC_FRES))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
-   && flag_finite_math_only && !flag_trapping_math"
-{
-   rs6000_emit_swdivsf (operands[0], operands[1], operands[2]);
-   DONE;
-})
-
 (define_insn "fres"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
-  "TARGET_PPC_GFXOPT && flag_finite_math_only"
+  "TARGET_FRES"
   "fres %0,%1"
   [(set_attr "type" "fp")])
 
@@ -5931,23 +5966,12 @@
   "fsqrt %0,%1"
   [(set_attr "type" "dsqrt")])
 
-(define_expand "rsqrtsf2"
+(define_insn "*rsqrtsf_internal1"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
 	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
 		   UNSPEC_RSQRT))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size
-   && flag_finite_math_only && !flag_trapping_math"
-{
-  rs6000_emit_swrsqrtsf (operands[0], operands[1]);
-  DONE;
-})
-
-(define_insn "*rsqrt_internal1"
-  [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
-	(unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")]
-		   UNSPEC_RSQRT))]
-  "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT"
-  "frsqrte %0,%1"
+  "TARGET_FRSQRTES"
+  "frsqrtes %0,%1"
   [(set_attr "type" "fp")])
 
 (define_expand "copysignsf3"
@@ -5960,9 +5984,18 @@
 	                     (match_dup 5))
 			 (match_dup 3)
 			 (match_dup 4)))]
-  "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
-   && !HONOR_NANS (SFmode) && !HONOR_SIGNED_ZEROS (SFmode)"
+  "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT
+   && ((TARGET_PPC_GFXOPT
+        && !HONOR_NANS (SFmode)
+        && !HONOR_SIGNED_ZEROS (SFmode))
+       || VECTOR_UNIT_VSX_P (DFmode))"
   {
+     if (VECTOR_UNIT_VSX_P (DFmode))
+       {
+	 emit_insn (gen_vsx_copysignsf3 (operands[0], operands[1], operands[2],
+					 CONST0_RTX (SFmode)));
+	 DONE;
+       }
      operands[3] = gen_reg_rtx (SFmode);
      operands[4] = gen_reg_rtx (SFmode);
      operands[5] = CONST0_RTX (SFmode);
@@ -6222,31 +6255,21 @@
   "{fd|fdiv} %0,%1,%2"
   [(set_attr "type" "ddiv")])
 
-(define_expand "recipdf3"
-  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
-	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")
-		    (match_operand:DF 2 "gpc_reg_operand" "d")]
-		   UNSPEC_FRES))]
-  "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_POPCNTB && !optimize_size
-   && flag_finite_math_only && !flag_trapping_math"
-{
-   rs6000_emit_swdivdf (operands[0], operands[1], operands[2]);
-   DONE;
-})
-
-(define_expand "fred"
-  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
-	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")] UNSPEC_FRES))]
-  "(TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)) && flag_finite_math_only"
-  "")
-
 (define_insn "*fred_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=f")
 	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))]
-  "TARGET_POPCNTB && flag_finite_math_only && !VECTOR_UNIT_VSX_P (DFmode)"
+  "TARGET_FRE && !VECTOR_UNIT_VSX_P (DFmode)"
   "fre %0,%1"
   [(set_attr "type" "fp")])
 
+(define_insn "*rsqrtdf_internal1"
+  [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
+	(unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")]
+		   UNSPEC_RSQRT))]
+  "TARGET_FRSQRTE && !VECTOR_UNIT_VSX_P (DFmode)"
+  "frsqrte %0,%1"
+  [(set_attr "type" "fp")])
+
 (define_insn "*fmadddf4_fpr"
   [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
 	(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 63f0f8c1582..e70172a19a6 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -195,8 +195,16 @@ Target Report Var(TARGET_XL_COMPAT)
 Conform more closely to IBM XLC semantics
 
 mrecip
-Target Report Var(TARGET_RECIP)
-Generate software reciprocal sqrt for better throughput
+Target Report
+Generate software reciprocal divide and square root for better throughput.
+
+mrecip=
+Target Report RejectNegative Joined
+Generate software reciprocal divide and square root for better throughput.
+
+mrecip-precision
+Target Report Mask(RECIP_PRECISION)
+Assume that the reciprocal estimate instructions provide more accuracy.
 
 mno-fp-in-toc
 Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC)
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 46fb2926c9f..760baeb458d 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -267,6 +267,20 @@
   "VECTOR_UNIT_VSX_P (<MODE>mode)"
   "")
 
+(define_expand "rsqrte<mode>2"
+  [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+        (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "")]
+		      UNSPEC_RSQRT))]
+  "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+  "")
+
+(define_expand "re<mode>2"
+  [(set (match_operand:VEC_F 0 "vfloat_operand" "")
+	(unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "f")]
+		      UNSPEC_FRES))]
+  "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)"
+  "")
+
 (define_expand "ftrunc<mode>2"
   [(set (match_operand:VEC_F 0 "vfloat_operand" "")
   	(fix:VEC_F (match_operand:VEC_F 1 "vfloat_operand" "")))]
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 7d572a48412..213d53ae5d1 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -195,7 +195,7 @@
    (UNSPEC_VSX_MSUB		511)
    (UNSPEC_VSX_NMADD		512)
    (UNSPEC_VSX_NMSUB		513)
-   (UNSPEC_VSX_RSQRTE		514)
+   ;; 514 deleted
    (UNSPEC_VSX_TDIV		515)
    (UNSPEC_VSX_TSQRT		516)
    (UNSPEC_VSX_XXPERMDI		517)
@@ -446,10 +446,10 @@
   [(set_attr "type" "<VStype_sqrt>")
    (set_attr "fp_type" "<VSfptype_sqrt>")])
 
-(define_insn "vsx_rsqrte<mode>2"
+(define_insn "*vsx_rsqrte<mode>2"
   [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa")
 	(unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")]
-		      UNSPEC_VSX_RSQRTE))]
+		      UNSPEC_RSQRT))]
   "VECTOR_UNIT_VSX_P (<MODE>mode)"
   "x<VSv>rsqrte<VSs> %x0,%x1"
   [(set_attr "type" "<VStype_simple>")
@@ -862,6 +862,20 @@
   [(set_attr "type" "<VStype_simple>")
    (set_attr "fp_type" "<VSfptype_simple>")])
 
+;; Special version of copysign for single precision that knows internally
+;; scalar single values are kept as double
+(define_insn "vsx_copysignsf3"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+	(if_then_else:SF
+	 (ge:SF (match_operand:SF 2 "vsx_register_operand" "f")
+		(match_operand:SF 3 "zero_constant" "j"))
+	 (abs:SF (match_operand:SF 1 "vsx_register_operand" "f"))
+	 (neg:SF (abs:SF (match_dup 1)))))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "xscpsgndp %x0,%x2,%x1"
+  [(set_attr "type" "fp")
+   (set_attr "fp_type" "fp_addsub_d")])
+
 ;; For the conversions, limit the register class for the integer value to be
 ;; the fprs because we don't want to add the altivec registers to movdi/movsi.
 ;; For the unsigned tests, there isn't a generic double -> unsigned conversion
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 8e9a7061b2b..5f0d7624a04 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10994,6 +10994,10 @@ vector unsigned char vec_vrlb (vector unsigned char,
 
 vector float vec_round (vector float);
 
+vector float vec_recip (vector float, vector float);
+
+vector float vec_rsqrt (vector float);
+
 vector float vec_rsqrte (vector float);
 
 vector float vec_sel (vector float, vector float, vector bool int);
@@ -11922,8 +11926,10 @@ vector double vec_or (vector bool long, vector double);
 vector double vec_perm (vector double,
                         vector double,
                         vector unsigned char);
-vector float vec_rint (vector float);
 vector double vec_rint (vector double);
+vector double vec_recip (vector double, vector double);
+vector double vec_rsqrt (vector double);
+vector double vec_rsqrte (vector double);
 vector double vec_sel (vector double, vector double, vector bool long);
 vector double vec_sel (vector double, vector double, vector unsigned long);
 vector double vec_sub (vector double, vector double);
@@ -11964,10 +11970,20 @@ GCC provides a few other builtins on Powerpc to access certain instructions:
 float __builtin_recipdivf (float, float);
 float __builtin_rsqrtf (float);
 double __builtin_recipdiv (double, double);
+double __builtin_rsqrt (double);
 long __builtin_bpermd (long, long);
 int __builtin_bswap16 (int);
 @end smallexample
 
+The @code{vec_rsqrt}, @code{__builtin_rsqrt}, and
+@code{__builtin_rsqrtf} functions generate multiple instructions to
+implement the reciprocal sqrt functionality using reciprocal sqrt
+estimate instructions.
+
+The @code{__builtin_recipdiv}, and @code{__builtin_recipdivf}
+functions generate multiple instructions to implement division using
+the reciprocal estimate instructions.
+
 @node RX Built-in Functions
 @subsection RX Built-in Functions
 GCC supports some of the RX instructions which cannot be expressed in
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 2a4ea479682..d8c0c22bed6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -783,7 +783,8 @@ See RS/6000 and PowerPC Options.
 -mfloat-gprs=yes  -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol
 -mprototype  -mno-prototype @gol
 -msim  -mmvme  -mads  -myellowknife  -memb  -msdata @gol
--msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread}
+-msdata=@var{opt}  -mvxworks  -G @var{num}  -pthread @gol
+-mrecip -mrecip=@var{opt} -mno-recip -mrecip-precision -mno-recip-precision}
 
 @emph{RX Options}
 @gccoptlist{-m64bit-doubles  -m32bit-doubles  -fpu  -nofpu@gol
@@ -14975,17 +14976,6 @@ values for @var{cpu_type} are used for @option{-mtune} as for
 architecture, registers, and mnemonics set by @option{-mcpu}, but the
 scheduling parameters set by @option{-mtune}.
 
-@item -mswdiv
-@itemx -mno-swdiv
-@opindex mswdiv
-@opindex mno-swdiv
-Generate code to compute division as reciprocal estimate and iterative
-refinement, creating opportunities for increased throughput.  This
-feature requires: optional PowerPC Graphics instruction set for single
-precision and FRE instruction for double precision, assuming divides
-cannot generate user-visible traps, and the domain values not include
-Infinities, denormals or zero denominator.
-
 @item -maltivec
 @itemx -mno-altivec
 @opindex maltivec
@@ -15641,6 +15631,52 @@ sequence.
 Adds support for multithreading with the @dfn{pthreads} library.
 This option sets flags for both the preprocessor and linker.
 
+@item -mrecip
+@itemx -mno-recip
+@opindex mrecip
+This option will enable GCC to use the reciprocal estimate and
+reciprocal square root estimate instructions with additional
+Newton-Raphson steps to increase precision instead of doing a divide or
+square root and divide for floating point arguments.  You should use
+the @option{-ffast-math} option when using @option{-mrecip} (or at
+least @option{-funsafe-math-optimizations},
+@option{-finite-math-only}, @option{-freciprocal-math} and
+@option{-fno-trapping-math}).  Note that while the throughput of the
+sequence is generally higher than the throughput of the non-reciprocal
+instruction, the precision of the sequence can be decreased by up to 2
+ulp (i.e. the inverse of 1.0 equals 0.99999994) for reciprocal square
+roots.
+
+@item -mrecip=@var{opt}
+@opindex mrecip=opt
+This option allows to control which reciprocal estimate instructions
+may be used.  @var{opt} is a comma separated list of options, that may
+be preceeded by a @code{!} to invert the option:
+@code{all}: enable all estimate instructions,
+@code{default}: enable the default instructions, equvalent to @option{-mrecip},
+@code{none}: disable all estimate instructions, equivalent to @option{-mno-recip};
+@code{div}: enable the reciprocal approximation instructions for both single and double precision;
+@code{divf}: enable the single precision reciprocal approximation instructions;
+@code{divd}: enable the double precision reciprocal approximation instructions;
+@code{rsqrt}: enable the reciprocal square root approximation instructions for both single and double precision;
+@code{rsqrtf}: enable the single precision reciprocal square root approximation instructions;
+@code{rsqrtd}: enable the double precision reciprocal square root approximation instructions;
+
+So for example, @option{-mrecip=all,!rsqrtd} would enable the
+all of the reciprocal estimate instructions, except for the
+@code{FRSQRTE}, @code{XSRSQRTEDP}, and @code{XVRSQRTEDP} instructions
+which handle the double precision reciprocal square root calculations.
+
+@item -mrecip-precision
+@itemx -mno-recip-precision
+@opindex mrecip-precision
+Assume (do not assume) that the reciprocal estimate instructions
+provide higher precision estimates than is mandated by the powerpc
+ABI.  Selecting @option{-mcpu=power6} or @option{-mcpu=power7}
+automatically selects @option{-mrecip-precision}.  The double
+precision square root estimate instructions are not generated by
+default on low precision machines, since they do not provide an
+estimate that converges after three steps.
 @end table
 
 @node RX Options
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 4f97f01350f..ccbd30c6fdc 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,16 @@
+2010-06-02  Michael Meissner  <meissner@linux.vnet.ibm.com>
+
+	PR target/44218
+	* gcc.target/powerpc/recip-1.c: New test for -mrecip support.
+	* gcc.target/powerpc/recip-2.c: Ditto.
+	* gcc.target/powerpc/recip-3.c: Ditto.
+	* gcc.target/powerpc/recip-4.c: Ditto.
+	* gcc.target/powerpc/recip-5.c: Ditto.
+	* gcc.target/powerpc/recip-6.c: Ditto.
+	* gcc.target/powerpc/recip-7.c: Ditto.
+	* gcc.target/powerpc/recip-test.h: Ditto.
+	* gcc.target/powerpc/recip-test2.h: Ditto.
+
 2010-06-02  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* g++.dg/torture/pr44295.C (size_t): Use __SIZE_TYPE__.
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-1.c b/gcc/testsuite/gcc.target/powerpc/recip-1.c
new file mode 100644
index 00000000000..d1e383dc4ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */
+/* { dg-final { scan-assembler-times "frsqrte" 2 } } */
+/* { dg-final { scan-assembler-times "fmsub" 2 } } */
+/* { dg-final { scan-assembler-times "fmul" 8 } } */
+/* { dg-final { scan-assembler-times "fnmsub" 4 } } */
+
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-2.c b/gcc/testsuite/gcc.target/powerpc/recip-2.c
new file mode 100644
index 00000000000..69442733aab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power5" } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 6 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 3 } } */
+/* { dg-final { scan-assembler-times "fsqrt" 1 } } */
+
+/* power5 resqrte is not accurate enough, and should not be generated by
+   default for -mrecip.  */
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-3.c b/gcc/testsuite/gcc.target/powerpc/recip-3.c
new file mode 100644
index 00000000000..80a34e8ee59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power7" } */
+/* { dg-final { scan-assembler-times "xsrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xsnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
+/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
+/* { dg-final { scan-assembler-times "fmuls" 4 } } */
+/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
+
+double
+rsqrt_d (double a)
+{
+  return 1.0 / __builtin_sqrt (a);
+}
+
+float
+rsqrt_f (float a)
+{
+  return 1.0f / __builtin_sqrtf (a);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-4.c b/gcc/testsuite/gcc.target/powerpc/recip-4.c
new file mode 100644
index 00000000000..bd496d70e25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-4.c
@@ -0,0 +1,36 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvrsqrtedp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.dp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmuldp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */
+/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */
+/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */
+
+#define SIZE 1024
+
+extern double a_d[SIZE] __attribute__((__aligned__(32)));
+extern double b_d[SIZE] __attribute__((__aligned__(32)));
+
+void
+vectorize_rsqrt_d (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    a_d[i] = 1.0 / __builtin_sqrt (b_d[i]);
+}
+
+extern float a_f[SIZE] __attribute__((__aligned__(32)));
+extern float b_f[SIZE] __attribute__((__aligned__(32)));
+
+void
+vectorize_rsqrt_f (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    a_f[i] = 1.0f / __builtin_sqrtf (b_f[i]);
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-5.c b/gcc/testsuite/gcc.target/powerpc/recip-5.c
new file mode 100644
index 00000000000..4a9c496201a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-5.c
@@ -0,0 +1,94 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-options "-O3 -ftree-vectorize -mrecip=all -ffast-math -mcpu=power7 -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvredp" 4 } } */
+/* { dg-final { scan-assembler-times "xvresp" 5 } } */
+/* { dg-final { scan-assembler-times "xsredp" 2 } } */
+/* { dg-final { scan-assembler-times "fres" 2 } } */
+
+#include <altivec.h>
+
+float f_recip (float a, float b) { return __builtin_recipdivf (a, b); }
+double d_recip (double a, double b) { return __builtin_recipdiv (a, b); }
+
+float f_div (float a, float b) { return a / b; }
+double d_div (double a, double b) { return a / b; }
+
+#define SIZE 1024
+
+double d_a[SIZE] __attribute__((__aligned__(32)));
+double d_b[SIZE] __attribute__((__aligned__(32)));
+double d_c[SIZE] __attribute__((__aligned__(32)));
+
+float f_a[SIZE] __attribute__((__aligned__(32)));
+float f_b[SIZE] __attribute__((__aligned__(32)));
+float f_c[SIZE] __attribute__((__aligned__(32)));
+
+void vec_f_recip (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = __builtin_recipdivf (f_b[i], f_c[i]);
+}
+
+void vec_d_recip (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = __builtin_recipdiv (d_b[i], d_c[i]);
+}
+
+void vec_f_div (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / f_c[i];
+}
+
+void vec_f_div2 (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / 2.0f;
+}
+
+void vec_f_div53 (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    f_a[i] = f_b[i] / 53.0f;
+}
+
+void vec_d_div (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / d_c[i];
+}
+
+void vec_d_div2 (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / 2.0;
+}
+
+void vec_d_div53 (void)
+{
+  int i;
+
+  for (i = 0; i < SIZE; i++)
+    d_a[i] = d_b[i] / 53.0;
+}
+
+vector float v4sf_recip1 (vector float a, vector float b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip2 (vector float a, vector float b) { return __builtin_altivec_vrecipdivfp (a, b); }
+vector double v2df_recip1 (vector double a, vector double b) { return vec_recipdiv (a, b); }
+vector float v4sf_recip3 (vector float a, vector float b) { return __builtin_vsx_xvrecipdivsp (a, b); }
+vector double v2df_recip2 (vector double a, vector double b) { return __builtin_vsx_xvrecipdivdp (a, b); }
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-6.c b/gcc/testsuite/gcc.target/powerpc/recip-6.c
new file mode 100644
index 00000000000..7d71df6709d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-6.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target vsx_hw } */
+/* { dg-options "-mcpu=power7 -O3 -ftree-vectorize -ffast-math -mrecip=all -mrecip-precision" } */
+
+/* Check reciprocal estimate functions for accuracy.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "recip-test.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-7.c b/gcc/testsuite/gcc.target/powerpc/recip-7.c
new file mode 100644
index 00000000000..7b32ba076a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-7.c
@@ -0,0 +1,16 @@
+/* { dg-do run { target { powerpc*-*-linux* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */
+/* { dg-require-effective-target ppc_recip_hw } */
+/* { dg-options "-O3 -ftree-vectorize -ffast-math -mrecip -mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb" } */
+
+/* Check reciprocal estimate functions for accuracy.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "recip-test.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test.h b/gcc/testsuite/gcc.target/powerpc/recip-test.h
new file mode 100644
index 00000000000..7a42df5757d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test.h
@@ -0,0 +1,149 @@
+/* Check reciprocal estimate functions for accuracy.  */
+
+#ifdef _ARCH_PPC64
+typedef unsigned long uns64_t;
+#define UNUM64(x) x ## L
+
+#else
+typedef unsigned long long uns64_t;
+#define UNUM64(x) x ## LL
+#endif
+
+typedef unsigned int uns32_t;
+
+#define TNAME2(x) #x
+#define TNAME(x) TNAME2(x)
+
+/*
+ * Float functions.
+ */
+
+#define TYPE float
+#define NAME(PREFIX) PREFIX ## _float
+#define UNS_TYPE uns32_t
+#define UNS_ABS __builtin_abs
+#define EXP_SIZE 8
+#define MAN_SIZE 23
+#define FABS __builtin_fabsf
+#define FMAX __builtin_fmaxf
+#define FMIN __builtin_fminf
+#define SQRT __builtin_sqrtf
+#define RMIN 1.0e-10
+#define RMAX 1.0e+10
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdivs"
+#define ASMSQRT "fsqrts"
+
+#define INIT_DIV							\
+{									\
+  { 0x4fffffff },	/* 8589934080 */				\
+  { 0x4effffff },	/* 2147483520 */				\
+  { 0x40ffffff },	/* 7.99999952316284 */				\
+  { 0x3fffffff },	/* 1.99999988079071 */				\
+  { 0x417fffff },	/* 15.9999990463257 */				\
+  { 0x42ffffff },	/* 127.999992370605 */				\
+  { 0x3dffffff },	/* 0.124999992549419 */				\
+  { 0x3effffff },	/* 0.499999970197678 */				\
+}
+
+#define INIT_RSQRT							\
+{									\
+  { 0x457ffffe },	/* 4096 - small amount */			\
+  { 0x4c7fffff },	/* 6.71089e+07 */				\
+  { 0x3d7fffff },	/* 0.0625 - small amount */			\
+  { 0x307ffffe },	/* 9.31322e-10 */				\
+  { 0x4c7ffffe },	/* 6.71089e+07 */				\
+  { 0x397ffffe },	/* 0.000244141 */				\
+  { 0x2e7fffff },	/* 5.82077e-11 */				\
+  { 0x2f7fffff },	/* 2.32831e-10 */				\
+}
+
+
+#include "recip-test2.h"
+
+/*
+ * Double functions.
+ */
+
+#undef TYPE
+#undef NAME
+#undef UNS_TYPE
+#undef UNS_ABS
+#undef EXP_SIZE
+#undef MAN_SIZE
+#undef FABS
+#undef FMAX
+#undef FMIN
+#undef SQRT
+#undef RMIN
+#undef RMAX
+#undef BDIV
+#undef BRSQRT
+#undef ASMDIV
+#undef ASMSQRT
+#undef INIT_DIV
+#undef INIT_RSQRT
+
+#define TYPE double
+#define NAME(PREFIX) PREFIX ## _double
+#define UNS_TYPE uns64_t
+#define UNS_ABS __builtin_imaxabs
+#define EXP_SIZE 11
+#define MAN_SIZE 52
+#define FABS __builtin_fabs
+#define FMAX __builtin_fmax
+#define FMIN __builtin_fmin
+#define SQRT __builtin_sqrt
+#define RMIN 1.0e-100
+#define RMAX 1.0e+100
+#define BDIV 1
+#define BRSQRT 2
+#define ASMDIV "fdiv"
+#define ASMSQRT "fsqrt"
+
+#define INIT_DIV							\
+{									\
+  { UNUM64 (0x2b57be53f2a2f3a0) },	/* 6.78462e-100 */		\
+  { UNUM64 (0x2b35f8e8ea553e52) },	/* 1.56963e-100 */		\
+  { UNUM64 (0x2b5b9d861d2fe4fb) },	/* 7.89099e-100 */		\
+  { UNUM64 (0x2b45dc44a084e682) },	/* 3.12327e-100 */		\
+  { UNUM64 (0x2b424ce16945d777) },	/* 2.61463e-100 */		\
+  { UNUM64 (0x2b20b5023d496b50) },	/* 5.96749e-101 */		\
+  { UNUM64 (0x2b61170547f57caa) },	/* 9.76678e-100 */		\
+  { UNUM64 (0x2b543b9d498aac37) },	/* 5.78148e-100 */		\
+}
+
+#define INIT_RSQRT							\
+{									\
+  { UNUM64 (0x2b616f2d8cbbc646) },	/* 9.96359e-100 */		\
+  { UNUM64 (0x2b5c4db2da0a011d) },	/* 8.08764e-100 */		\
+  { UNUM64 (0x2b55a82d5735b262) },	/* 6.1884e-100 */		\
+  { UNUM64 (0x2b50b52908258cb8) },	/* 4.77416e-100 */		\
+  { UNUM64 (0x2b363989a4fb29af) },	/* 1.58766e-100 */		\
+  { UNUM64 (0x2b508b9f6f4180a9) },	/* 4.7278e-100 */		\
+  { UNUM64 (0x2b4f7a1d48accb40) },	/* 4.49723e-100 */		\
+  { UNUM64 (0x2b1146a37372a81f) },	/* 3.08534e-101 */		\
+  { UNUM64 (0x2b33f876a8c48050) },	/* 1.42663e-100 */		\
+}
+
+#include "recip-test2.h"
+
+int
+main (int argc __attribute__((__unused__)),
+      char *argv[] __attribute__((__unused__)))
+{
+  srand48 (1);
+  run_float ();
+
+#ifdef VERBOSE
+  printf ("\n");
+#endif
+
+  run_double ();
+
+  if (error_count_float != 0 || error_count_double != 0)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test2.h b/gcc/testsuite/gcc.target/powerpc/recip-test2.h
new file mode 100644
index 00000000000..3ec356cdfd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/recip-test2.h
@@ -0,0 +1,432 @@
+/*
+ * Included file to common source float/double checking
+ * The following macros should be defined:
+ *	TYPE	   -- floating point type
+ *	NAME	   -- convert a name to include the type
+ *	UNS_TYPE   -- type to hold TYPE as an unsigned number
+ *	EXP_SIZE   -- size in bits of the exponent
+ *	MAN_SIZE   -- size in bits of the mantissa
+ *	UNS_ABS	   -- absolute value for UNS_TYPE
+ *	FABS	   -- absolute value function for TYPE
+ *	FMAX	   -- maximum function for TYPE
+ *	FMIN	   -- minimum function for TYPE
+ *	SQRT	   -- square root function for TYPE
+ *	RMIN	   -- minimum random number to generate
+ *	RMAX	   -- maximum random number to generate
+ *	ASMDIV	   -- assembler instruction to do divide
+ *	ASMSQRT	   -- assembler instruction to do square root
+ *	BDIV	   -- # of bits of inaccuracy to allow for division
+ *	BRSQRT	   -- # of bits of inaccuracy to allow for 1/sqrt
+ *	INIT_DIV   -- Initial values to test 1/x against
+ *	INIT_RSQRT -- Initial values to test 1/sqrt(x) against
+ */
+
+typedef union
+{
+  UNS_TYPE i;
+  TYPE x;
+} NAME (union);
+
+/*
+ * Input/output arrays.
+ */
+
+static NAME (union) NAME (div_input)  [] __attribute__((__aligned__(32))) = INIT_DIV;
+static NAME (union) NAME (rsqrt_input)[] __attribute__((__aligned__(32))) = INIT_RSQRT;
+
+#define DIV_SIZE   (sizeof (NAME (div_input))   / sizeof (TYPE))
+#define RSQRT_SIZE (sizeof (NAME (rsqrt_input)) / sizeof (TYPE))
+
+static TYPE NAME (div_expected)[DIV_SIZE] __attribute__((__aligned__(32)));
+static TYPE NAME (div_output)  [DIV_SIZE] __attribute__((__aligned__(32)));
+
+static TYPE NAME (rsqrt_expected)[RSQRT_SIZE] __attribute__((__aligned__(32)));
+static TYPE NAME (rsqrt_output)  [RSQRT_SIZE] __attribute__((__aligned__(32)));
+
+
+/*
+ * Crack a floating point number into sign bit, exponent, and mantissa.
+ */
+
+static void
+NAME (crack) (TYPE number, unsigned int *p_sign, unsigned *p_exponent, UNS_TYPE *p_mantissa)
+{
+  NAME (union) u;
+  UNS_TYPE bits;
+
+  u.x = number;
+  bits = u.i;
+
+  *p_sign = (unsigned int)((bits >> (EXP_SIZE + MAN_SIZE)) & 0x1);
+  *p_exponent = (unsigned int)((bits >> MAN_SIZE) & ((((UNS_TYPE)1) << EXP_SIZE) - 1));
+  *p_mantissa = bits & ((((UNS_TYPE)1) << MAN_SIZE) - 1);
+  return;
+}
+
+
+/*
+ * Prevent optimizer from eliminating + 0.0 to remove -0.0.
+ */
+
+volatile TYPE NAME (math_diff_0) = ((TYPE) 0.0);
+
+/*
+ * Return negative if two numbers are significanly different or return the
+ * number of bits that are different in the mantissa.
+ */
+
+static int
+NAME (math_diff) (TYPE a, TYPE b, int bits)
+{
+  TYPE zero = NAME (math_diff_0);
+  unsigned int sign_a, sign_b;
+  unsigned int exponent_a, exponent_b;
+  UNS_TYPE mantissa_a, mantissa_b, diff;
+  int i;
+
+  /* eliminate signed zero.  */
+  a += zero;
+  b += zero;
+
+  /* special case Nan.  */
+  if (__builtin_isnan (a))
+    return (__builtin_isnan (b) ? 0 : -1);
+
+  if (a == b)
+    return 0;
+
+  /* special case infinity.  */
+  if (__builtin_isinf (a))
+    return (__builtin_isinf (b) ? 0 : -1);
+
+  /* punt on denormal numbers.  */
+  if (!__builtin_isnormal (a) || !__builtin_isnormal (b))
+    return -1;
+
+  NAME (crack) (a, &sign_a, &exponent_a, &mantissa_a);
+  NAME (crack) (b, &sign_b, &exponent_b, &mantissa_b);
+
+  /* If the sign is different, there is no hope.  */
+  if (sign_a != sign_b)
+    return -1;
+
+  /* If the exponent is off by 1, see if the values straddle the power of two,
+     and adjust things to do the mantassa check if we can.  */
+  if ((exponent_a == (exponent_b+1)) || (exponent_a == (exponent_b-1)))
+    {
+      TYPE big = FMAX (a, b);
+      TYPE small = FMIN (a, b);
+      TYPE diff = FABS (a - b);
+      unsigned int sign_big, sign_small, sign_test;
+      unsigned int exponent_big, exponent_small, exponent_test;
+      UNS_TYPE mantissa_big, mantissa_small, mantissa_test;
+
+      NAME (crack) (big, &sign_big, &exponent_big, &mantissa_big);
+      NAME (crack) (small, &sign_small, &exponent_small, &mantissa_small);
+
+      NAME (crack) (small - diff, &sign_test, &exponent_test, &mantissa_test);
+      if ((sign_test == sign_small) && (exponent_test == exponent_small))
+	{
+	  mantissa_a = mantissa_small;
+	  mantissa_b = mantissa_test;
+	}
+
+      else
+	{
+	  NAME (crack) (big + diff, &sign_test, &exponent_test, &mantissa_test);
+	  if ((sign_test == sign_big) && (exponent_test == exponent_big))
+	    {
+	      mantissa_a = mantissa_big;
+	      mantissa_b = mantissa_test;
+	    }
+
+	  else
+	    return -1;
+	}
+    }
+
+  else if (exponent_a != exponent_b)
+    return -1;
+
+  diff = UNS_ABS (mantissa_a - mantissa_b);
+  for (i = MAN_SIZE; i > 0; i--)
+    {
+      if ((diff & ((UNS_TYPE)1) << (i-1)) != 0)
+	return i;
+    }
+
+  return -1;
+}
+
+
+/*
+ * Turn off inlining to make code inspection easier.
+ */
+
+static void NAME (asm_div) (void) __attribute__((__noinline__));
+static void NAME (vector_div) (void) __attribute__((__noinline__));
+static void NAME (scalar_div) (void) __attribute__((__noinline__));
+static void NAME (asm_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (vector_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (scalar_rsqrt) (void) __attribute__((__noinline__));
+static void NAME (check_div) (const char *) __attribute__((__noinline__));
+static void NAME (check_rsqrt) (const char *) __attribute__((__noinline__));
+static void NAME (run) (void) __attribute__((__noinline__));
+
+
+/*
+ * Division function that might be vectorized.
+ */
+
+static void
+NAME (vector_div) (void)
+{
+  size_t i;
+
+  for (i = 0; i < DIV_SIZE; i++)
+    NAME (div_output)[i] = ((TYPE) 1.0) / NAME (div_input)[i].x;
+}
+
+/*
+ * Division function that is not vectorized.
+ */
+
+static void
+NAME (scalar_div) (void)
+{
+  size_t i;
+
+  for (i = 0; i < DIV_SIZE; i++)
+    {
+      TYPE x = ((TYPE) 1.0) / NAME (div_input)[i].x;
+      TYPE y;
+      __asm__ ("" : "=d" (y) : "0" (x));
+      NAME (div_output)[i] = y;
+    }
+}
+
+/*
+ * Generate the division instruction via asm.
+ */
+
+static void
+NAME (asm_div) (void)
+{
+  size_t i;
+
+  for (i = 0; i < DIV_SIZE; i++)
+    {
+      TYPE x;
+      __asm__ (ASMDIV " %0,%1,%2"
+	       : "=d" (x)
+	       : "d" ((TYPE) 1.0), "d" (NAME (div_input)[i].x));
+      NAME (div_expected)[i] = x;
+    }
+}
+
+/*
+ * Reciprocal square root function that might be vectorized.
+ */
+
+static void
+NAME (vector_rsqrt) (void)
+{
+  size_t i;
+
+  for (i = 0; i < RSQRT_SIZE; i++)
+    NAME (rsqrt_output)[i] = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x);
+}
+
+/*
+ * Reciprocal square root function that is not vectorized.
+ */
+
+static void
+NAME (scalar_rsqrt) (void)
+{
+  size_t i;
+
+  for (i = 0; i < RSQRT_SIZE; i++)
+    {
+      TYPE x = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x);
+      TYPE y;
+      __asm__ ("" : "=d" (y) : "0" (x));
+      NAME (rsqrt_output)[i] = y;
+    }
+}
+
+/*
+ * Generate the 1/sqrt instructions via asm.
+ */
+
+static void
+NAME (asm_rsqrt) (void)
+{
+  size_t i;
+
+  for (i = 0; i < RSQRT_SIZE; i++)
+    {
+      TYPE x;
+      TYPE y;
+      __asm__ (ASMSQRT " %0,%1" : "=d" (x) : "d" (NAME (rsqrt_input)[i].x));
+      __asm__ (ASMDIV " %0,%1,%2" : "=d" (y) : "d" ((TYPE) 1.0), "d" (x));
+      NAME (rsqrt_expected)[i] = y;
+    }
+}
+
+
+/*
+ * Functions to abort or report errors.
+ */
+
+static int NAME (error_count) = 0;
+
+#ifdef VERBOSE
+static int NAME (max_bits_div)   = 0;
+static int NAME (max_bits_rsqrt) = 0;
+#endif
+
+
+/*
+ * Compare the expected value with the value we got.
+ */
+
+static void
+NAME (check_div) (const char *test)
+{
+  size_t i;
+  int b;
+
+  for (i = 0; i < DIV_SIZE; i++)
+    {
+      TYPE exp = NAME (div_expected)[i];
+      TYPE out = NAME (div_output)[i];
+      b = NAME (math_diff) (exp, out, BDIV);
+
+#ifdef VERBOSE
+      if (b != 0)
+	{
+	  NAME (union) u_in = NAME (div_input)[i];
+	  NAME (union) u_exp;
+	  NAME (union) u_out;
+	  char explanation[64];
+	  const char *p_exp;
+
+	  if (b < 0)
+	    p_exp = "failed";
+	  else
+	    {
+	      p_exp = explanation;
+	      sprintf (explanation, "%d bit error%s", b, (b > BDIV) ? ", failed" : "");
+	    }
+
+	  u_exp.x = exp;
+	  u_out.x = out;
+	  printf ("%s %s %s for 1.0 / %g [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n",
+		  TNAME (TYPE), test, p_exp,
+		  (double) u_in.x, (unsigned long long) u_in.i,
+		  (double) exp,    (unsigned long long) u_exp.i,
+		  (double) out,    (unsigned long long) u_out.i);
+	}
+#endif
+
+      if (b < 0 || b > BDIV)
+	NAME (error_count)++;
+
+#ifdef VERBOSE
+      if (b > NAME (max_bits_div))
+	NAME (max_bits_div) = b;
+#endif
+    }
+}
+
+static void
+NAME (check_rsqrt) (const char *test)
+{
+  size_t i;
+  int b;
+
+  for (i = 0; i < RSQRT_SIZE; i++)
+    {
+      TYPE exp = NAME (rsqrt_expected)[i];
+      TYPE out = NAME (rsqrt_output)[i];
+      b = NAME (math_diff) (exp, out, BRSQRT);
+
+#ifdef VERBOSE
+      if (b != 0)
+	{
+	  NAME (union) u_in = NAME (rsqrt_input)[i];
+	  NAME (union) u_exp;
+	  NAME (union) u_out;
+	  char explanation[64];
+	  const char *p_exp;
+
+	  if (b < 0)
+	    p_exp = "failed";
+	  else
+	    {
+	      p_exp = explanation;
+	      sprintf (explanation, "%d bit error%s", b, (b > BDIV) ? ", failed" : "");
+	    }
+
+	  u_exp.x = exp;
+	  u_out.x = out;
+	  printf ("%s %s %s for 1 / sqrt (%g) [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n",
+		  TNAME (TYPE), test, p_exp,
+		  (double) u_in.x, (unsigned long long) u_in.i,
+		  (double) exp,    (unsigned long long) u_exp.i,
+		  (double) out,    (unsigned long long) u_out.i);
+	}
+#endif
+
+      if (b < 0 || b > BRSQRT)
+	NAME (error_count)++;
+
+#ifdef VERBOSE
+      if (b > NAME (max_bits_rsqrt))
+	NAME (max_bits_rsqrt) = b;
+#endif
+    }
+}
+
+
+/*
+ * Now do everything.
+ */
+
+static void
+NAME (run) (void)
+{
+#ifdef VERBOSE
+  printf ("start run_%s, divide size = %ld, rsqrt size = %ld, %d bit%s for a/b, %d bit%s for 1/sqrt(a)\n",
+	  TNAME (TYPE),
+	  (long)DIV_SIZE,
+	  (long)RSQRT_SIZE,
+	  BDIV, (BDIV == 1) ? "" : "s",
+	  BRSQRT, (BRSQRT == 1) ? "" : "s");
+#endif
+
+  NAME (asm_div) ();
+
+  NAME (scalar_div) ();
+  NAME (check_div) ("scalar");
+
+  NAME (vector_div) ();
+  NAME (check_div) ("vector");
+
+  NAME (asm_rsqrt) ();
+
+  NAME (scalar_rsqrt) ();
+  NAME (check_rsqrt) ("scalar");
+
+  NAME (vector_rsqrt) ();
+  NAME (check_rsqrt) ("vector");
+
+#ifdef VERBOSE
+  printf ("end run_%s, errors = %d, max div bits = %d, max rsqrt bits = %d\n",
+	  TNAME (TYPE),
+	  NAME (error_count),
+	  NAME (max_bits_div),
+	  NAME (max_bits_rsqrt));
+#endif
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 1a36127e95b..6ad8c344585 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -992,6 +992,30 @@ proc check_vmx_hw_available { } {
     }]
 }
 
+proc check_ppc_recip_hw_available { } {
+    return [check_cached_effective_target ppc_recip_hw_available {
+	# Some simulators may not support FRE/FRES/FRSQRTE/FRSQRTES
+	# For now, disable on Darwin
+	if { [istarget powerpc-*-eabi] || [istarget powerpc*-*-eabispe] || [istarget *-*-darwin*]} {
+	    expr 0
+	} else {
+	    set options "-mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb"
+	    check_runtime_nocache ppc_recip_hw_available {
+		volatile double d_recip, d_rsqrt, d_four = 4.0;
+		volatile float f_recip, f_rsqrt, f_four = 4.0f;
+		int main()
+		{
+		  asm volatile ("fres %0,%1" : "=f" (f_recip) : "f" (f_four));
+		  asm volatile ("fre %0,%1" : "=d" (d_recip) : "d" (d_four));
+		  asm volatile ("frsqrtes %0,%1" : "=f" (f_rsqrt) : "f" (f_four));
+		  asm volatile ("frsqrte %0,%1" : "=f" (d_rsqrt) : "d" (d_four));
+		  return 0;
+		}
+	    } $options
+	}
+    }]
+}
+
 # Return 1 if the target supports executing AltiVec and Cell PPU
 # instructions, 0 otherwise.  Cache the result.
 
@@ -2972,6 +2996,8 @@ proc is-effective-target { arg } {
     } else {
 	switch $arg {
 	  "vmx_hw"         { set selected [check_vmx_hw_available] }
+	  "vsx_hw"         { set selected [check_vsx_hw_available] }
+	  "ppc_recip_hw"   { set selected [check_ppc_recip_hw_available] }
 	  "named_sections" { set selected [check_named_sections_available] }
 	  "gc_sections"    { set selected [check_gc_sections_available] }
 	  "cxa_atexit"     { set selected [check_cxa_atexit_available] }
@@ -2991,6 +3017,8 @@ proc is-effective-target-keyword { arg } {
 	# These have different names for their check_* procs.
 	switch $arg {
 	  "vmx_hw"         { return 1 }
+	  "vsx_hw"         { return 1 }
+	  "ppc_recip_hw"   { return 1 }
 	  "named_sections" { return 1 }
 	  "gc_sections"    { return 1 }
 	  "cxa_atexit"     { return 1 }
author	Michael Meissner <meissner@linux.vnet.ibm.com>	2010-06-03 00:06:12 +0000
committer	Michael Meissner <meissner@gcc.gnu.org>	2010-06-03 00:06:12 +0000
commit	92902797041a42ac500f7dc9639df8a680e0b691 (patch)
tree	d55e7fa0ae623e1c748075d3f81edeb35fb123fb /gcc
parent	6c07d08b90b124d8d3be8015726caf799e2e2a13 (diff)
download	gcc-92902797041a42ac500f7dc9639df8a680e0b691.tar.gz