diff options
author | Michael Meissner <meissner@linux.vnet.ibm.com> | 2010-06-03 00:06:12 +0000 |
---|---|---|
committer | Michael Meissner <meissner@gcc.gnu.org> | 2010-06-03 00:06:12 +0000 |
commit | 92902797041a42ac500f7dc9639df8a680e0b691 (patch) | |
tree | d55e7fa0ae623e1c748075d3f81edeb35fb123fb /gcc | |
parent | 6c07d08b90b124d8d3be8015726caf799e2e2a13 (diff) | |
download | gcc-92902797041a42ac500f7dc9639df8a680e0b691.tar.gz |
PR target/44218, improve -mrecip on powerpc
From-SVN: r160199
Diffstat (limited to 'gcc')
25 files changed, 2112 insertions, 454 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 59008a5ab40..ecfdab1044d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,139 @@ +2010-06-02 Michael Meissner <meissner@linux.vnet.ibm.com> + + PR target/44218 + * doc/invoke.texi (RS/6000 and PowerPC Options): Delete obsolete + -mswdiv option. Add -mrecip, -mrecip=<xxx>, -mrecip-precision + options. + + * doc/extend.texi (powerpc builtins): Document vec_recip, + vec_rsqrt, vec_rsqrte altivec/vsx builtins. + + * config/rs6000/rs6000-protos.h (rs6000_emit_swdiv): New + function. + (rs6000_emit_swrsqrt): Ditto. + (rs6000_emit_swdivsf): Delete. + (rs6000_emit_swdivdf): Ditto. + (rs6000_emit_swrsqrtsf): Ditto. + + * config/rs6000/rs6000.c (rs6000_recip_bits): New global to + describe the reciprocal estimate support for each type. + (recip_options): Map -mrecip=<opt> into option bits. + (gen_2arg_fn_t): New typedef for binary rtx gen function. + (rs6000_debug_reg_global): If -mdebug=reg, print the state of the + reciprocal estimate instructions. + (rs6000_init_hard_regno_mode_ok): Key ws constraint off of the + debug -mvsx-scalar-memory switch instead of -mvsx-scalar-double. + Set up rs6000_recip_bits based on the -mrecip* options. Print the + cost information if -mdebug=cost or -mdebug=reg. + (rs6000_override_options): Set -mrecip-precision for power6, and + power7 machines. If -mvsx or -mdfp, enable various options that + came in previous instruction set ISAs, unless the option was + explicitly disabled by the command line option. Parse + -mrecip=<opt> options. + (rs6000_builtin_vectorized_function): Add support for vectorizing + the reciprocal estimate builtins and expansions. + (rs6000_handle_option): Add -mrecip, -mrecip=<opt> support. + (bdesc_2arg): Add reciprocal estimate builtins. + (bdesc_1arg): Add reciprocal square root estimate builtins. + (rs6000_expand_builtin): Rewrite to use a switch statement, + instead of multiple if/then/elses. Add reciprocal estimate + builtins. 
+ (rs6000_init_builtins): Create declarations for reciprocal + estimate builtins. + (rs6000_preferred_reload_class): Simplify VSX preferences, if scalar + sized, prefer traditional floating point registers, if integer + vector types, prefer altivec registers. Don't actually look at + the memory address any more. + (rs6000_builtin_reciprocal): Add new builtin reciprocal estimate + builtins. + (rs6000_load_constant_and_splat): New helper function to load up + the constant for reciprocal estimate instructions. + (rs6000_emit_madd): New helper function for generating + multiply/add type instructions, based on the current switches. + (rs6000_emit_msub): Ditto. + (rs6000_emit_mnsub): Ditto. + (rs6000_emit_swdiv_high_precision): Replace rs6000_emit_swdivsf to + replace a divide with a reciprocal estimate and fixup, adding + support for machines with high precision and vectors. + (rs6000_emit_swdiv_low_precision): Rewrite rs6000_emit_swdivdf for + low precision machines. + (rs6000_emit_swdiv): New common function to be called to replace a + division with reciprocal estimate and fixup. + (rs6000_emit_swrsqrt): Replace rs6000_emit_swrsqrtsf. Add support + for double and vector types. Add support for high precision + machines. + + * config/rs6000/rs6000.h (TARGET_FRES): New macro to say whether + the reciprocal estimate instructions can be generated. + (TARGET_FRE): Ditto. + (TARGET_FRSQRTES): Ditto. + (TARGET_FRSQRTE): Ditto. + (RS6000_RECIP_*): New macros for reciprocal estimate support. + + * config/rs6000/vector.md (rsqrte<mode>2): New insn for reciprocal + square root estimate on vectors. + (re<mode>2): New insn for reciprocal division estimate on vectors. + + * config/rs6000/rs6000-builtin.def (ALTIVEC_BUILTIN_VRSQRTFP): + New builtin. + (ALTIVEC_BUILTIN_VRECIPFP): Ditto. + (ALTIVEC_BUILTIN_VEC_RECIP): Ditto. + (ALTIVEC_BUILTIN_VEC_RSQRT): Ditto. + (VSX_BUILTIN_VEC_RSQRT_V4SF): Ditto. + (VSX_BUILTIN_VEC_RSQRT_V2DF): Ditto. + (RS6000_BUILTIN_RSQRT): Ditto. 
+ (ALTIVEC_BUILTIN_VEC_RSQRTE): Denote that the builtin is a + floating point builtin. + + * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Define + macros __RECIP__, __RECIPF__, __RSQRTE__, __RSQRTEF__, + __RECIP_PRECISION__ based on the command line switches. + (altivec_overloaded_builtins): Add reciprocal estimate builtins. + + * config/rs6000/rs6000.opt (-mrecip): Document support for + replacing division instructions with reciprocal estimate and + fixup. + (-mrecip=<opt>): New option. + (-mrecip-precision): Ditto. + + * config/rs6000/vsx.md (UNSPEC_VSX_RSQRTE): Delete. + (vsx_rsqrte<mode>2): Use UNSPEC_RSQRT not UNSPEC_VSX_RSQRTE. + (vsx_copysignsf3): If -mvsx, use double precision cpsign on single + precision scalar. + + * config/rs6000/altivec.md (UNSPEC_VRSQRTEFP): Delete. + (UNSPEC_VREFP): Ditto. + (altivec_vnmsubfp*): Make altivec nmsub mirror the scalar and VSX + counterparts with regard to support of -mno-fused-madd and + -ffast-math. + (altivec_vrsqrtefp): Use common UNSPEC to allow scalar/vector + reciprocal estimate instructions to be generated. + (altivec_vrefp): Ditto. + + * config/rs6000/rs6000.md (RECIPF): New iterator for reciprocal + estimate support. + (rreg): New mode attribute for reciprocal estimate support. + (recip<mode>3): New insn for division using reciprocal estimate + and fixup builtins. + (divide define_split): New define_split to convert floating point + division to use reciprocal estimate if the user used the + appropriate options and the split is run when we can add new + pseudo registers for the fixup. + (rsqrt<mode>2): New insn for reciprocal square root support. + (recipsf3): Move into recip<mode>3. + (recipdf3): Ditto. + (fres): Use TARGET_FRES. + (rsqrtsf2): Move into rsqrt<mode>2. + (rsqrtsf_internal1): Use TARGET_FRSQRTES. + (copysignsf3): Add support for VSX. + (fred): Use TARGET_FRE. + (fred_fpr): Ditto. + (rsqrtdf_internal1): New function for frsqrte instruction. 
+ + * config/rs6000/altivec.h (vec_recipdiv): Define new vector + builtin. + (vec_rsqrt): Ditto. + 2010-06-03 Richard Guenther <rguenther@suse.de> PR middle-end/44291 diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index bc4f30f7cb2..5f4510adc30 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -163,6 +163,8 @@ #define vec_vpkshus __builtin_vec_vpkshus #define vec_re __builtin_vec_re #define vec_round __builtin_vec_round +#define vec_recipdiv __builtin_vec_recipdiv +#define vec_rsqrt __builtin_vec_rsqrt #define vec_rsqrte __builtin_vec_rsqrte #define vec_vsubfp __builtin_vec_vsubfp #define vec_subc __builtin_vec_subc diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 6fbb7cdcdac..7bf3c660312 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -75,9 +75,7 @@ (UNSPEC_VCTSXS 154) (UNSPEC_VLOGEFP 155) (UNSPEC_VEXPTEFP 156) - (UNSPEC_VRSQRTEFP 157) - (UNSPEC_VREFP 158) - ;; 159-162 deleted + ;; 157-162 deleted (UNSPEC_VLSDOI 163) (UNSPEC_VUPKHSB 167) (UNSPEC_VUPKHPX 168) @@ -141,10 +139,11 @@ (UNSPEC_VPERMHI 321) (UNSPEC_INTERHI 322) (UNSPEC_INTERLO 323) - (UNSPEC_VUPKHS_V4SF 324) - (UNSPEC_VUPKLS_V4SF 325) - (UNSPEC_VUPKHU_V4SF 326) - (UNSPEC_VUPKLU_V4SF 327) + (UNSPEC_VUPKHS_V4SF 324) + (UNSPEC_VUPKLS_V4SF 325) + (UNSPEC_VUPKHU_V4SF 326) + (UNSPEC_VUPKLU_V4SF 327) + (UNSPEC_VNMSUBFP 328) ]) (define_constants @@ -628,11 +627,64 @@ }") ;; Fused multiply subtract -(define_insn "altivec_vnmsubfp" +(define_expand "altivec_vnmsubfp" + [(match_operand:V4SF 0 "register_operand" "") + (match_operand:V4SF 1 "register_operand" "") + (match_operand:V4SF 2 "register_operand" "") + (match_operand:V4SF 3 "register_operand" "")] + "VECTOR_UNIT_ALTIVEC_P (V4SFmode)" +{ + if (TARGET_FUSED_MADD && HONOR_SIGNED_ZEROS (SFmode)) + { + emit_insn (gen_altivec_vnmsubfp_1 (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + else if (TARGET_FUSED_MADD && 
!HONOR_SIGNED_ZEROS (DFmode)) + { + emit_insn (gen_altivec_vnmsubfp_2 (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + else + { + emit_insn (gen_altivec_vnmsubfp_3 (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } +}) + +(define_insn "altivec_vnmsubfp_1" [(set (match_operand:V4SF 0 "register_operand" "=v") - (neg:V4SF (minus:V4SF (mult:V4SF (match_operand:V4SF 1 "register_operand" "v") - (match_operand:V4SF 2 "register_operand" "v")) - (match_operand:V4SF 3 "register_operand" "v"))))] + (neg:V4SF + (minus:V4SF + (mult:V4SF + (match_operand:V4SF 1 "register_operand" "v") + (match_operand:V4SF 2 "register_operand" "v")) + (match_operand:V4SF 3 "register_operand" "v"))))] + "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD + && HONOR_SIGNED_ZEROS (SFmode)" + "vnmsubfp %0,%1,%2,%3" + [(set_attr "type" "vecfloat")]) + +(define_insn "altivec_vnmsubfp_2" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (minus:V4SF + (match_operand:V4SF 3 "register_operand" "v") + (mult:V4SF + (match_operand:V4SF 1 "register_operand" "v") + (match_operand:V4SF 2 "register_operand" "v"))))] + "VECTOR_UNIT_ALTIVEC_P (V4SFmode) && TARGET_FUSED_MADD + && !HONOR_SIGNED_ZEROS (SFmode)" + "vnmsubfp %0,%1,%2,%3" + [(set_attr "type" "vecfloat")]) + +(define_insn "altivec_vnmsubfp_3" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v") + (match_operand:V4SF 2 "register_operand" "v") + (match_operand:V4SF 3 "register_operand" "v")] + UNSPEC_VNMSUBFP))] "VECTOR_UNIT_ALTIVEC_P (V4SFmode)" "vnmsubfp %0,%1,%2,%3" [(set_attr "type" "vecfloat")]) @@ -1444,19 +1496,19 @@ "vexptefp %0,%1" [(set_attr "type" "vecfloat")]) -(define_insn "altivec_vrsqrtefp" +(define_insn "*altivec_vrsqrtefp" [(set (match_operand:V4SF 0 "register_operand" "=v") (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] - UNSPEC_VRSQRTEFP))] - "TARGET_ALTIVEC" + UNSPEC_RSQRT))] + "VECTOR_UNIT_ALTIVEC_P 
(V4SFmode)" "vrsqrtefp %0,%1" [(set_attr "type" "vecfloat")]) (define_insn "altivec_vrefp" [(set (match_operand:V4SF 0 "register_operand" "=v") (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] - UNSPEC_VREFP))] - "TARGET_ALTIVEC" + UNSPEC_FRES))] + "VECTOR_UNIT_ALTIVEC_P (V4SFmode)" "vrefp %0,%1" [(set_attr "type" "vecfloat")]) diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def index 7c5619a8e14..9f45a72e2c0 100644 --- a/gcc/config/rs6000/rs6000-builtin.def +++ b/gcc/config/rs6000/rs6000-builtin.def @@ -159,6 +159,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VRFIZ, RS6000_BTC_FP_PURE) RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLB, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLH, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_VRLW, RS6000_BTC_CONST) +RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTFP, RS6000_BTC_FP_PURE) RS6000_BUILTIN(ALTIVEC_BUILTIN_VRSQRTEFP, RS6000_BTC_FP_PURE) RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLB, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_VSLH, RS6000_BTC_CONST) @@ -269,6 +270,7 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V8HI, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V16QI, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_EXT_V4SF, RS6000_BTC_CONST) RS6000_BUILTIN(ALTIVEC_BUILTIN_COPYSIGN_V4SF, RS6000_BTC_CONST) +RS6000_BUILTIN(ALTIVEC_BUILTIN_VRECIPFP, RS6000_BTC_FP_PURE) /* Altivec overloaded builtins. */ /* For now, don't set the classification for overloaded functions. 
@@ -351,10 +353,12 @@ RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKS, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PACKSU, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_PERM, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RE, RS6000_BTC_MISC) +RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RECIP, RS6000_BTC_FP_PURE) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RL, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RINT, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_ROUND, RS6000_BTC_MISC) -RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE, RS6000_BTC_MISC) +RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRT, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_RSQRTE, RS6000_BTC_FP_PURE) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SEL, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SL, RS6000_BTC_MISC) RS6000_BUILTIN(ALTIVEC_BUILTIN_VEC_SLD, RS6000_BTC_MISC) @@ -959,6 +963,10 @@ RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DF, RS6000_BTC_CONST) RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEL_V2DI, RS6000_BTC_CONST) RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DF, RS6000_BTC_CONST) RS6000_BUILTIN(VSX_BUILTIN_VEC_MERGEH_V2DI, RS6000_BTC_CONST) +RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V4SF, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(VSX_BUILTIN_VEC_RSQRT_V2DF, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(VSX_BUILTIN_RECIP_V4SF, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(VSX_BUILTIN_RECIP_V2DF, RS6000_BTC_FP_PURE) /* VSX overloaded builtins, add the overloaded functions not present in Altivec. 
*/ @@ -991,4 +999,5 @@ RS6000_BUILTIN(POWER7_BUILTIN_BPERMD, RS6000_BTC_CONST) RS6000_BUILTIN(RS6000_BUILTIN_RECIP, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_RECIPF, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_RSQRTF, RS6000_BTC_FP_PURE) +RS6000_BUILTIN(RS6000_BUILTIN_RSQRT, RS6000_BTC_FP_PURE) RS6000_BUILTIN(RS6000_BUILTIN_BSWAP_HI, RS6000_BTC_CONST) diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index ac11336aee9..7a197c1fbcc 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -362,6 +362,16 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile) builtin_define ("__builtin_vsx_xvnmsubasp=__builtin_vsx_xvnmsubsp"); builtin_define ("__builtin_vsx_xvnmsubmsp=__builtin_vsx_xvnmsubsp"); } + if (RS6000_RECIP_HAVE_RE_P (DFmode)) + builtin_define ("__RECIP__"); + if (RS6000_RECIP_HAVE_RE_P (SFmode)) + builtin_define ("__RECIPF__"); + if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode)) + builtin_define ("__RSQRTE__"); + if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode)) + builtin_define ("__RSQRTEF__"); + if (TARGET_RECIP_PRECISION) + builtin_define ("__RECIP_PRECISION__"); /* Tell users they can use __builtin_bswap{16,64}. 
*/ builtin_define ("__HAVE_BSWAP__"); @@ -479,10 +489,22 @@ const struct altivec_builtin_types altivec_overloaded_builtins[] = { RS6000_BTI_void, RS6000_BTI_bool_V16QI, 0, 0 }, { ALTIVEC_BUILTIN_VEC_RE, ALTIVEC_BUILTIN_VREFP, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 }, + { ALTIVEC_BUILTIN_VEC_RE, VSX_BUILTIN_XVREDP, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 }, { ALTIVEC_BUILTIN_VEC_ROUND, ALTIVEC_BUILTIN_VRFIN, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 }, + { ALTIVEC_BUILTIN_VEC_RECIP, ALTIVEC_BUILTIN_VRECIPFP, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0 }, + { ALTIVEC_BUILTIN_VEC_RECIP, VSX_BUILTIN_RECIP_V2DF, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0 }, + { ALTIVEC_BUILTIN_VEC_RSQRT, ALTIVEC_BUILTIN_VRSQRTFP, + RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 }, + { ALTIVEC_BUILTIN_VEC_RSQRT, VSX_BUILTIN_VEC_RSQRT_V2DF, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 }, { ALTIVEC_BUILTIN_VEC_RSQRTE, ALTIVEC_BUILTIN_VRSQRTEFP, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 }, + { ALTIVEC_BUILTIN_VEC_RSQRTE, VSX_BUILTIN_XVRSQRTEDP, + RS6000_BTI_V2DF, RS6000_BTI_V2DF, 0, 0 }, { ALTIVEC_BUILTIN_VEC_TRUNC, ALTIVEC_BUILTIN_VRFIZ, RS6000_BTI_V4SF, RS6000_BTI_V4SF, 0, 0 }, { ALTIVEC_BUILTIN_VEC_TRUNC, VSX_BUILTIN_XVRDPIZ, diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 43ed634495b..3f022862332 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -106,9 +106,8 @@ extern void rs6000_split_compare_and_swap (rtx, rtx, rtx, rtx, rtx); extern void rs6000_expand_compare_and_swapqhi (rtx, rtx, rtx, rtx); extern void rs6000_split_compare_and_swapqhi (rtx, rtx, rtx, rtx, rtx, rtx); extern void rs6000_split_lock_test_and_set (rtx, rtx, rtx, rtx); -extern void rs6000_emit_swdivsf (rtx, rtx, rtx); -extern void rs6000_emit_swdivdf (rtx, rtx, rtx); -extern void rs6000_emit_swrsqrtsf (rtx, rtx); +extern void rs6000_emit_swdiv (rtx, rtx, rtx, bool); +extern void rs6000_emit_swrsqrt (rtx, rtx); extern void output_toc 
(FILE *, rtx, int, enum machine_mode); extern rtx rs6000_longcall_ref (rtx); extern void rs6000_fatal_bad_address (rtx); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index a7434ca5257..9bfaf54c2a2 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -316,6 +316,61 @@ int rs6000_vector_align[NUM_MACHINE_MODES]; /* Map selected modes to types for builtins. */ static GTY(()) tree builtin_mode_to_type[MAX_MACHINE_MODE][2]; + +/* What modes to automatically generate reciprocal divide estimate (fre) and + reciprocal sqrt (frsqrte) for. */ +unsigned char rs6000_recip_bits[MAX_MACHINE_MODE]; + +/* Masks to determine which reciprocal esitmate instructions to generate + automatically. */ +enum rs6000_recip_mask { + RECIP_SF_DIV = 0x001, /* Use divide estimate */ + RECIP_DF_DIV = 0x002, + RECIP_V4SF_DIV = 0x004, + RECIP_V2DF_DIV = 0x008, + + RECIP_SF_RSQRT = 0x010, /* Use reciprocal sqrt estimate. */ + RECIP_DF_RSQRT = 0x020, + RECIP_V4SF_RSQRT = 0x040, + RECIP_V2DF_RSQRT = 0x080, + + /* Various combination of flags for -mrecip=xxx. */ + RECIP_NONE = 0, + RECIP_ALL = (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV + | RECIP_V2DF_DIV | RECIP_SF_RSQRT | RECIP_DF_RSQRT + | RECIP_V4SF_RSQRT | RECIP_V2DF_RSQRT), + + RECIP_HIGH_PRECISION = RECIP_ALL, + + /* On low precision machines like the power5, don't enable double precision + reciprocal square root estimate, since it isn't accurate enough. */ + RECIP_LOW_PRECISION = (RECIP_ALL & ~(RECIP_DF_RSQRT | RECIP_V2DF_RSQRT)) +}; + +static unsigned int rs6000_recip_control; +static const char *rs6000_recip_name; + +/* -mrecip options. 
*/ +static struct +{ + const char *string; /* option name */ + unsigned int mask; /* mask bits to set */ +} recip_options[] = { + { "all", RECIP_ALL }, + { "none", RECIP_NONE }, + { "div", (RECIP_SF_DIV | RECIP_DF_DIV | RECIP_V4SF_DIV + | RECIP_V2DF_DIV) }, + { "divf", (RECIP_SF_DIV | RECIP_V4SF_DIV) }, + { "divd", (RECIP_DF_DIV | RECIP_V2DF_DIV) }, + { "rsqrt", (RECIP_SF_RSQRT | RECIP_DF_RSQRT | RECIP_V4SF_RSQRT + | RECIP_V2DF_RSQRT) }, + { "rsqrtf", (RECIP_SF_RSQRT | RECIP_V4SF_RSQRT) }, + { "rsqrtd", (RECIP_DF_RSQRT | RECIP_V2DF_RSQRT) }, +}; + +/* 2 argument gen function typedef. */ +typedef rtx (*gen_2arg_fn_t) (rtx, rtx, rtx); + /* Target cpu costs. */ @@ -1807,6 +1862,27 @@ rs6000_debug_reg_global (void) if (nl) fputs (nl, stderr); + if (rs6000_recip_control) + { + fprintf (stderr, "\nReciprocal mask = 0x%x\n", rs6000_recip_control); + + for (m = 0; m < NUM_MACHINE_MODES; ++m) + if (rs6000_recip_bits[m]) + { + fprintf (stderr, + "Reciprocal estimate mode: %-5s divide: %s rsqrt: %s\n", + GET_MODE_NAME (m), + (RS6000_RECIP_AUTO_RE_P (m) + ? "auto" + : (RS6000_RECIP_HAVE_RE_P (m) ? "have" : "none")), + (RS6000_RECIP_AUTO_RSQRTE_P (m) + ? "auto" + : (RS6000_RECIP_HAVE_RSQRTE_P (m) ? "have" : "none"))); + } + + fputs ("\n", stderr); + } + switch (rs6000_sched_costly_dep) { case max_dep_latency: @@ -2014,8 +2090,9 @@ rs6000_init_hard_regno_mode_ok (void) rs6000_constraints[RS6000_CONSTRAINT_wa] = VSX_REGS; rs6000_constraints[RS6000_CONSTRAINT_wf] = VSX_REGS; rs6000_constraints[RS6000_CONSTRAINT_wd] = VSX_REGS; - if (TARGET_VSX_SCALAR_DOUBLE) - rs6000_constraints[RS6000_CONSTRAINT_ws] = VSX_REGS; + rs6000_constraints[RS6000_CONSTRAINT_ws] = (TARGET_VSX_SCALAR_MEMORY + ? 
VSX_REGS + : FLOAT_REGS); } if (TARGET_ALTIVEC) @@ -2093,8 +2170,111 @@ rs6000_init_hard_regno_mode_ok (void) if (TARGET_E500_DOUBLE) rs6000_class_max_nregs[DFmode][GENERAL_REGS] = 1; + /* Calculate which modes to automatically generate code to use a the + reciprocal divide and square root instructions. In the future, possibly + automatically generate the instructions even if the user did not specify + -mrecip. The older machines double precision reciprocal sqrt estimate is + not accurate enough. */ + memset (rs6000_recip_bits, 0, sizeof (rs6000_recip_bits)); + if (TARGET_FRES) + rs6000_recip_bits[SFmode] = RS6000_RECIP_MASK_HAVE_RE; + if (TARGET_FRE) + rs6000_recip_bits[DFmode] = RS6000_RECIP_MASK_HAVE_RE; + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)) + rs6000_recip_bits[V4SFmode] = RS6000_RECIP_MASK_HAVE_RE; + if (VECTOR_UNIT_VSX_P (V2DFmode)) + rs6000_recip_bits[V2DFmode] = RS6000_RECIP_MASK_HAVE_RE; + + if (TARGET_FRSQRTES) + rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE; + if (TARGET_FRSQRTE) + rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE; + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode)) + rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE; + if (VECTOR_UNIT_VSX_P (V2DFmode)) + rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_HAVE_RSQRTE; + + if (rs6000_recip_control) + { + if (!TARGET_FUSED_MADD) + warning (0, "-mrecip requires -mfused-madd"); + if (!flag_finite_math_only) + warning (0, "-mrecip requires -ffinite-math or -ffast-math"); + if (flag_trapping_math) + warning (0, "-mrecip requires -fno-trapping-math or -ffast-math"); + if (!flag_reciprocal_math) + warning (0, "-mrecip requires -freciprocal-math or -ffast-math"); + if (TARGET_FUSED_MADD && flag_finite_math_only && !flag_trapping_math + && flag_reciprocal_math) + { + if (RS6000_RECIP_HAVE_RE_P (SFmode) + && (rs6000_recip_control & RECIP_SF_DIV) != 0) + rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RE; + + if (RS6000_RECIP_HAVE_RE_P (DFmode) + && 
(rs6000_recip_control & RECIP_DF_DIV) != 0) + rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RE; + + if (RS6000_RECIP_HAVE_RE_P (V4SFmode) + && (rs6000_recip_control & RECIP_V4SF_DIV) != 0) + rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RE; + + if (RS6000_RECIP_HAVE_RE_P (V2DFmode) + && (rs6000_recip_control & RECIP_V2DF_DIV) != 0) + rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RE; + + if (RS6000_RECIP_HAVE_RSQRTE_P (SFmode) + && (rs6000_recip_control & RECIP_SF_RSQRT) != 0) + rs6000_recip_bits[SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE; + + if (RS6000_RECIP_HAVE_RSQRTE_P (DFmode) + && (rs6000_recip_control & RECIP_DF_RSQRT) != 0) + rs6000_recip_bits[DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE; + + if (RS6000_RECIP_HAVE_RSQRTE_P (V4SFmode) + && (rs6000_recip_control & RECIP_V4SF_RSQRT) != 0) + rs6000_recip_bits[V4SFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE; + + if (RS6000_RECIP_HAVE_RSQRTE_P (V2DFmode) + && (rs6000_recip_control & RECIP_V2DF_RSQRT) != 0) + rs6000_recip_bits[V2DFmode] |= RS6000_RECIP_MASK_AUTO_RSQRTE; + } + } + if (TARGET_DEBUG_REG) rs6000_debug_reg_global (); + + if (TARGET_DEBUG_COST || TARGET_DEBUG_REG) + fprintf (stderr, + "SImode variable mult cost = %d\n" + "SImode constant mult cost = %d\n" + "SImode short constant mult cost = %d\n" + "DImode multipliciation cost = %d\n" + "SImode division cost = %d\n" + "DImode division cost = %d\n" + "Simple fp operation cost = %d\n" + "DFmode multiplication cost = %d\n" + "SFmode division cost = %d\n" + "DFmode division cost = %d\n" + "cache line size = %d\n" + "l1 cache size = %d\n" + "l2 cache size = %d\n" + "simultaneous prefetches = %d\n" + "\n", + rs6000_cost->mulsi, + rs6000_cost->mulsi_const, + rs6000_cost->mulsi_const9, + rs6000_cost->muldi, + rs6000_cost->divsi, + rs6000_cost->divdi, + rs6000_cost->fp, + rs6000_cost->dmul, + rs6000_cost->sdiv, + rs6000_cost->ddiv, + rs6000_cost->cache_line_size, + rs6000_cost->l1_cache_size, + rs6000_cost->l2_cache_size, + 
rs6000_cost->simultaneous_prefetches); } #if TARGET_MACHO @@ -2271,15 +2451,16 @@ rs6000_override_options (const char *default_cpu) | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND}, {"power6", PROCESSOR_POWER6, POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT - | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP}, + | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP + | MASK_RECIP_PRECISION}, {"power6x", PROCESSOR_POWER6, POWERPC_BASE_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_PPC_GFXOPT | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP - | MASK_MFPGPR}, + | MASK_MFPGPR | MASK_RECIP_PRECISION}, {"power7", PROCESSOR_POWER7, POWERPC_7400_MASK | MASK_POWERPC64 | MASK_PPC_GPOPT | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_CMPB | MASK_DFP | MASK_POPCNTD - | MASK_VSX}, /* Don't add MASK_ISEL by default */ + | MASK_VSX| MASK_RECIP_PRECISION}, /* Don't add MASK_ISEL by default */ {"powerpc", PROCESSOR_POWERPC, POWERPC_BASE_MASK}, {"powerpc64", PROCESSOR_POWERPC64, POWERPC_BASE_MASK | MASK_PPC_GFXOPT | MASK_POWERPC64}, @@ -2307,7 +2488,24 @@ rs6000_override_options (const char *default_cpu) | MASK_PPC_GFXOPT | MASK_POWERPC64 | MASK_ALTIVEC | MASK_MFCRF | MASK_POPCNTB | MASK_FPRND | MASK_MULHW | MASK_DLMZB | MASK_CMPB | MASK_MFPGPR | MASK_DFP - | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE) + | MASK_POPCNTD | MASK_VSX | MASK_ISEL | MASK_NO_UPDATE + | MASK_RECIP_PRECISION) + }; + + /* Masks for instructions set at various powerpc ISAs. */ + enum { + ISA_2_1_MASKS = MASK_MFCRF, + ISA_2_2_MASKS = (ISA_2_1_MASKS | MASK_POPCNTB | MASK_FPRND), + + /* For ISA 2.05, do not add MFPGPR, since it isn't in ISA 2.06, and + don't add ALTIVEC, since in general it isn't a win on power6. */ + ISA_2_5_MASKS = (ISA_2_2_MASKS | MASK_CMPB | MASK_RECIP_PRECISION + | MASK_DFP), + + /* For ISA 2.06, don't add ISEL, since in general it isn't a win, but + altivec is a win so enable it. 
*/ + ISA_2_6_MASKS = (ISA_2_5_MASKS | MASK_ALTIVEC | MASK_POPCNTD + | MASK_VSX | MASK_RECIP_PRECISION) }; /* Numerous experiment shows that IRA based loop pressure @@ -2449,10 +2647,17 @@ rs6000_override_options (const char *default_cpu) warning (0, msg); target_flags &= ~ MASK_VSX; } - else if (TARGET_VSX && !TARGET_ALTIVEC) - target_flags |= MASK_ALTIVEC; } + /* For the newer switches (vsx, dfp, etc.) set some of the older options, + unless the user explicitly used the -mno-<option> to disable the code. */ + if (TARGET_VSX) + target_flags |= (ISA_2_6_MASKS & (target_flags_explicit & ~ISA_2_6_MASKS)); + else if (TARGET_DFP) + target_flags |= (ISA_2_5_MASKS & (target_flags_explicit & ~ISA_2_5_MASKS)); + else if (TARGET_ALTIVEC) + target_flags |= (MASK_PPC_GFXOPT & (target_flags_explicit & ~MASK_PPC_GFXOPT)); + /* Set debug flags */ if (rs6000_debug_name) { @@ -2875,6 +3080,52 @@ rs6000_override_options (const char *default_cpu) the DERAT mispredict penalty. */ TARGET_AVOID_XFORM = (rs6000_cpu == PROCESSOR_POWER6 && TARGET_CMPB); + /* Set the -mrecip options. */ + if (rs6000_recip_name) + { + char *p = ASTRDUP (rs6000_recip_name); + char *q; + unsigned int mask, i; + bool invert; + + while ((q = strtok (p, ",")) != NULL) + { + p = NULL; + if (*q == '!') + { + invert = true; + q++; + } + else + invert = false; + + if (!strcmp (q, "default")) + mask = ((TARGET_RECIP_PRECISION) + ? 
RECIP_HIGH_PRECISION : RECIP_LOW_PRECISION); + else + { + for (i = 0; i < ARRAY_SIZE (recip_options); i++) + if (!strcmp (q, recip_options[i].string)) + { + mask = recip_options[i].mask; + break; + } + + if (i == ARRAY_SIZE (recip_options)) + { + error ("Unknown option for -mrecip=%s", q); + invert = false; + mask = 0; + } + } + + if (invert) + rs6000_recip_control &= ~mask; + else + rs6000_recip_control |= mask; + } + } + rs6000_init_hard_regno_mode_ok (); } @@ -3191,12 +3442,10 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out, { enum machine_mode in_mode, out_mode; int in_n, out_n; - enum built_in_function fn = DECL_FUNCTION_CODE (fndecl); if (TREE_CODE (type_out) != VECTOR_TYPE || TREE_CODE (type_in) != VECTOR_TYPE - || !TARGET_VECTORIZE_BUILTINS - || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) + || !TARGET_VECTORIZE_BUILTINS) return NULL_TREE; out_mode = TYPE_MODE (TREE_TYPE (type_out)); @@ -3204,111 +3453,151 @@ rs6000_builtin_vectorized_function (tree fndecl, tree type_out, in_mode = TYPE_MODE (TREE_TYPE (type_in)); in_n = TYPE_VECTOR_SUBPARTS (type_in); - switch (fn) + if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) { - case BUILT_IN_COPYSIGN: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP]; - break; - case BUILT_IN_COPYSIGNF: - if (out_mode != SFmode || out_n != 4 - || in_mode != SFmode || in_n != 4) - break; - if (VECTOR_UNIT_VSX_P (V4SFmode)) - return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP]; - if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) - return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF]; - break; - case BUILT_IN_SQRT: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP]; - break; - case BUILT_IN_SQRTF: - if (VECTOR_UNIT_VSX_P (V4SFmode) - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n 
== 4) - return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP]; - break; - case BUILT_IN_CEIL: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP]; - break; - case BUILT_IN_CEILF: - if (out_mode != SFmode || out_n != 4 - || in_mode != SFmode || in_n != 4) - break; - if (VECTOR_UNIT_VSX_P (V4SFmode)) - return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP]; - if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) - return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP]; - break; - case BUILT_IN_FLOOR: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM]; - break; - case BUILT_IN_FLOORF: - if (out_mode != SFmode || out_n != 4 - || in_mode != SFmode || in_n != 4) - break; - if (VECTOR_UNIT_VSX_P (V4SFmode)) - return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM]; - if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) - return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM]; - break; - case BUILT_IN_TRUNC: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ]; - break; - case BUILT_IN_TRUNCF: - if (out_mode != SFmode || out_n != 4 - || in_mode != SFmode || in_n != 4) - break; - if (VECTOR_UNIT_VSX_P (V4SFmode)) - return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ]; - if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) - return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ]; - break; - case BUILT_IN_NEARBYINT: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && flag_unsafe_math_optimizations - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI]; - break; - case BUILT_IN_NEARBYINTF: - if (VECTOR_UNIT_VSX_P (V4SFmode) - && flag_unsafe_math_optimizations - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return 
rs6000_builtin_decls[VSX_BUILTIN_XVRSPI]; - break; - case BUILT_IN_RINT: - if (VECTOR_UNIT_VSX_P (V2DFmode) - && !flag_trapping_math - && out_mode == DFmode && out_n == 2 - && in_mode == DFmode && in_n == 2) - return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC]; - break; - case BUILT_IN_RINTF: - if (VECTOR_UNIT_VSX_P (V4SFmode) - && !flag_trapping_math - && out_mode == SFmode && out_n == 4 - && in_mode == SFmode && in_n == 4) - return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC]; - break; - default: - break; + enum built_in_function fn = DECL_FUNCTION_CODE (fndecl); + switch (fn) + { + case BUILT_IN_COPYSIGN: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_CPSGNDP]; + break; + case BUILT_IN_COPYSIGNF: + if (out_mode != SFmode || out_n != 4 + || in_mode != SFmode || in_n != 4) + break; + if (VECTOR_UNIT_VSX_P (V4SFmode)) + return rs6000_builtin_decls[VSX_BUILTIN_CPSGNSP]; + if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_COPYSIGN_V4SF]; + break; + case BUILT_IN_SQRT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTDP]; + break; + case BUILT_IN_SQRTF: + if (VECTOR_UNIT_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[VSX_BUILTIN_XVSQRTSP]; + break; + case BUILT_IN_CEIL: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIP]; + break; + case BUILT_IN_CEILF: + if (out_mode != SFmode || out_n != 4 + || in_mode != SFmode || in_n != 4) + break; + if (VECTOR_UNIT_VSX_P (V4SFmode)) + return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIP]; + if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIP]; + break; + case BUILT_IN_FLOOR: + 
if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIM]; + break; + case BUILT_IN_FLOORF: + if (out_mode != SFmode || out_n != 4 + || in_mode != SFmode || in_n != 4) + break; + if (VECTOR_UNIT_VSX_P (V4SFmode)) + return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIM]; + if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM]; + break; + case BUILT_IN_TRUNC: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIZ]; + break; + case BUILT_IN_TRUNCF: + if (out_mode != SFmode || out_n != 4 + || in_mode != SFmode || in_n != 4) + break; + if (VECTOR_UNIT_VSX_P (V4SFmode)) + return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIZ]; + if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIZ]; + break; + case BUILT_IN_NEARBYINT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && flag_unsafe_math_optimizations + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVRDPI]; + break; + case BUILT_IN_NEARBYINTF: + if (VECTOR_UNIT_VSX_P (V4SFmode) + && flag_unsafe_math_optimizations + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[VSX_BUILTIN_XVRSPI]; + break; + case BUILT_IN_RINT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && !flag_trapping_math + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_XVRDPIC]; + break; + case BUILT_IN_RINTF: + if (VECTOR_UNIT_VSX_P (V4SFmode) + && !flag_trapping_math + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[VSX_BUILTIN_XVRSPIC]; + break; + default: + break; + } } + + else if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) + { + enum rs6000_builtins fn + = (enum 
rs6000_builtins)DECL_FUNCTION_CODE (fndecl); + switch (fn) + { + case RS6000_BUILTIN_RSQRTF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRSQRTFP]; + break; + case RS6000_BUILTIN_RSQRT: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF]; + break; + case RS6000_BUILTIN_RECIPF: + if (VECTOR_UNIT_ALTIVEC_OR_VSX_P (V4SFmode) + && out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRECIPFP]; + break; + case RS6000_BUILTIN_RECIP: + if (VECTOR_UNIT_VSX_P (V2DFmode) + && out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return rs6000_builtin_decls[VSX_BUILTIN_RECIP_V2DF]; + break; + default: + break; + } + } + return NULL_TREE; } @@ -3668,6 +3957,14 @@ rs6000_handle_option (size_t code, const char *arg, int value) target_flags_explicit |= MASK_SOFT_FLOAT; rs6000_single_float = rs6000_double_float = 0; } + break; + + case OPT_mrecip: + rs6000_recip_name = (value) ?
"default" : "none"; + break; + + case OPT_mrecip_: + rs6000_recip_name = arg; break; } return true; @@ -8865,6 +9161,7 @@ static struct builtin_description bdesc_2arg[] = { MASK_ALTIVEC, CODE_FOR_altivec_vpkshus, "__builtin_altivec_vpkshus", ALTIVEC_BUILTIN_VPKSHUS }, { MASK_ALTIVEC, CODE_FOR_altivec_vpkuwus, "__builtin_altivec_vpkuwus", ALTIVEC_BUILTIN_VPKUWUS }, { MASK_ALTIVEC, CODE_FOR_altivec_vpkswus, "__builtin_altivec_vpkswus", ALTIVEC_BUILTIN_VPKSWUS }, + { MASK_ALTIVEC, CODE_FOR_recipv4sf3, "__builtin_altivec_vrecipdivfp", ALTIVEC_BUILTIN_VRECIPFP }, { MASK_ALTIVEC, CODE_FOR_vrotlv16qi3, "__builtin_altivec_vrlb", ALTIVEC_BUILTIN_VRLB }, { MASK_ALTIVEC, CODE_FOR_vrotlv8hi3, "__builtin_altivec_vrlh", ALTIVEC_BUILTIN_VRLH }, { MASK_ALTIVEC, CODE_FOR_vrotlv4si3, "__builtin_altivec_vrlw", ALTIVEC_BUILTIN_VRLW }, @@ -8907,6 +9204,7 @@ static struct builtin_description bdesc_2arg[] = { MASK_VSX, CODE_FOR_subv2df3, "__builtin_vsx_xvsubdp", VSX_BUILTIN_XVSUBDP }, { MASK_VSX, CODE_FOR_mulv2df3, "__builtin_vsx_xvmuldp", VSX_BUILTIN_XVMULDP }, { MASK_VSX, CODE_FOR_divv2df3, "__builtin_vsx_xvdivdp", VSX_BUILTIN_XVDIVDP }, + { MASK_VSX, CODE_FOR_recipv2df3, "__builtin_vsx_xvrecipdivdp", VSX_BUILTIN_RECIP_V2DF }, { MASK_VSX, CODE_FOR_sminv2df3, "__builtin_vsx_xvmindp", VSX_BUILTIN_XVMINDP }, { MASK_VSX, CODE_FOR_smaxv2df3, "__builtin_vsx_xvmaxdp", VSX_BUILTIN_XVMAXDP }, { MASK_VSX, CODE_FOR_vsx_tdivv2df3_fe, "__builtin_vsx_xvtdivdp_fe", VSX_BUILTIN_XVTDIVDP_FE }, @@ -8919,6 +9217,7 @@ static struct builtin_description bdesc_2arg[] = { MASK_VSX, CODE_FOR_subv4sf3, "__builtin_vsx_xvsubsp", VSX_BUILTIN_XVSUBSP }, { MASK_VSX, CODE_FOR_mulv4sf3, "__builtin_vsx_xvmulsp", VSX_BUILTIN_XVMULSP }, { MASK_VSX, CODE_FOR_divv4sf3, "__builtin_vsx_xvdivsp", VSX_BUILTIN_XVDIVSP }, + { MASK_VSX, CODE_FOR_recipv4sf3, "__builtin_vsx_xvrecipdivsp", VSX_BUILTIN_RECIP_V4SF }, { MASK_VSX, CODE_FOR_sminv4sf3, "__builtin_vsx_xvminsp", VSX_BUILTIN_XVMINSP }, { MASK_VSX, CODE_FOR_smaxv4sf3, 
"__builtin_vsx_xvmaxsp", VSX_BUILTIN_XVMAXSP }, { MASK_VSX, CODE_FOR_vsx_tdivv4sf3_fe, "__builtin_vsx_xvtdivsp_fe", VSX_BUILTIN_XVTDIVSP_FE }, @@ -9035,6 +9334,7 @@ static struct builtin_description bdesc_2arg[] = { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_packsu", ALTIVEC_BUILTIN_VEC_PACKSU }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkswus", ALTIVEC_BUILTIN_VEC_VPKSWUS }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vpkshus", ALTIVEC_BUILTIN_VEC_VPKSHUS }, + { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_recipdiv", ALTIVEC_BUILTIN_VEC_RECIP }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rl", ALTIVEC_BUILTIN_VEC_RL }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlw", ALTIVEC_BUILTIN_VEC_VRLW }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_vrlh", ALTIVEC_BUILTIN_VEC_VRLH }, @@ -9364,12 +9664,13 @@ static struct builtin_description bdesc_1arg[] = { { MASK_ALTIVEC, CODE_FOR_altivec_vexptefp, "__builtin_altivec_vexptefp", ALTIVEC_BUILTIN_VEXPTEFP }, { MASK_ALTIVEC, CODE_FOR_altivec_vlogefp, "__builtin_altivec_vlogefp", ALTIVEC_BUILTIN_VLOGEFP }, - { MASK_ALTIVEC, CODE_FOR_altivec_vrefp, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP }, + { MASK_ALTIVEC, CODE_FOR_rev4sf2, "__builtin_altivec_vrefp", ALTIVEC_BUILTIN_VREFP }, { MASK_ALTIVEC, CODE_FOR_vector_floorv4sf2, "__builtin_altivec_vrfim", ALTIVEC_BUILTIN_VRFIM }, { MASK_ALTIVEC, CODE_FOR_altivec_vrfin, "__builtin_altivec_vrfin", ALTIVEC_BUILTIN_VRFIN }, { MASK_ALTIVEC, CODE_FOR_vector_ceilv4sf2, "__builtin_altivec_vrfip", ALTIVEC_BUILTIN_VRFIP }, { MASK_ALTIVEC, CODE_FOR_vector_btruncv4sf2, "__builtin_altivec_vrfiz", ALTIVEC_BUILTIN_VRFIZ }, - { MASK_ALTIVEC, CODE_FOR_altivec_vrsqrtefp, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP }, + { MASK_ALTIVEC, CODE_FOR_rsqrtv4sf2, "__builtin_altivec_vrsqrtfp", ALTIVEC_BUILTIN_VRSQRTFP }, + { MASK_ALTIVEC, CODE_FOR_rsqrtev4sf2, "__builtin_altivec_vrsqrtefp", ALTIVEC_BUILTIN_VRSQRTEFP }, { MASK_ALTIVEC, 
CODE_FOR_altivec_vspltisb, "__builtin_altivec_vspltisb", ALTIVEC_BUILTIN_VSPLTISB }, { MASK_ALTIVEC, CODE_FOR_altivec_vspltish, "__builtin_altivec_vspltish", ALTIVEC_BUILTIN_VSPLTISH }, { MASK_ALTIVEC, CODE_FOR_altivec_vspltisw, "__builtin_altivec_vspltisw", ALTIVEC_BUILTIN_VSPLTISW }, @@ -9382,14 +9683,16 @@ static struct builtin_description bdesc_1arg[] = { MASK_VSX, CODE_FOR_negv2df2, "__builtin_vsx_xvnegdp", VSX_BUILTIN_XVNEGDP }, { MASK_VSX, CODE_FOR_sqrtv2df2, "__builtin_vsx_xvsqrtdp", VSX_BUILTIN_XVSQRTDP }, - { MASK_VSX, CODE_FOR_vsx_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP }, + { MASK_VSX, CODE_FOR_rsqrtv2df2, "__builtin_vsx_xvrsqrtdp", VSX_BUILTIN_VEC_RSQRT_V2DF }, + { MASK_VSX, CODE_FOR_rsqrtev2df2, "__builtin_vsx_xvrsqrtedp", VSX_BUILTIN_XVRSQRTEDP }, { MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fe, "__builtin_vsx_xvtsqrtdp_fe", VSX_BUILTIN_XVTSQRTDP_FE }, { MASK_VSX, CODE_FOR_vsx_tsqrtv2df2_fg, "__builtin_vsx_xvtsqrtdp_fg", VSX_BUILTIN_XVTSQRTDP_FG }, { MASK_VSX, CODE_FOR_vsx_frev2df2, "__builtin_vsx_xvredp", VSX_BUILTIN_XVREDP }, { MASK_VSX, CODE_FOR_negv4sf2, "__builtin_vsx_xvnegsp", VSX_BUILTIN_XVNEGSP }, { MASK_VSX, CODE_FOR_sqrtv4sf2, "__builtin_vsx_xvsqrtsp", VSX_BUILTIN_XVSQRTSP }, - { MASK_VSX, CODE_FOR_vsx_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP }, + { MASK_VSX, CODE_FOR_rsqrtv4sf2, "__builtin_vsx_xvrsqrtsp", VSX_BUILTIN_VEC_RSQRT_V4SF }, + { MASK_VSX, CODE_FOR_rsqrtev4sf2, "__builtin_vsx_xvrsqrtesp", VSX_BUILTIN_XVRSQRTESP }, { MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fe, "__builtin_vsx_xvtsqrtsp_fe", VSX_BUILTIN_XVTSQRTSP_FE }, { MASK_VSX, CODE_FOR_vsx_tsqrtv4sf2_fg, "__builtin_vsx_xvtsqrtsp_fg", VSX_BUILTIN_XVTSQRTSP_FG }, { MASK_VSX, CODE_FOR_vsx_frev4sf2, "__builtin_vsx_xvresp", VSX_BUILTIN_XVRESP }, @@ -9448,6 +9751,7 @@ static struct builtin_description bdesc_1arg[] = { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_mtvscr", ALTIVEC_BUILTIN_VEC_MTVSCR }, { MASK_ALTIVEC, CODE_FOR_nothing, 
"__builtin_vec_re", ALTIVEC_BUILTIN_VEC_RE }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_round", ALTIVEC_BUILTIN_VEC_ROUND }, + { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrt", ALTIVEC_BUILTIN_VEC_RSQRT }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_rsqrte", ALTIVEC_BUILTIN_VEC_RSQRTE }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_trunc", ALTIVEC_BUILTIN_VEC_TRUNC }, { MASK_ALTIVEC, CODE_FOR_nothing, "__builtin_vec_unpackh", ALTIVEC_BUILTIN_VEC_UNPACKH }, @@ -10963,73 +11267,83 @@ rs6000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, rtx ret; bool success; - if (fcode == RS6000_BUILTIN_RECIP) + switch (fcode) + { + case RS6000_BUILTIN_RECIP: return rs6000_expand_binop_builtin (CODE_FOR_recipdf3, exp, target); - if (fcode == RS6000_BUILTIN_RECIPF) + case RS6000_BUILTIN_RECIPF: return rs6000_expand_binop_builtin (CODE_FOR_recipsf3, exp, target); - if (fcode == RS6000_BUILTIN_RSQRTF) + case RS6000_BUILTIN_RSQRTF: return rs6000_expand_unop_builtin (CODE_FOR_rsqrtsf2, exp, target); - if (fcode == RS6000_BUILTIN_BSWAP_HI) - return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target); - - if (fcode == POWER7_BUILTIN_BPERMD) - return rs6000_expand_binop_builtin (((TARGET_64BIT) - ? CODE_FOR_bpermd_di - : CODE_FOR_bpermd_si), exp, target); + case RS6000_BUILTIN_RSQRT: + return rs6000_expand_unop_builtin (CODE_FOR_rsqrtdf2, exp, target); - if (fcode == ALTIVEC_BUILTIN_MASK_FOR_LOAD - || fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE) - { - int icode = (int) CODE_FOR_altivec_lvsr; - enum machine_mode tmode = insn_data[icode].operand[0].mode; - enum machine_mode mode = insn_data[icode].operand[1].mode; - tree arg; - rtx op, addr, pat; + case RS6000_BUILTIN_BSWAP_HI: + return rs6000_expand_unop_builtin (CODE_FOR_bswaphi2, exp, target); - gcc_assert (TARGET_ALTIVEC); + case POWER7_BUILTIN_BPERMD: + return rs6000_expand_binop_builtin (((TARGET_64BIT) + ? 
CODE_FOR_bpermd_di + : CODE_FOR_bpermd_si), exp, target); - arg = CALL_EXPR_ARG (exp, 0); - gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE); - op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL); - addr = memory_address (mode, op); - if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE) - op = addr; - else - { - /* For the load case need to negate the address. */ - op = gen_reg_rtx (GET_MODE (addr)); - emit_insn (gen_rtx_SET (VOIDmode, op, - gen_rtx_NEG (GET_MODE (addr), addr))); - } - op = gen_rtx_MEM (mode, op); + case ALTIVEC_BUILTIN_MASK_FOR_LOAD: + case ALTIVEC_BUILTIN_MASK_FOR_STORE: + { + int icode = (int) CODE_FOR_altivec_lvsr; + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode = insn_data[icode].operand[1].mode; + tree arg; + rtx op, addr, pat; + + gcc_assert (TARGET_ALTIVEC); + + arg = CALL_EXPR_ARG (exp, 0); + gcc_assert (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE); + op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL); + addr = memory_address (mode, op); + if (fcode == ALTIVEC_BUILTIN_MASK_FOR_STORE) + op = addr; + else + { + /* For the load case need to negate the address. */ + op = gen_reg_rtx (GET_MODE (addr)); + emit_insn (gen_rtx_SET (VOIDmode, op, + gen_rtx_NEG (GET_MODE (addr), addr))); + } + op = gen_rtx_MEM (mode, op); - if (target == 0 - || GET_MODE (target) != tmode - || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) - target = gen_reg_rtx (tmode); + if (target == 0 + || GET_MODE (target) != tmode + || ! 
(*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); - /*pat = gen_altivec_lvsr (target, op);*/ - pat = GEN_FCN (icode) (target, op); - if (!pat) - return 0; - emit_insn (pat); + /*pat = gen_altivec_lvsr (target, op);*/ + pat = GEN_FCN (icode) (target, op); + if (!pat) + return 0; + emit_insn (pat); - return target; - } + return target; + } + case ALTIVEC_BUILTIN_VCFUX: + case ALTIVEC_BUILTIN_VCFSX: + case ALTIVEC_BUILTIN_VCTUXS: + case ALTIVEC_BUILTIN_VCTSXS: /* FIXME: There's got to be a nicer way to handle this case than constructing a new CALL_EXPR. */ - if (fcode == ALTIVEC_BUILTIN_VCFUX - || fcode == ALTIVEC_BUILTIN_VCFSX - || fcode == ALTIVEC_BUILTIN_VCTUXS - || fcode == ALTIVEC_BUILTIN_VCTSXS) - { if (call_expr_nargs (exp) == 1) - exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp), - 2, CALL_EXPR_ARG (exp, 0), integer_zero_node); + { + exp = build_call_nary (TREE_TYPE (exp), CALL_EXPR_FN (exp), + 2, CALL_EXPR_ARG (exp, 0), integer_zero_node); + } + break; + + default: + break; } if (TARGET_ALTIVEC) @@ -11081,6 +11395,7 @@ static void rs6000_init_builtins (void) { tree tdecl; + tree ftype; V2SI_type_node = build_vector_type (intSI_type_node, 2); V2SF_type_node = build_vector_type (float_type_node, 2); @@ -11270,29 +11585,38 @@ rs6000_init_builtins (void) altivec_init_builtins (); if (TARGET_ALTIVEC || TARGET_SPE || TARGET_PAIRED_FLOAT || TARGET_VSX) rs6000_common_init_builtins (); - if (TARGET_PPC_GFXOPT) + if (TARGET_FRE) + { + ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode, + RS6000_BUILTIN_RECIP, + "__builtin_recipdiv"); + def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype, + RS6000_BUILTIN_RECIP); + } + if (TARGET_FRES) { - tree ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode, - RS6000_BUILTIN_RECIPF, - "__builtin_recipdivf"); + ftype = builtin_function_type (SFmode, SFmode, SFmode, VOIDmode, + RS6000_BUILTIN_RECIPF, + "__builtin_recipdivf"); def_builtin (MASK_PPC_GFXOPT, 
"__builtin_recipdivf", ftype, RS6000_BUILTIN_RECIPF); - + } + if (TARGET_FRSQRTE) + { + ftype = builtin_function_type (DFmode, DFmode, VOIDmode, VOIDmode, + RS6000_BUILTIN_RSQRT, + "__builtin_rsqrt"); + def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrt", ftype, + RS6000_BUILTIN_RSQRT); + } + if (TARGET_FRSQRTES) + { ftype = builtin_function_type (SFmode, SFmode, VOIDmode, VOIDmode, RS6000_BUILTIN_RSQRTF, "__builtin_rsqrtf"); def_builtin (MASK_PPC_GFXOPT, "__builtin_rsqrtf", ftype, RS6000_BUILTIN_RSQRTF); } - if (TARGET_POPCNTB) - { - tree ftype = builtin_function_type (DFmode, DFmode, DFmode, VOIDmode, - RS6000_BUILTIN_RECIP, - "__builtin_recipdiv"); - def_builtin (MASK_POPCNTB, "__builtin_recipdiv", ftype, - RS6000_BUILTIN_RECIP); - - } if (TARGET_POPCNTD) { enum machine_mode mode = (TARGET_64BIT) ? DImode : SImode; @@ -13800,30 +14124,16 @@ rs6000_preferred_reload_class (rtx x, enum reg_class rclass) if (GET_MODE_CLASS (mode) == MODE_INT && rclass == NON_SPECIAL_REGS) return GENERAL_REGS; - /* For VSX, prefer the traditional registers for DF if the address is of the - form reg+offset because we can use the non-VSX loads. Prefer the Altivec - registers if Altivec is handling the vector operations (i.e. V16QI, V8HI, - and V4SI). */ - if (rclass == VSX_REGS && VECTOR_MEM_VSX_P (mode)) + /* For VSX, prefer the traditional registers for 64-bit values because we can + use the non-VSX loads. Prefer the Altivec registers if Altivec is + handling the vector operations (i.e. V16QI, V8HI, and V4SI), or if we + prefer Altivec loads.. 
*/ + if (rclass == VSX_REGS) { - if (mode == DFmode && GET_CODE (x) == MEM) - { - rtx addr = XEXP (x, 0); - - if (legitimate_indirect_address_p (addr, false)) /* reg */ - return VSX_REGS; - - if (legitimate_indexed_address_p (addr, false)) /* reg+reg */ - return VSX_REGS; + if (GET_MODE_SIZE (mode) <= 8) + return FLOAT_REGS; - if (GET_CODE (addr) == PRE_MODIFY - && legitimate_indexed_address_p (XEXP (addr, 0), false)) - return VSX_REGS; - - return FLOAT_REGS; - } - - if (VECTOR_UNIT_ALTIVEC_P (mode)) + if (VECTOR_UNIT_ALTIVEC_P (mode) || VECTOR_MEM_ALTIVEC_P (mode)) return ALTIVEC_REGS; return rclass; @@ -25110,17 +25420,41 @@ static tree rs6000_builtin_reciprocal (unsigned int fn, bool md_fn, bool sqrt ATTRIBUTE_UNUSED) { - if (! (TARGET_RECIP && TARGET_PPC_GFXOPT && !optimize_size - && flag_finite_math_only && !flag_trapping_math - && flag_unsafe_math_optimizations)) + if (optimize_insn_for_size_p ()) return NULL_TREE; if (md_fn) - return NULL_TREE; + switch (fn) + { + case VSX_BUILTIN_XVSQRTDP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V2DFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V2DF]; + + case VSX_BUILTIN_XVSQRTSP: + if (!RS6000_RECIP_AUTO_RSQRTE_P (V4SFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[VSX_BUILTIN_VEC_RSQRT_V4SF]; + + default: + return NULL_TREE; + } + else switch (fn) { + case BUILT_IN_SQRT: + if (!RS6000_RECIP_AUTO_RSQRTE_P (DFmode)) + return NULL_TREE; + + return rs6000_builtin_decls[RS6000_BUILTIN_RSQRT]; + case BUILT_IN_SQRTF: + if (!RS6000_RECIP_AUTO_RSQRTE_P (SFmode)) + return NULL_TREE; + return rs6000_builtin_decls[RS6000_BUILTIN_RSQRTF]; default: @@ -25128,192 +25462,300 @@ rs6000_builtin_reciprocal (unsigned int fn, bool md_fn, } } -/* Newton-Raphson approximation of single-precision floating point divide n/d. - Assumes no trapping math and finite arguments. */ +/* Load up a constant. If the mode is a vector mode, splat the value across + all of the vector elements. 
*/ -void -rs6000_emit_swdivsf (rtx dst, rtx n, rtx d) +static rtx +rs6000_load_constant_and_splat (enum machine_mode mode, REAL_VALUE_TYPE dconst) +{ + rtx reg; + + if (mode == SFmode || mode == DFmode) + { + rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, mode); + reg = force_reg (mode, d); + } + else if (mode == V4SFmode) + { + rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, SFmode); + rtvec v = gen_rtvec (4, d, d, d, d); + reg = gen_reg_rtx (mode); + rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v)); + } + else if (mode == V2DFmode) + { + rtx d = CONST_DOUBLE_FROM_REAL_VALUE (dconst, DFmode); + rtvec v = gen_rtvec (2, d, d); + reg = gen_reg_rtx (mode); + rs6000_expand_vector_init (reg, gen_rtx_PARALLEL (mode, v)); + } + else + gcc_unreachable (); + + return reg; +} + +/* Generate a FMADD instruction: + dst = (m1 * m2) + a + + generating different RTL based on the fused multiply/add switch. */ + +static void +rs6000_emit_madd (rtx dst, rtx m1, rtx m2, rtx a) +{ + enum machine_mode mode = GET_MODE (dst); + + if (!TARGET_FUSED_MADD) + { + /* For the simple ops, use the generator function, rather than assuming + that the RTL is standard. */ + enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code; + enum insn_code acode = optab_handler (add_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode); + gen_2arg_fn_t gen_add = (gen_2arg_fn_t) GEN_FCN (acode); + rtx mreg = gen_reg_rtx (mode); + + gcc_assert (mcode != CODE_FOR_nothing && acode != CODE_FOR_nothing); + emit_insn (gen_mul (mreg, m1, m2)); + emit_insn (gen_add (dst, mreg, a)); + } + + else + emit_insn (gen_rtx_SET (VOIDmode, dst, + gen_rtx_PLUS (mode, + gen_rtx_MULT (mode, m1, m2), + a))); +} + +/* Generate a FMSUB instruction: + dst = (m1 * m2) - a + + generating different RTL based on the fused multiply/add switch. 
*/ + +static void +rs6000_emit_msub (rtx dst, rtx m1, rtx m2, rtx a) +{ + enum machine_mode mode = GET_MODE (dst); + + if (!TARGET_FUSED_MADD + || (mode == V4SFmode && VECTOR_UNIT_ALTIVEC_P (V4SFmode))) + { + /* For the simple ops, use the generator function, rather than assuming + that the RTL is standard. */ + enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code; + enum insn_code scode = optab_handler (sub_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode); + gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode); + rtx mreg = gen_reg_rtx (mode); + + gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing); + emit_insn (gen_mul (mreg, m1, m2)); + emit_insn (gen_sub (dst, mreg, a)); + } + + else + emit_insn (gen_rtx_SET (VOIDmode, dst, + gen_rtx_MINUS (mode, + gen_rtx_MULT (mode, m1, m2), + a))); +} + +/* Generate a FNMSUB instruction: + dst = - ((m1 * m2) - a) + + Which is equivalent to (except in the presence of -0.0): + dst = a - (m1 * m2) + + generating different RTL based on the fast-math and fused multiply/add + switches. */ + +static void +rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a) { - rtx x0, e0, e1, y1, u0, v0, one; + enum machine_mode mode = GET_MODE (dst); + + if (!TARGET_FUSED_MADD) + { + /* For the simple ops, use the generator function, rather than assuming + that the RTL is standard.
*/ + enum insn_code mcode = optab_handler (smul_optab, mode)->insn_code; + enum insn_code scode = optab_handler (sub_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (mcode); + gen_2arg_fn_t gen_sub = (gen_2arg_fn_t) GEN_FCN (scode); + rtx mreg = gen_reg_rtx (mode); - x0 = gen_reg_rtx (SFmode); - e0 = gen_reg_rtx (SFmode); - e1 = gen_reg_rtx (SFmode); - y1 = gen_reg_rtx (SFmode); - u0 = gen_reg_rtx (SFmode); - v0 = gen_reg_rtx (SFmode); - one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode)); + gcc_assert (mcode != CODE_FOR_nothing && scode != CODE_FOR_nothing); + emit_insn (gen_mul (mreg, m1, m2)); + emit_insn (gen_sub (dst, a, mreg)); + } + + else + { + rtx m = gen_rtx_MULT (mode, m1, m2); + + if (!HONOR_SIGNED_ZEROS (mode)) + emit_insn (gen_rtx_SET (VOIDmode, dst, gen_rtx_MINUS (mode, a, m))); + + else + emit_insn (gen_rtx_SET (VOIDmode, dst, + gen_rtx_NEG (mode, + gen_rtx_MINUS (mode, m, a)))); + } +} + +/* Newton-Raphson approximation of floating point divide with just 2 passes + (either single precision floating point, or newer machines with higher + accuracy estimates). Support both scalar and vector divide. Assumes no + trapping math and finite arguments. */ + +static void +rs6000_emit_swdiv_high_precision (rtx dst, rtx n, rtx d) +{ + enum machine_mode mode = GET_MODE (dst); + rtx x0, e0, e1, y1, u0, v0; + enum insn_code code = optab_handler (smul_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code); + rtx one = rs6000_load_constant_and_splat (mode, dconst1); + + gcc_assert (code != CODE_FOR_nothing); /* x0 = 1./d estimate */ + x0 = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_UNSPEC (SFmode, gen_rtvec (1, d), + gen_rtx_UNSPEC (mode, gen_rtvec (1, d), UNSPEC_FRES))); - /* e0 = 1. 
- d * x0 */ - emit_insn (gen_rtx_SET (VOIDmode, e0, - gen_rtx_MINUS (SFmode, one, - gen_rtx_MULT (SFmode, d, x0)))); - /* e1 = e0 + e0 * e0 */ - emit_insn (gen_rtx_SET (VOIDmode, e1, - gen_rtx_PLUS (SFmode, - gen_rtx_MULT (SFmode, e0, e0), e0))); - /* y1 = x0 + e1 * x0 */ - emit_insn (gen_rtx_SET (VOIDmode, y1, - gen_rtx_PLUS (SFmode, - gen_rtx_MULT (SFmode, e1, x0), x0))); - /* u0 = n * y1 */ - emit_insn (gen_rtx_SET (VOIDmode, u0, - gen_rtx_MULT (SFmode, n, y1))); - /* v0 = n - d * u0 */ - emit_insn (gen_rtx_SET (VOIDmode, v0, - gen_rtx_MINUS (SFmode, n, - gen_rtx_MULT (SFmode, d, u0)))); - /* dst = u0 + v0 * y1 */ - emit_insn (gen_rtx_SET (VOIDmode, dst, - gen_rtx_PLUS (SFmode, - gen_rtx_MULT (SFmode, v0, y1), u0))); -} - -/* Newton-Raphson approximation of double-precision floating point divide n/d. - Assumes no trapping math and finite arguments. */ -void -rs6000_emit_swdivdf (rtx dst, rtx n, rtx d) + e0 = gen_reg_rtx (mode); + rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - (d * x0) */ + + e1 = gen_reg_rtx (mode); + rs6000_emit_madd (e1, e0, e0, e0); /* e1 = (e0 * e0) + e0 */ + + y1 = gen_reg_rtx (mode); + rs6000_emit_madd (y1, e1, x0, x0); /* y1 = (e1 * x0) + x0 */ + + u0 = gen_reg_rtx (mode); + emit_insn (gen_mul (u0, n, y1)); /* u0 = n * y1 */ + + v0 = gen_reg_rtx (mode); + rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - (d * u0) */ + + rs6000_emit_madd (dst, v0, y1, u0); /* dst = (v0 * y1) + u0 */ +} + +/* Newton-Raphson approximation of floating point divide that has a low + precision estimate. Assumes no trapping math and finite arguments. 
*/ + +static void +rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d) { + enum machine_mode mode = GET_MODE (dst); rtx x0, e0, e1, e2, y1, y2, y3, u0, v0, one; + enum insn_code code = optab_handler (smul_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code); - x0 = gen_reg_rtx (DFmode); - e0 = gen_reg_rtx (DFmode); - e1 = gen_reg_rtx (DFmode); - e2 = gen_reg_rtx (DFmode); - y1 = gen_reg_rtx (DFmode); - y2 = gen_reg_rtx (DFmode); - y3 = gen_reg_rtx (DFmode); - u0 = gen_reg_rtx (DFmode); - v0 = gen_reg_rtx (DFmode); - one = force_reg (DFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, DFmode)); + gcc_assert (code != CODE_FOR_nothing); + + one = rs6000_load_constant_and_splat (mode, dconst1); /* x0 = 1./d estimate */ + x0 = gen_reg_rtx (mode); emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_UNSPEC (DFmode, gen_rtvec (1, d), + gen_rtx_UNSPEC (mode, gen_rtvec (1, d), UNSPEC_FRES))); - /* e0 = 1. - d * x0 */ - emit_insn (gen_rtx_SET (VOIDmode, e0, - gen_rtx_MINUS (DFmode, one, - gen_rtx_MULT (SFmode, d, x0)))); - /* y1 = x0 + e0 * x0 */ - emit_insn (gen_rtx_SET (VOIDmode, y1, - gen_rtx_PLUS (DFmode, - gen_rtx_MULT (DFmode, e0, x0), x0))); - /* e1 = e0 * e0 */ - emit_insn (gen_rtx_SET (VOIDmode, e1, - gen_rtx_MULT (DFmode, e0, e0))); - /* y2 = y1 + e1 * y1 */ - emit_insn (gen_rtx_SET (VOIDmode, y2, - gen_rtx_PLUS (DFmode, - gen_rtx_MULT (DFmode, e1, y1), y1))); - /* e2 = e1 * e1 */ - emit_insn (gen_rtx_SET (VOIDmode, e2, - gen_rtx_MULT (DFmode, e1, e1))); - /* y3 = y2 + e2 * y2 */ - emit_insn (gen_rtx_SET (VOIDmode, y3, - gen_rtx_PLUS (DFmode, - gen_rtx_MULT (DFmode, e2, y2), y2))); - /* u0 = n * y3 */ - emit_insn (gen_rtx_SET (VOIDmode, u0, - gen_rtx_MULT (DFmode, n, y3))); - /* v0 = n - d * u0 */ - emit_insn (gen_rtx_SET (VOIDmode, v0, - gen_rtx_MINUS (DFmode, n, - gen_rtx_MULT (DFmode, d, u0)))); - /* dst = u0 + v0 * y3 */ - emit_insn (gen_rtx_SET (VOIDmode, dst, - gen_rtx_PLUS (DFmode, - gen_rtx_MULT (DFmode, v0, y3), u0))); -} - - -/* 
Newton-Raphson approximation of single-precision floating point rsqrt. - Assumes no trapping math and finite arguments. */ + + e0 = gen_reg_rtx (mode); + rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - d * x0 */ + + y1 = gen_reg_rtx (mode); + rs6000_emit_madd (y1, e0, x0, x0); /* y1 = x0 + e0 * x0 */ + + e1 = gen_reg_rtx (mode); + emit_insn (gen_mul (e1, e0, e0)); /* e1 = e0 * e0 */ + + y2 = gen_reg_rtx (mode); + rs6000_emit_madd (y2, e1, y1, y1); /* y2 = y1 + e1 * y1 */ + + e2 = gen_reg_rtx (mode); + emit_insn (gen_mul (e2, e1, e1)); /* e2 = e1 * e1 */ + + y3 = gen_reg_rtx (mode); + rs6000_emit_madd (y3, e2, y2, y2); /* y3 = y2 + e2 * y2 */ + + u0 = gen_reg_rtx (mode); + emit_insn (gen_mul (u0, n, y3)); /* u0 = n * y3 */ + + v0 = gen_reg_rtx (mode); + rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - d * u0 */ + + rs6000_emit_madd (dst, v0, y3, u0); /* dst = u0 + v0 * y3 */ +} + +/* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P, + add a reg_note saying that this was a division. Support both scalar and + vector divide. Assumes no trapping math and finite arguments. 
*/ void -rs6000_emit_swrsqrtsf (rtx dst, rtx src) -{ - rtx x0, x1, x2, y1, u0, u1, u2, v0, v1, v2, t0, - half, one, halfthree, c1, cond, label; - - x0 = gen_reg_rtx (SFmode); - x1 = gen_reg_rtx (SFmode); - x2 = gen_reg_rtx (SFmode); - y1 = gen_reg_rtx (SFmode); - u0 = gen_reg_rtx (SFmode); - u1 = gen_reg_rtx (SFmode); - u2 = gen_reg_rtx (SFmode); - v0 = gen_reg_rtx (SFmode); - v1 = gen_reg_rtx (SFmode); - v2 = gen_reg_rtx (SFmode); - t0 = gen_reg_rtx (SFmode); - halfthree = gen_reg_rtx (SFmode); - cond = gen_rtx_REG (CCFPmode, CR1_REGNO); - label = gen_rtx_LABEL_REF (VOIDmode, gen_label_rtx ()); +rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p) +{ + enum machine_mode mode = GET_MODE (dst); - /* check 0.0, 1.0, NaN, Inf by testing src * src = src */ - emit_insn (gen_rtx_SET (VOIDmode, t0, - gen_rtx_MULT (SFmode, src, src))); + if (RS6000_RECIP_HIGH_PRECISION_P (mode)) + rs6000_emit_swdiv_high_precision (dst, n, d); + else + rs6000_emit_swdiv_low_precision (dst, n, d); - emit_insn (gen_rtx_SET (VOIDmode, cond, - gen_rtx_COMPARE (CCFPmode, t0, src))); - c1 = gen_rtx_EQ (VOIDmode, cond, const0_rtx); - emit_unlikely_jump (c1, label); + if (note_p) + add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d)); +} - half = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode)); - one = force_reg (SFmode, CONST_DOUBLE_FROM_REAL_VALUE (dconst1, SFmode)); +/* Newton-Raphson approximation of single/double-precision floating point + rsqrt. Assumes no trapping math and finite arguments. */ + +void +rs6000_emit_swrsqrt (rtx dst, rtx src) +{ + enum machine_mode mode = GET_MODE (src); + rtx x0 = gen_reg_rtx (mode); + rtx y = gen_reg_rtx (mode); + int passes = (TARGET_RECIP_PRECISION) ? 
2 : 3; + REAL_VALUE_TYPE dconst3_2; + int i; + rtx halfthree; + enum insn_code code = optab_handler (smul_optab, mode)->insn_code; + gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code); - /* halfthree = 1.5 = 1.0 + 0.5 */ - emit_insn (gen_rtx_SET (VOIDmode, halfthree, - gen_rtx_PLUS (SFmode, one, half))); + gcc_assert (code != CODE_FOR_nothing); + + /* Load up the constant 1.5 either as a scalar, or as a vector. */ + real_from_integer (&dconst3_2, VOIDmode, 3, 0, 0); + SET_REAL_EXP (&dconst3_2, REAL_EXP (&dconst3_2) - 1); + + halfthree = rs6000_load_constant_and_splat (mode, dconst3_2); /* x0 = rsqrt estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_UNSPEC (SFmode, gen_rtvec (1, src), + gen_rtx_UNSPEC (mode, gen_rtvec (1, src), UNSPEC_RSQRT))); - /* y1 = 0.5 * src = 1.5 * src - src -> fewer constants */ - emit_insn (gen_rtx_SET (VOIDmode, y1, - gen_rtx_MINUS (SFmode, - gen_rtx_MULT (SFmode, src, halfthree), - src))); - - /* x1 = x0 * (1.5 - y1 * (x0 * x0)) */ - emit_insn (gen_rtx_SET (VOIDmode, u0, - gen_rtx_MULT (SFmode, x0, x0))); - emit_insn (gen_rtx_SET (VOIDmode, v0, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u0)))); - emit_insn (gen_rtx_SET (VOIDmode, x1, - gen_rtx_MULT (SFmode, x0, v0))); - - /* x2 = x1 * (1.5 - y1 * (x1 * x1)) */ - emit_insn (gen_rtx_SET (VOIDmode, u1, - gen_rtx_MULT (SFmode, x1, x1))); - emit_insn (gen_rtx_SET (VOIDmode, v1, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u1)))); - emit_insn (gen_rtx_SET (VOIDmode, x2, - gen_rtx_MULT (SFmode, x1, v1))); - - /* dst = x2 * (1.5 - y1 * (x2 * x2)) */ - emit_insn (gen_rtx_SET (VOIDmode, u2, - gen_rtx_MULT (SFmode, x2, x2))); - emit_insn (gen_rtx_SET (VOIDmode, v2, - gen_rtx_MINUS (SFmode, - halfthree, - gen_rtx_MULT (SFmode, y1, u2)))); - emit_insn (gen_rtx_SET (VOIDmode, dst, - gen_rtx_MULT (SFmode, x2, v2))); + /* y = 0.5 * src = 1.5 * src - src -> fewer constants */ + rs6000_emit_msub (y, src, halfthree, src); - emit_label (XEXP (label, 
0)); + for (i = 0; i < passes; i++) + { + rtx x1 = gen_reg_rtx (mode); + rtx u = gen_reg_rtx (mode); + rtx v = gen_reg_rtx (mode); + + /* x1 = x0 * (1.5 - y * (x0 * x0)) */ + emit_insn (gen_mul (u, x0, x0)); + rs6000_emit_nmsub (v, y, u, halfthree); + emit_insn (gen_mul (x1, x0, v)); + x0 = x1; + } + + emit_move_insn (dst, x0); + return; } /* Emit popcount intrinsic on TARGET_POPCNTB (Power5) and TARGET_POPCNTD diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 327673e160c..5a1d7eeed68 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -543,6 +543,46 @@ extern int rs6000_vector_align[]; /* E500 processors only support plain "sync", not lwsync. */ #define TARGET_NO_LWSYNC TARGET_E500 +/* Which machine supports the various reciprocal estimate instructions. */ +#define TARGET_FRES (TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT \ + && TARGET_FPRS && TARGET_SINGLE_FLOAT) + +#define TARGET_FRE (TARGET_HARD_FLOAT && TARGET_FPRS \ + && TARGET_DOUBLE_FLOAT \ + && (TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode))) + +#define TARGET_FRSQRTES (TARGET_HARD_FLOAT && TARGET_POPCNTB \ + && TARGET_FPRS && TARGET_SINGLE_FLOAT) + +#define TARGET_FRSQRTE (TARGET_HARD_FLOAT && TARGET_FPRS \ + && TARGET_DOUBLE_FLOAT \ + && (TARGET_PPC_GFXOPT || VECTOR_UNIT_VSX_P (DFmode))) + +/* Whether the various reciprocal divide/square root estimate instructions + exist, and whether we should automatically generate code for the instruction + by default. */ +#define RS6000_RECIP_MASK_HAVE_RE 0x1 /* have RE instruction. */ +#define RS6000_RECIP_MASK_AUTO_RE 0x2 /* generate RE by default. */ +#define RS6000_RECIP_MASK_HAVE_RSQRTE 0x4 /* have RSQRTE instruction. */ +#define RS6000_RECIP_MASK_AUTO_RSQRTE 0x8 /* gen. RSQRTE by default. 
*/ + +extern unsigned char rs6000_recip_bits[]; + +#define RS6000_RECIP_HAVE_RE_P(MODE) \ + (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RE) + +#define RS6000_RECIP_AUTO_RE_P(MODE) \ + (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RE) + +#define RS6000_RECIP_HAVE_RSQRTE_P(MODE) \ + (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_HAVE_RSQRTE) + +#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \ + (rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE) + +#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \ + ((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION) + /* Sometimes certain combinations of command options do not make sense on a particular target machine. You can define a macro `OVERRIDE_OPTIONS' to take account of this. This macro, if diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index bcb66ec1479..8f7093a6735 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -220,6 +220,9 @@ ; These modes do not fit in integer registers in 32-bit mode. (define_mode_iterator DIFD [DI DF DD]) +;; Iterator for reciprocal estimate instructions +(define_mode_iterator RECIPF [SF DF V4SF V2DF]) + ; Various instructions that come in SI and DI forms. ; A generic w/d attribute, for things like cmpw/cmpd. (define_mode_attr wd [(QI "b") (HI "h") (SI "w") (DI "d")]) @@ -240,6 +243,11 @@ (define_mode_attr mptrsize [(SI "si") (DI "di")]) +(define_mode_attr rreg [(SF "f") + (DF "Ws") + (V4SF "Wf") + (V2DF "Wd")]) + ;; Start with fixed-point load and store insns. Here we put only the more ;; complex forms. Basic data transfer is done later. 
@@ -5563,6 +5571,45 @@ [(set_attr "type" "var_delayed_compare,delayed_compare,var_delayed_compare,delayed_compare") (set_attr "length" "4,4,8,8")]) +;; Builtins to replace a division to generate FRE reciprocal estimate +;; instructions and the necessary fixup instructions +(define_expand "recip<mode>3" + [(match_operand:RECIPF 0 "gpc_reg_operand" "") + (match_operand:RECIPF 1 "gpc_reg_operand" "") + (match_operand:RECIPF 2 "gpc_reg_operand" "")] + "RS6000_RECIP_HAVE_RE_P (<MODE>mode)" +{ + rs6000_emit_swdiv (operands[0], operands[1], operands[2], false); + DONE; +}) + +;; Split to create division from FRE/FRES/etc. and fixup instead of the normal +;; hardware division. This is only done before register allocation and with +;; -ffast-math. This must appear before the divsf3/divdf3 insns. +(define_split + [(set (match_operand:RECIPF 0 "gpc_reg_operand" "") + (div:RECIPF (match_operand 1 "gpc_reg_operand" "") + (match_operand 2 "gpc_reg_operand" "")))] + "RS6000_RECIP_AUTO_RE_P (<MODE>mode) + && can_create_pseudo_p () && optimize_insn_for_speed_p () + && flag_finite_math_only && !flag_trapping_math && flag_reciprocal_math" + [(const_int 0)] +{ + rs6000_emit_swdiv (operands[0], operands[1], operands[2], true); + DONE; +}) + +;; Builtins to replace 1/sqrt(x) with instructions using RSQRTE and the +;; appropriate fixup. 
+(define_expand "rsqrt<mode>2" + [(match_operand:RECIPF 0 "gpc_reg_operand" "") + (match_operand:RECIPF 1 "gpc_reg_operand" "")] + "RS6000_RECIP_HAVE_RSQRTE_P (<MODE>mode)" +{ + rs6000_emit_swrsqrt (operands[0], operands[1]); + DONE; +}) + (define_split [(set (match_operand:CC 3 "cc_reg_not_micro_cr0_operand" "") (compare:CC (ashiftrt:SI (match_operand:SI 1 "gpc_reg_operand" "") @@ -5766,22 +5813,10 @@ "{fd|fdiv} %0,%1,%2" [(set_attr "type" "ddiv")]) -(define_expand "recipsf3" - [(set (match_operand:SF 0 "gpc_reg_operand" "=f") - (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f") - (match_operand:SF 2 "gpc_reg_operand" "f")] - UNSPEC_FRES))] - "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size - && flag_finite_math_only && !flag_trapping_math" -{ - rs6000_emit_swdivsf (operands[0], operands[1], operands[2]); - DONE; -}) - (define_insn "fres" [(set (match_operand:SF 0 "gpc_reg_operand" "=f") (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))] - "TARGET_PPC_GFXOPT && flag_finite_math_only" + "TARGET_FRES" "fres %0,%1" [(set_attr "type" "fp")]) @@ -5931,23 +5966,12 @@ "fsqrt %0,%1" [(set_attr "type" "dsqrt")]) -(define_expand "rsqrtsf2" +(define_insn "*rsqrtsf_internal1" [(set (match_operand:SF 0 "gpc_reg_operand" "=f") (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] UNSPEC_RSQRT))] - "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT && !optimize_size - && flag_finite_math_only && !flag_trapping_math" -{ - rs6000_emit_swrsqrtsf (operands[0], operands[1]); - DONE; -}) - -(define_insn "*rsqrt_internal1" - [(set (match_operand:SF 0 "gpc_reg_operand" "=f") - (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")] - UNSPEC_RSQRT))] - "TARGET_HARD_FLOAT && TARGET_PPC_GFXOPT" - "frsqrte %0,%1" + "TARGET_FRSQRTES" + "frsqrtes %0,%1" [(set_attr "type" "fp")]) (define_expand "copysignsf3" @@ -5960,9 +5984,18 @@ (match_dup 5)) (match_dup 3) (match_dup 4)))] - "TARGET_PPC_GFXOPT && TARGET_HARD_FLOAT && TARGET_FPRS && 
TARGET_SINGLE_FLOAT - && !HONOR_NANS (SFmode) && !HONOR_SIGNED_ZEROS (SFmode)" + "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT + && ((TARGET_PPC_GFXOPT + && !HONOR_NANS (SFmode) + && !HONOR_SIGNED_ZEROS (SFmode)) + || VECTOR_UNIT_VSX_P (DFmode))" { + if (VECTOR_UNIT_VSX_P (DFmode)) + { + emit_insn (gen_vsx_copysignsf3 (operands[0], operands[1], operands[2], + CONST0_RTX (SFmode))); + DONE; + } operands[3] = gen_reg_rtx (SFmode); operands[4] = gen_reg_rtx (SFmode); operands[5] = CONST0_RTX (SFmode); @@ -6222,31 +6255,21 @@ "{fd|fdiv} %0,%1,%2" [(set_attr "type" "ddiv")]) -(define_expand "recipdf3" - [(set (match_operand:DF 0 "gpc_reg_operand" "=d") - (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d") - (match_operand:DF 2 "gpc_reg_operand" "d")] - UNSPEC_FRES))] - "TARGET_RECIP && TARGET_HARD_FLOAT && TARGET_POPCNTB && !optimize_size - && flag_finite_math_only && !flag_trapping_math" -{ - rs6000_emit_swdivdf (operands[0], operands[1], operands[2]); - DONE; -}) - -(define_expand "fred" - [(set (match_operand:DF 0 "gpc_reg_operand" "=d") - (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")] UNSPEC_FRES))] - "(TARGET_POPCNTB || VECTOR_UNIT_VSX_P (DFmode)) && flag_finite_math_only" - "") - (define_insn "*fred_fpr" [(set (match_operand:DF 0 "gpc_reg_operand" "=f") (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "f")] UNSPEC_FRES))] - "TARGET_POPCNTB && flag_finite_math_only && !VECTOR_UNIT_VSX_P (DFmode)" + "TARGET_FRE && !VECTOR_UNIT_VSX_P (DFmode)" "fre %0,%1" [(set_attr "type" "fp")]) +(define_insn "*rsqrtdf_internal1" + [(set (match_operand:DF 0 "gpc_reg_operand" "=d") + (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")] + UNSPEC_RSQRT))] + "TARGET_FRSQRTE && !VECTOR_UNIT_VSX_P (DFmode)" + "frsqrte %0,%1" + [(set_attr "type" "fp")]) + (define_insn "*fmadddf4_fpr" [(set (match_operand:DF 0 "gpc_reg_operand" "=d") (plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d") diff --git a/gcc/config/rs6000/rs6000.opt 
b/gcc/config/rs6000/rs6000.opt index 63f0f8c1582..e70172a19a6 100644 --- a/gcc/config/rs6000/rs6000.opt +++ b/gcc/config/rs6000/rs6000.opt @@ -195,8 +195,16 @@ Target Report Var(TARGET_XL_COMPAT) Conform more closely to IBM XLC semantics mrecip -Target Report Var(TARGET_RECIP) -Generate software reciprocal sqrt for better throughput +Target Report +Generate software reciprocal divide and square root for better throughput. + +mrecip= +Target Report RejectNegative Joined +Generate software reciprocal divide and square root for better throughput. + +mrecip-precision +Target Report Mask(RECIP_PRECISION) +Assume that the reciprocal estimate instructions provide more accuracy. mno-fp-in-toc Target Report RejectNegative Var(TARGET_NO_FP_IN_TOC) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 46fb2926c9f..760baeb458d 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -267,6 +267,20 @@ "VECTOR_UNIT_VSX_P (<MODE>mode)" "") +(define_expand "rsqrte<mode>2" + [(set (match_operand:VEC_F 0 "vfloat_operand" "") + (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "")] + UNSPEC_RSQRT))] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" + "") + +(define_expand "re<mode>2" + [(set (match_operand:VEC_F 0 "vfloat_operand" "") + (unspec:VEC_F [(match_operand:VEC_F 1 "vfloat_operand" "f")] + UNSPEC_FRES))] + "VECTOR_UNIT_ALTIVEC_OR_VSX_P (<MODE>mode)" + "") + (define_expand "ftrunc<mode>2" [(set (match_operand:VEC_F 0 "vfloat_operand" "") (fix:VEC_F (match_operand:VEC_F 1 "vfloat_operand" "")))] diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 7d572a48412..213d53ae5d1 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -195,7 +195,7 @@ (UNSPEC_VSX_MSUB 511) (UNSPEC_VSX_NMADD 512) (UNSPEC_VSX_NMSUB 513) - (UNSPEC_VSX_RSQRTE 514) + ;; 514 deleted (UNSPEC_VSX_TDIV 515) (UNSPEC_VSX_TSQRT 516) (UNSPEC_VSX_XXPERMDI 517) @@ -446,10 +446,10 @@ [(set_attr "type" "<VStype_sqrt>") (set_attr "fp_type" 
"<VSfptype_sqrt>")]) -(define_insn "vsx_rsqrte<mode>2" +(define_insn "*vsx_rsqrte<mode>2" [(set (match_operand:VSX_B 0 "vsx_register_operand" "=<VSr>,?wa") (unspec:VSX_B [(match_operand:VSX_B 1 "vsx_register_operand" "<VSr>,wa")] - UNSPEC_VSX_RSQRTE))] + UNSPEC_RSQRT))] "VECTOR_UNIT_VSX_P (<MODE>mode)" "x<VSv>rsqrte<VSs> %x0,%x1" [(set_attr "type" "<VStype_simple>") @@ -862,6 +862,20 @@ [(set_attr "type" "<VStype_simple>") (set_attr "fp_type" "<VSfptype_simple>")]) +;; Special version of copysign for single precision that knows internally +;; scalar single values are kept as double +(define_insn "vsx_copysignsf3" + [(set (match_operand:SF 0 "vsx_register_operand" "=f") + (if_then_else:SF + (ge:SF (match_operand:SF 2 "vsx_register_operand" "f") + (match_operand:SF 3 "zero_constant" "j")) + (abs:SF (match_operand:SF 1 "vsx_register_operand" "f")) + (neg:SF (abs:SF (match_dup 1)))))] + "VECTOR_UNIT_VSX_P (DFmode)" + "xscpsgndp %x0,%x2,%x1" + [(set_attr "type" "fp") + (set_attr "fp_type" "fp_addsub_d")]) + ;; For the conversions, limit the register class for the integer value to be ;; the fprs because we don't want to add the altivec registers to movdi/movsi. 
;; For the unsigned tests, there isn't a generic double -> unsigned conversion diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 8e9a7061b2b..5f0d7624a04 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -10994,6 +10994,10 @@ vector unsigned char vec_vrlb (vector unsigned char, vector float vec_round (vector float); +vector float vec_recip (vector float, vector float); + +vector float vec_rsqrt (vector float); + vector float vec_rsqrte (vector float); vector float vec_sel (vector float, vector float, vector bool int); @@ -11922,8 +11926,10 @@ vector double vec_or (vector bool long, vector double); vector double vec_perm (vector double, vector double, vector unsigned char); -vector float vec_rint (vector float); vector double vec_rint (vector double); +vector double vec_recip (vector double, vector double); +vector double vec_rsqrt (vector double); +vector double vec_rsqrte (vector double); vector double vec_sel (vector double, vector double, vector bool long); vector double vec_sel (vector double, vector double, vector unsigned long); vector double vec_sub (vector double, vector double); @@ -11964,10 +11970,20 @@ GCC provides a few other builtins on Powerpc to access certain instructions: float __builtin_recipdivf (float, float); float __builtin_rsqrtf (float); double __builtin_recipdiv (double, double); +double __builtin_rsqrt (double); long __builtin_bpermd (long, long); int __builtin_bswap16 (int); @end smallexample +The @code{vec_rsqrt}, @code{__builtin_rsqrt}, and +@code{__builtin_rsqrtf} functions generate multiple instructions to +implement the reciprocal sqrt functionality using reciprocal sqrt +estimate instructions. + +The @code{__builtin_recipdiv}, and @code{__builtin_recipdivf} +functions generate multiple instructions to implement division using +the reciprocal estimate instructions. 
+ @node RX Built-in Functions @subsection RX Built-in Functions GCC supports some of the RX instructions which cannot be expressed in diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 2a4ea479682..d8c0c22bed6 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -783,7 +783,8 @@ See RS/6000 and PowerPC Options. -mfloat-gprs=yes -mfloat-gprs=no -mfloat-gprs=single -mfloat-gprs=double @gol -mprototype -mno-prototype @gol -msim -mmvme -mads -myellowknife -memb -msdata @gol --msdata=@var{opt} -mvxworks -G @var{num} -pthread} +-msdata=@var{opt} -mvxworks -G @var{num} -pthread @gol +-mrecip -mrecip=@var{opt} -mno-recip -mrecip-precision -mno-recip-precision} @emph{RX Options} @gccoptlist{-m64bit-doubles -m32bit-doubles -fpu -nofpu@gol @@ -14975,17 +14976,6 @@ values for @var{cpu_type} are used for @option{-mtune} as for architecture, registers, and mnemonics set by @option{-mcpu}, but the scheduling parameters set by @option{-mtune}. -@item -mswdiv -@itemx -mno-swdiv -@opindex mswdiv -@opindex mno-swdiv -Generate code to compute division as reciprocal estimate and iterative -refinement, creating opportunities for increased throughput. This -feature requires: optional PowerPC Graphics instruction set for single -precision and FRE instruction for double precision, assuming divides -cannot generate user-visible traps, and the domain values not include -Infinities, denormals or zero denominator. - @item -maltivec @itemx -mno-altivec @opindex maltivec @@ -15641,6 +15631,52 @@ sequence. Adds support for multithreading with the @dfn{pthreads} library. This option sets flags for both the preprocessor and linker. +@item -mrecip +@itemx -mno-recip +@opindex mrecip +This option will enable GCC to use the reciprocal estimate and +reciprocal square root estimate instructions with additional +Newton-Raphson steps to increase precision instead of doing a divide or +square root and divide for floating point arguments. 
You should use +the @option{-ffast-math} option when using @option{-mrecip} (or at +least @option{-funsafe-math-optimizations}, +@option{-ffinite-math-only}, @option{-freciprocal-math} and +@option{-fno-trapping-math}). Note that while the throughput of the +sequence is generally higher than the throughput of the non-reciprocal +instruction, the precision of the sequence can be decreased by up to 2 +ulp (i.e. the inverse of 1.0 equals 0.99999994) for reciprocal square +roots. + +@item -mrecip=@var{opt} +@opindex mrecip=opt +This option allows you to control which reciprocal estimate instructions +may be used. @var{opt} is a comma separated list of options, that may +be preceded by a @code{!} to invert the option: +@code{all}: enable all estimate instructions, +@code{default}: enable the default instructions, equivalent to @option{-mrecip}, +@code{none}: disable all estimate instructions, equivalent to @option{-mno-recip}; +@code{div}: enable the reciprocal approximation instructions for both single and double precision; +@code{divf}: enable the single precision reciprocal approximation instructions; +@code{divd}: enable the double precision reciprocal approximation instructions; +@code{rsqrt}: enable the reciprocal square root approximation instructions for both single and double precision; +@code{rsqrtf}: enable the single precision reciprocal square root approximation instructions; +@code{rsqrtd}: enable the double precision reciprocal square root approximation instructions; + +So for example, @option{-mrecip=all,!rsqrtd} would enable +all of the reciprocal estimate instructions, except for the +@code{FRSQRTE}, @code{XSRSQRTEDP}, and @code{XVRSQRTEDP} instructions +which handle the double precision reciprocal square root calculations. + +@item -mrecip-precision +@itemx -mno-recip-precision +@opindex mrecip-precision +Assume (do not assume) that the reciprocal estimate instructions +provide higher precision estimates than is mandated by the powerpc +ABI. 
Selecting @option{-mcpu=power6} or @option{-mcpu=power7} +automatically selects @option{-mrecip-precision}. The double +precision square root estimate instructions are not generated by +default on low precision machines, since they do not provide an +estimate that converges after three steps. @end table @node RX Options diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 4f97f01350f..ccbd30c6fdc 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,16 @@ +2010-06-02 Michael Meissner <meissner@linux.vnet.ibm.com> + + PR target/44218 + * gcc.target/powerpc/recip-1.c: New test for -mrecip support. + * gcc.target/powerpc/recip-2.c: Ditto. + * gcc.target/powerpc/recip-3.c: Ditto. + * gcc.target/powerpc/recip-4.c: Ditto. + * gcc.target/powerpc/recip-5.c: Ditto. + * gcc.target/powerpc/recip-6.c: Ditto. + * gcc.target/powerpc/recip-7.c: Ditto. + * gcc.target/powerpc/recip-test.h: Ditto. + * gcc.target/powerpc/recip-test2.h: Ditto. + 2010-06-02 H.J. Lu <hongjiu.lu@intel.com> * g++.dg/torture/pr44295.C (size_t): Use __SIZE_TYPE__. 
diff --git a/gcc/testsuite/gcc.target/powerpc/recip-1.c b/gcc/testsuite/gcc.target/powerpc/recip-1.c new file mode 100644 index 00000000000..d1e383dc4ea --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */ +/* { dg-final { scan-assembler-times "frsqrte" 2 } } */ +/* { dg-final { scan-assembler-times "fmsub" 2 } } */ +/* { dg-final { scan-assembler-times "fmul" 8 } } */ +/* { dg-final { scan-assembler-times "fnmsub" 4 } } */ + +double +rsqrt_d (double a) +{ + return 1.0 / __builtin_sqrt (a); +} + +float +rsqrt_f (float a) +{ + return 1.0f / __builtin_sqrtf (a); +} diff --git a/gcc/testsuite/gcc.target/powerpc/recip-2.c b/gcc/testsuite/gcc.target/powerpc/recip-2.c new file mode 100644 index 00000000000..69442733aab --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-2.c @@ -0,0 +1,21 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power5" } */ +/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */ +/* { dg-final { scan-assembler-times "fmsubs" 1 } } */ +/* { dg-final { scan-assembler-times "fmuls" 6 } } */ +/* { dg-final { scan-assembler-times "fnmsubs" 3 } } */ +/* { dg-final { scan-assembler-times "fsqrt" 1 } } */ + +/* power5 resqrte is not accurate enough, and should not be generated by + default for -mrecip. 
*/ +double +rsqrt_d (double a) +{ + return 1.0 / __builtin_sqrt (a); +} + +float +rsqrt_f (float a) +{ + return 1.0f / __builtin_sqrtf (a); +} diff --git a/gcc/testsuite/gcc.target/powerpc/recip-3.c b/gcc/testsuite/gcc.target/powerpc/recip-3.c new file mode 100644 index 00000000000..80a34e8ee59 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-3.c @@ -0,0 +1,22 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power7" } */ +/* { dg-final { scan-assembler-times "xsrsqrtedp" 1 } } */ +/* { dg-final { scan-assembler-times "xsmsub.dp" 1 } } */ +/* { dg-final { scan-assembler-times "xsmuldp" 4 } } */ +/* { dg-final { scan-assembler-times "xsnmsub.dp" 2 } } */ +/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */ +/* { dg-final { scan-assembler-times "fmsubs" 1 } } */ +/* { dg-final { scan-assembler-times "fmuls" 4 } } */ +/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */ + +double +rsqrt_d (double a) +{ + return 1.0 / __builtin_sqrt (a); +} + +float +rsqrt_f (float a) +{ + return 1.0f / __builtin_sqrtf (a); +} diff --git a/gcc/testsuite/gcc.target/powerpc/recip-4.c b/gcc/testsuite/gcc.target/powerpc/recip-4.c new file mode 100644 index 00000000000..bd496d70e25 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-4.c @@ -0,0 +1,36 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-options "-O3 -ftree-vectorize -mrecip -ffast-math -mcpu=power7 -fno-unroll-loops" } */ +/* { dg-final { scan-assembler-times "xvrsqrtedp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmsub.dp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmuldp" 4 } } */ +/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */ +/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */ +/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */ + +#define SIZE 1024 + +extern double a_d[SIZE] 
__attribute__((__aligned__(32))); +extern double b_d[SIZE] __attribute__((__aligned__(32))); + +void +vectorize_rsqrt_d (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a_d[i] = 1.0 / __builtin_sqrt (b_d[i]); +} + +extern float a_f[SIZE] __attribute__((__aligned__(32))); +extern float b_f[SIZE] __attribute__((__aligned__(32))); + +void +vectorize_rsqrt_f (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a_f[i] = 1.0f / __builtin_sqrtf (b_f[i]); +} diff --git a/gcc/testsuite/gcc.target/powerpc/recip-5.c b/gcc/testsuite/gcc.target/powerpc/recip-5.c new file mode 100644 index 00000000000..4a9c496201a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-5.c @@ -0,0 +1,94 @@ +/* { dg-do compile { target { powerpc*-*-* } } } */ +/* { dg-options "-O3 -ftree-vectorize -mrecip=all -ffast-math -mcpu=power7 -fno-unroll-loops" } */ +/* { dg-final { scan-assembler-times "xvredp" 4 } } */ +/* { dg-final { scan-assembler-times "xvresp" 5 } } */ +/* { dg-final { scan-assembler-times "xsredp" 2 } } */ +/* { dg-final { scan-assembler-times "fres" 2 } } */ + +#include <altivec.h> + +float f_recip (float a, float b) { return __builtin_recipdivf (a, b); } +double d_recip (double a, double b) { return __builtin_recipdiv (a, b); } + +float f_div (float a, float b) { return a / b; } +double d_div (double a, double b) { return a / b; } + +#define SIZE 1024 + +double d_a[SIZE] __attribute__((__aligned__(32))); +double d_b[SIZE] __attribute__((__aligned__(32))); +double d_c[SIZE] __attribute__((__aligned__(32))); + +float f_a[SIZE] __attribute__((__aligned__(32))); +float f_b[SIZE] __attribute__((__aligned__(32))); +float f_c[SIZE] __attribute__((__aligned__(32))); + +void vec_f_recip (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + f_a[i] = __builtin_recipdivf (f_b[i], f_c[i]); +} + +void vec_d_recip (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + d_a[i] = __builtin_recipdiv (d_b[i], d_c[i]); +} + +void vec_f_div (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + f_a[i] = 
f_b[i] / f_c[i]; +} + +void vec_f_div2 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + f_a[i] = f_b[i] / 2.0f; +} + +void vec_f_div53 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + f_a[i] = f_b[i] / 53.0f; +} + +void vec_d_div (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + d_a[i] = d_b[i] / d_c[i]; +} + +void vec_d_div2 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + d_a[i] = d_b[i] / 2.0; +} + +void vec_d_div53 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + d_a[i] = d_b[i] / 53.0; +} + +vector float v4sf_recip1 (vector float a, vector float b) { return vec_recipdiv (a, b); } +vector float v4sf_recip2 (vector float a, vector float b) { return __builtin_altivec_vrecipdivfp (a, b); } +vector double v2df_recip1 (vector double a, vector double b) { return vec_recipdiv (a, b); } +vector float v4sf_recip3 (vector float a, vector float b) { return __builtin_vsx_xvrecipdivsp (a, b); } +vector double v2df_recip2 (vector double a, vector double b) { return __builtin_vsx_xvrecipdivdp (a, b); } diff --git a/gcc/testsuite/gcc.target/powerpc/recip-6.c b/gcc/testsuite/gcc.target/powerpc/recip-6.c new file mode 100644 index 00000000000..7d71df6709d --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-6.c @@ -0,0 +1,16 @@ +/* { dg-do run { target { powerpc*-*-linux* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */ +/* { dg-require-effective-target vsx_hw } */ +/* { dg-options "-mcpu=power7 -O3 -ftree-vectorize -ffast-math -mrecip=all -mrecip-precision" } */ + +/* Check reciprocal estimate functions for accuracy. 
*/ + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <math.h> +#include <float.h> +#include <string.h> + +#include "recip-test.h" diff --git a/gcc/testsuite/gcc.target/powerpc/recip-7.c b/gcc/testsuite/gcc.target/powerpc/recip-7.c new file mode 100644 index 00000000000..7b32ba076a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-7.c @@ -0,0 +1,16 @@ +/* { dg-do run { target { powerpc*-*-linux* } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-skip-if "" { powerpc*-*-*spe* } { "*" } { "" } } */ +/* { dg-require-effective-target ppc_recip_hw } */ +/* { dg-options "-O3 -ftree-vectorize -ffast-math -mrecip -mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb" } */ + +/* Check reciprocal estimate functions for accuracy. */ + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <math.h> +#include <float.h> +#include <string.h> + +#include "recip-test.h" diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test.h b/gcc/testsuite/gcc.target/powerpc/recip-test.h new file mode 100644 index 00000000000..7a42df5757d --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-test.h @@ -0,0 +1,149 @@ +/* Check reciprocal estimate functions for accuracy. */ + +#ifdef _ARCH_PPC64 +typedef unsigned long uns64_t; +#define UNUM64(x) x ## L + +#else +typedef unsigned long long uns64_t; +#define UNUM64(x) x ## LL +#endif + +typedef unsigned int uns32_t; + +#define TNAME2(x) #x +#define TNAME(x) TNAME2(x) + +/* + * Float functions. 
+ */ + +#define TYPE float +#define NAME(PREFIX) PREFIX ## _float +#define UNS_TYPE uns32_t +#define UNS_ABS __builtin_abs +#define EXP_SIZE 8 +#define MAN_SIZE 23 +#define FABS __builtin_fabsf +#define FMAX __builtin_fmaxf +#define FMIN __builtin_fminf +#define SQRT __builtin_sqrtf +#define RMIN 1.0e-10 +#define RMAX 1.0e+10 +#define BDIV 1 +#define BRSQRT 2 +#define ASMDIV "fdivs" +#define ASMSQRT "fsqrts" + +#define INIT_DIV \ +{ \ + { 0x4fffffff }, /* 8589934080 */ \ + { 0x4effffff }, /* 2147483520 */ \ + { 0x40ffffff }, /* 7.99999952316284 */ \ + { 0x3fffffff }, /* 1.99999988079071 */ \ + { 0x417fffff }, /* 15.9999990463257 */ \ + { 0x42ffffff }, /* 127.999992370605 */ \ + { 0x3dffffff }, /* 0.124999992549419 */ \ + { 0x3effffff }, /* 0.499999970197678 */ \ +} + +#define INIT_RSQRT \ +{ \ + { 0x457ffffe }, /* 4096 - small amount */ \ + { 0x4c7fffff }, /* 6.71089e+07 */ \ + { 0x3d7fffff }, /* 0.0625 - small amount */ \ + { 0x307ffffe }, /* 9.31322e-10 */ \ + { 0x4c7ffffe }, /* 6.71089e+07 */ \ + { 0x397ffffe }, /* 0.000244141 */ \ + { 0x2e7fffff }, /* 5.82077e-11 */ \ + { 0x2f7fffff }, /* 2.32831e-10 */ \ +} + + +#include "recip-test2.h" + +/* + * Double functions. 
+ */ + +#undef TYPE +#undef NAME +#undef UNS_TYPE +#undef UNS_ABS +#undef EXP_SIZE +#undef MAN_SIZE +#undef FABS +#undef FMAX +#undef FMIN +#undef SQRT +#undef RMIN +#undef RMAX +#undef BDIV +#undef BRSQRT +#undef ASMDIV +#undef ASMSQRT +#undef INIT_DIV +#undef INIT_RSQRT + +#define TYPE double +#define NAME(PREFIX) PREFIX ## _double +#define UNS_TYPE uns64_t +#define UNS_ABS __builtin_imaxabs +#define EXP_SIZE 11 +#define MAN_SIZE 52 +#define FABS __builtin_fabs +#define FMAX __builtin_fmax +#define FMIN __builtin_fmin +#define SQRT __builtin_sqrt +#define RMIN 1.0e-100 +#define RMAX 1.0e+100 +#define BDIV 1 +#define BRSQRT 2 +#define ASMDIV "fdiv" +#define ASMSQRT "fsqrt" + +#define INIT_DIV \ +{ \ + { UNUM64 (0x2b57be53f2a2f3a0) }, /* 6.78462e-100 */ \ + { UNUM64 (0x2b35f8e8ea553e52) }, /* 1.56963e-100 */ \ + { UNUM64 (0x2b5b9d861d2fe4fb) }, /* 7.89099e-100 */ \ + { UNUM64 (0x2b45dc44a084e682) }, /* 3.12327e-100 */ \ + { UNUM64 (0x2b424ce16945d777) }, /* 2.61463e-100 */ \ + { UNUM64 (0x2b20b5023d496b50) }, /* 5.96749e-101 */ \ + { UNUM64 (0x2b61170547f57caa) }, /* 9.76678e-100 */ \ + { UNUM64 (0x2b543b9d498aac37) }, /* 5.78148e-100 */ \ +} + +#define INIT_RSQRT \ +{ \ + { UNUM64 (0x2b616f2d8cbbc646) }, /* 9.96359e-100 */ \ + { UNUM64 (0x2b5c4db2da0a011d) }, /* 8.08764e-100 */ \ + { UNUM64 (0x2b55a82d5735b262) }, /* 6.1884e-100 */ \ + { UNUM64 (0x2b50b52908258cb8) }, /* 4.77416e-100 */ \ + { UNUM64 (0x2b363989a4fb29af) }, /* 1.58766e-100 */ \ + { UNUM64 (0x2b508b9f6f4180a9) }, /* 4.7278e-100 */ \ + { UNUM64 (0x2b4f7a1d48accb40) }, /* 4.49723e-100 */ \ + { UNUM64 (0x2b1146a37372a81f) }, /* 3.08534e-101 */ \ + { UNUM64 (0x2b33f876a8c48050) }, /* 1.42663e-100 */ \ +} + +#include "recip-test2.h" + +int +main (int argc __attribute__((__unused__)), + char *argv[] __attribute__((__unused__))) +{ + srand48 (1); + run_float (); + +#ifdef VERBOSE + printf ("\n"); +#endif + + run_double (); + + if (error_count_float != 0 || error_count_double != 0) + abort (); + + return 0; 
+} diff --git a/gcc/testsuite/gcc.target/powerpc/recip-test2.h b/gcc/testsuite/gcc.target/powerpc/recip-test2.h new file mode 100644 index 00000000000..3ec356cdfd8 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/recip-test2.h @@ -0,0 +1,432 @@ +/* + * Included file to common source float/double checking + * The following macros should be defined: + * TYPE -- floating point type + * NAME -- convert a name to include the type + * UNS_TYPE -- type to hold TYPE as an unsigned number + * EXP_SIZE -- size in bits of the exponent + * MAN_SIZE -- size in bits of the mantissa + * UNS_ABS -- absolute value for UNS_TYPE + * FABS -- absolute value function for TYPE + * FMAX -- maximum function for TYPE + * FMIN -- minimum function for TYPE + * SQRT -- square root function for TYPE + * RMIN -- minimum random number to generate + * RMAX -- maximum random number to generate + * ASMDIV -- assembler instruction to do divide + * ASMSQRT -- assembler instruction to do square root + * BDIV -- # of bits of inaccuracy to allow for division + * BRSQRT -- # of bits of inaccuracy to allow for 1/sqrt + * INIT_DIV -- Initial values to test 1/x against + * INIT_RSQRT -- Initial values to test 1/sqrt(x) against + */ + +typedef union +{ + UNS_TYPE i; + TYPE x; +} NAME (union); + +/* + * Input/output arrays. 
+ */ + +static NAME (union) NAME (div_input) [] __attribute__((__aligned__(32))) = INIT_DIV; +static NAME (union) NAME (rsqrt_input)[] __attribute__((__aligned__(32))) = INIT_RSQRT; + +#define DIV_SIZE (sizeof (NAME (div_input)) / sizeof (TYPE)) +#define RSQRT_SIZE (sizeof (NAME (rsqrt_input)) / sizeof (TYPE)) + +static TYPE NAME (div_expected)[DIV_SIZE] __attribute__((__aligned__(32))); +static TYPE NAME (div_output) [DIV_SIZE] __attribute__((__aligned__(32))); + +static TYPE NAME (rsqrt_expected)[RSQRT_SIZE] __attribute__((__aligned__(32))); +static TYPE NAME (rsqrt_output) [RSQRT_SIZE] __attribute__((__aligned__(32))); + + +/* + * Crack a floating point number into sign bit, exponent, and mantissa. + */ + +static void +NAME (crack) (TYPE number, unsigned int *p_sign, unsigned *p_exponent, UNS_TYPE *p_mantissa) +{ + NAME (union) u; + UNS_TYPE bits; + + u.x = number; + bits = u.i; + + *p_sign = (unsigned int)((bits >> (EXP_SIZE + MAN_SIZE)) & 0x1); + *p_exponent = (unsigned int)((bits >> MAN_SIZE) & ((((UNS_TYPE)1) << EXP_SIZE) - 1)); + *p_mantissa = bits & ((((UNS_TYPE)1) << MAN_SIZE) - 1); + return; +} + + +/* + * Prevent optimizer from eliminating + 0.0 to remove -0.0. + */ + +volatile TYPE NAME (math_diff_0) = ((TYPE) 0.0); + +/* + * Return negative if two numbers are significantly different or return the + * number of bits that are different in the mantissa. + */ + +static int +NAME (math_diff) (TYPE a, TYPE b, int bits) +{ + TYPE zero = NAME (math_diff_0); + unsigned int sign_a, sign_b; + unsigned int exponent_a, exponent_b; + UNS_TYPE mantissa_a, mantissa_b, diff; + int i; + + /* eliminate signed zero. */ + a += zero; + b += zero; + + /* special case Nan. */ + if (__builtin_isnan (a)) + return (__builtin_isnan (b) ? 0 : -1); + + if (a == b) + return 0; + + /* special case infinity. */ + if (__builtin_isinf (a)) + return (__builtin_isinf (b) ? 0 : -1); + + /* punt on denormal numbers. 
*/ + if (!__builtin_isnormal (a) || !__builtin_isnormal (b)) + return -1; + + NAME (crack) (a, &sign_a, &exponent_a, &mantissa_a); + NAME (crack) (b, &sign_b, &exponent_b, &mantissa_b); + + /* If the sign is different, there is no hope. */ + if (sign_a != sign_b) + return -1; + + /* If the exponent is off by 1, see if the values straddle the power of two, + and adjust things to do the mantissa check if we can. */ + if ((exponent_a == (exponent_b+1)) || (exponent_a == (exponent_b-1))) + { + TYPE big = FMAX (a, b); + TYPE small = FMIN (a, b); + TYPE diff = FABS (a - b); + unsigned int sign_big, sign_small, sign_test; + unsigned int exponent_big, exponent_small, exponent_test; + UNS_TYPE mantissa_big, mantissa_small, mantissa_test; + + NAME (crack) (big, &sign_big, &exponent_big, &mantissa_big); + NAME (crack) (small, &sign_small, &exponent_small, &mantissa_small); + + NAME (crack) (small - diff, &sign_test, &exponent_test, &mantissa_test); + if ((sign_test == sign_small) && (exponent_test == exponent_small)) + { + mantissa_a = mantissa_small; + mantissa_b = mantissa_test; + } + + else + { + NAME (crack) (big + diff, &sign_test, &exponent_test, &mantissa_test); + if ((sign_test == sign_big) && (exponent_test == exponent_big)) + { + mantissa_a = mantissa_big; + mantissa_b = mantissa_test; + } + + else + return -1; + } + } + + else if (exponent_a != exponent_b) + return -1; + + diff = UNS_ABS (mantissa_a - mantissa_b); + for (i = MAN_SIZE; i > 0; i--) + { + if ((diff & ((UNS_TYPE)1) << (i-1)) != 0) + return i; + } + + return -1; +} + + +/* + * Turn off inlining to make code inspection easier. 
+ */ + +static void NAME (asm_div) (void) __attribute__((__noinline__)); +static void NAME (vector_div) (void) __attribute__((__noinline__)); +static void NAME (scalar_div) (void) __attribute__((__noinline__)); +static void NAME (asm_rsqrt) (void) __attribute__((__noinline__)); +static void NAME (vector_rsqrt) (void) __attribute__((__noinline__)); +static void NAME (scalar_rsqrt) (void) __attribute__((__noinline__)); +static void NAME (check_div) (const char *) __attribute__((__noinline__)); +static void NAME (check_rsqrt) (const char *) __attribute__((__noinline__)); +static void NAME (run) (void) __attribute__((__noinline__)); + + +/* + * Division function that might be vectorized. + */ + +static void +NAME (vector_div) (void) +{ + size_t i; + + for (i = 0; i < DIV_SIZE; i++) + NAME (div_output)[i] = ((TYPE) 1.0) / NAME (div_input)[i].x; +} + +/* + * Division function that is not vectorized. + */ + +static void +NAME (scalar_div) (void) +{ + size_t i; + + for (i = 0; i < DIV_SIZE; i++) + { + TYPE x = ((TYPE) 1.0) / NAME (div_input)[i].x; + TYPE y; + __asm__ ("" : "=d" (y) : "0" (x)); + NAME (div_output)[i] = y; + } +} + +/* + * Generate the division instruction via asm. + */ + +static void +NAME (asm_div) (void) +{ + size_t i; + + for (i = 0; i < DIV_SIZE; i++) + { + TYPE x; + __asm__ (ASMDIV " %0,%1,%2" + : "=d" (x) + : "d" ((TYPE) 1.0), "d" (NAME (div_input)[i].x)); + NAME (div_expected)[i] = x; + } +} + +/* + * Reciprocal square root function that might be vectorized. + */ + +static void +NAME (vector_rsqrt) (void) +{ + size_t i; + + for (i = 0; i < RSQRT_SIZE; i++) + NAME (rsqrt_output)[i] = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x); +} + +/* + * Reciprocal square root function that is not vectorized. 
+ */ + +static void +NAME (scalar_rsqrt) (void) +{ + size_t i; + + for (i = 0; i < RSQRT_SIZE; i++) + { + TYPE x = ((TYPE) 1.0) / SQRT (NAME (rsqrt_input)[i].x); + TYPE y; + __asm__ ("" : "=d" (y) : "0" (x)); + NAME (rsqrt_output)[i] = y; + } +} + +/* + * Generate the 1/sqrt instructions via asm. + */ + +static void +NAME (asm_rsqrt) (void) +{ + size_t i; + + for (i = 0; i < RSQRT_SIZE; i++) + { + TYPE x; + TYPE y; + __asm__ (ASMSQRT " %0,%1" : "=d" (x) : "d" (NAME (rsqrt_input)[i].x)); + __asm__ (ASMDIV " %0,%1,%2" : "=d" (y) : "d" ((TYPE) 1.0), "d" (x)); + NAME (rsqrt_expected)[i] = y; + } +} + + +/* + * Functions to abort or report errors. + */ + +static int NAME (error_count) = 0; + +#ifdef VERBOSE +static int NAME (max_bits_div) = 0; +static int NAME (max_bits_rsqrt) = 0; +#endif + + +/* + * Compare the expected value with the value we got. + */ + +static void +NAME (check_div) (const char *test) +{ + size_t i; + int b; + + for (i = 0; i < DIV_SIZE; i++) + { + TYPE exp = NAME (div_expected)[i]; + TYPE out = NAME (div_output)[i]; + b = NAME (math_diff) (exp, out, BDIV); + +#ifdef VERBOSE + if (b != 0) + { + NAME (union) u_in = NAME (div_input)[i]; + NAME (union) u_exp; + NAME (union) u_out; + char explanation[64]; + const char *p_exp; + + if (b < 0) + p_exp = "failed"; + else + { + p_exp = explanation; + sprintf (explanation, "%d bit error%s", b, (b > BDIV) ? 
", failed" : ""); + } + + u_exp.x = exp; + u_out.x = out; + printf ("%s %s %s for 1.0 / %g [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n", + TNAME (TYPE), test, p_exp, + (double) u_in.x, (unsigned long long) u_in.i, + (double) exp, (unsigned long long) u_exp.i, + (double) out, (unsigned long long) u_out.i); + } +#endif + + if (b < 0 || b > BDIV) + NAME (error_count)++; + +#ifdef VERBOSE + if (b > NAME (max_bits_div)) + NAME (max_bits_div) = b; +#endif + } +} + +/* + * Compare the expected 1/sqrt value with the value we got; a result counts + * as an error when math_diff returns a negative value or more than BRSQRT + * bits of difference. + */ + +static void +NAME (check_rsqrt) (const char *test) +{ + size_t i; + int b; + + for (i = 0; i < RSQRT_SIZE; i++) + { + TYPE exp = NAME (rsqrt_expected)[i]; + TYPE out = NAME (rsqrt_output)[i]; + b = NAME (math_diff) (exp, out, BRSQRT); + +#ifdef VERBOSE + if (b != 0) + { + NAME (union) u_in = NAME (rsqrt_input)[i]; + NAME (union) u_exp; + NAME (union) u_out; + char explanation[64]; + const char *p_exp; + + if (b < 0) + p_exp = "failed"; + else + { + p_exp = explanation; + sprintf (explanation, "%d bit error%s", b, (b > BRSQRT) ? ", failed" : ""); + } + + u_exp.x = exp; + u_out.x = out; + printf ("%s %s %s for 1 / sqrt (%g) [0x%llx], expected %g [0x%llx], got %g [0x%llx]\n", + TNAME (TYPE), test, p_exp, + (double) u_in.x, (unsigned long long) u_in.i, + (double) exp, (unsigned long long) u_exp.i, + (double) out, (unsigned long long) u_out.i); + } +#endif + + if (b < 0 || b > BRSQRT) + NAME (error_count)++; + +#ifdef VERBOSE + if (b > NAME (max_bits_rsqrt)) + NAME (max_bits_rsqrt) = b; +#endif + } +} + + +/* + * Now do everything. + */ + +static void +NAME (run) (void) +{ +#ifdef VERBOSE + printf ("start run_%s, divide size = %ld, rsqrt size = %ld, %d bit%s for a/b, %d bit%s for 1/sqrt(a)\n", + TNAME (TYPE), + (long)DIV_SIZE, + (long)RSQRT_SIZE, + BDIV, (BDIV == 1) ? "" : "s", + BRSQRT, (BRSQRT == 1) ? 
"" : "s"); +#endif + + NAME (asm_div) (); + + NAME (scalar_div) (); + NAME (check_div) ("scalar"); + + NAME (vector_div) (); + NAME (check_div) ("vector"); + + NAME (asm_rsqrt) (); + + NAME (scalar_rsqrt) (); + NAME (check_rsqrt) ("scalar"); + + NAME (vector_rsqrt) (); + NAME (check_rsqrt) ("vector"); + +#ifdef VERBOSE + printf ("end run_%s, errors = %d, max div bits = %d, max rsqrt bits = %d\n", + TNAME (TYPE), + NAME (error_count), + NAME (max_bits_div), + NAME (max_bits_rsqrt)); +#endif +} diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 1a36127e95b..6ad8c344585 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -992,6 +992,30 @@ proc check_vmx_hw_available { } { }] } +proc check_ppc_recip_hw_available { } { + return [check_cached_effective_target ppc_recip_hw_available { + # Some simulators may not support FRE/FRES/FRSQRTE/FRSQRTES + # For now, disable on Darwin + if { [istarget powerpc-*-eabi] || [istarget powerpc*-*-eabispe] || [istarget *-*-darwin*]} { + expr 0 + } else { + set options "-mpowerpc-gfxopt -mpowerpc-gpopt -mpopcntb" + check_runtime_nocache ppc_recip_hw_available { + volatile double d_recip, d_rsqrt, d_four = 4.0; + volatile float f_recip, f_rsqrt, f_four = 4.0f; + int main() + { + asm volatile ("fres %0,%1" : "=f" (f_recip) : "f" (f_four)); + asm volatile ("fre %0,%1" : "=d" (d_recip) : "d" (d_four)); + asm volatile ("frsqrtes %0,%1" : "=f" (f_rsqrt) : "f" (f_four)); + asm volatile ("frsqrte %0,%1" : "=f" (d_rsqrt) : "d" (d_four)); + return 0; + } + } $options + } + }] +} + # Return 1 if the target supports executing AltiVec and Cell PPU # instructions, 0 otherwise. Cache the result. 
@@ -2972,6 +2996,8 @@ proc is-effective-target { arg } { } else { switch $arg { "vmx_hw" { set selected [check_vmx_hw_available] } + "vsx_hw" { set selected [check_vsx_hw_available] } + "ppc_recip_hw" { set selected [check_ppc_recip_hw_available] } "named_sections" { set selected [check_named_sections_available] } "gc_sections" { set selected [check_gc_sections_available] } "cxa_atexit" { set selected [check_cxa_atexit_available] } @@ -2991,6 +3017,8 @@ proc is-effective-target-keyword { arg } { # These have different names for their check_* procs. switch $arg { "vmx_hw" { return 1 } + "vsx_hw" { return 1 } + "ppc_recip_hw" { return 1 } "named_sections" { return 1 } "gc_sections" { return 1 } "cxa_atexit" { return 1 } |