diff options
-rw-r--r-- | gcc/ChangeLog | 63 | ||||
-rw-r--r-- | gcc/config/i386/i386-protos.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 139 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 1 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 50 | ||||
-rw-r--r-- | gcc/config/i386/i386.opt | 4 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 46 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 9 | ||||
-rw-r--r-- | gcc/doc/tm.texi | 9 | ||||
-rw-r--r-- | gcc/hooks.c | 10 | ||||
-rw-r--r-- | gcc/hooks.h | 1 | ||||
-rw-r--r-- | gcc/passes.c | 1 | ||||
-rw-r--r-- | gcc/target-def.h | 10 | ||||
-rw-r--r-- | gcc/target.h | 4 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/recip-divf.c | 9 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/recip-sqrtf.c | 21 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/recip-vec-divf.c | 16 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c | 34 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/sse-recip.c | 51 | ||||
-rw-r--r-- | gcc/tree-pass.h | 1 | ||||
-rw-r--r-- | gcc/tree-ssa-math-opts.c | 125 |
22 files changed, 604 insertions, 11 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abe28d49f45..f832b77e9a5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,66 @@ +2007-06-16 Uros Bizjak <ubizjak@gmail.com> + + PR middle-end/31723 + * hooks.c (hook_tree_tree_bool_null): New hook. + * hooks.h (hook_tree_tree_bool_null): Add prototype. + * tree-pass.h (pass_convert_to_rsqrt): Declare. + * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt. + * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b) + and convert it to reciprocal a*rfunc(b). + (execute_convert_to_rsqrt): New function. + (gate_convert_to_rsqrt): New function. + (pass_convert_to_rsqrt): New pass definition. + * target.h (struct gcc_target): Add builtin_reciprocal. + * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define. + (TARGET_INITIALIZER): Initialize builtin_reciprocal with + TARGET_BUILTIN_RECIPROCAL. + * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document. + + * config/i386/i386.h (TARGET_RECIP): New define. + * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf + for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and + flag_unsafe_math_optimizations are set, flag_trapping_math is unset + and not optimizing for size. + (*rcpsf2_sse): New insn pattern. + (*rsqrtsf2_sse): Ditto. + (rsqrtsf2): New expander. Expand by calling ix86_emit_swsqrtsf + for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and + flag_unsafe_math_optimizations are set, flag_trapping_math is unset + and not optimizing for size. + (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf + for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and + flag_unsafe_math_optimizations are set, flag_trapping_math is unset + and not optimizing for size. 
+ * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf + for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and + flag_unsafe_math_optimizations are set, flag_trapping_math is unset + and not optimizing for size. + (*sse_rsqrtv4sf2): Do not export. + (sqrtv4sf2): Ditto. + (sse_rsqrtv4sf2): New expander. Expand by calling ix86_emit_swsqrtsf + for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and + flag_unsafe_math_optimizations are set, flag_trapping_math is unset + and not optimizing for size. + (sqrtv4sf2): Ditto. + * config/i386/i386.opt (mrecip): New option. + * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare. + (ix86_emit_swsqrtsf): Ditto. + * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant. + (ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New + builtin definition. + (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using + ix86_expand_unop1_builtin. + (ix86_emit_swdivsf): New function. + (ix86_emit_swsqrtsf): Ditto. + (ix86_builtin_reciprocal): New function. + (TARGET_BUILTIN_RECIPROCAL): Use it. + (ix86_vectorize_builtin_conversion): Rename from + ix86_builtin_conversion. + (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function. + * doc/invoke.texi (Machine Dependent Options): Add -mrecip to + "i386 and x86_64 Options" section. + (Intel 386 and AMD x86_64 Options): Document -mrecip. 
+ 2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com> Zdenek Dvorak <dvorakz@suse.cz> Richard Guenther <rguenther@suse.de> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 60b495582aa..a0eab4852a8 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -163,6 +163,8 @@ extern void x86_emit_floatuns (rtx [2]); extern void ix86_emit_fp_unordered_jump (rtx); extern void ix86_emit_i387_log1p (rtx, rtx); +extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode); +extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool); extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index f9e67aa4af3..f4ae18bc353 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -16450,6 +16450,7 @@ enum ix86_builtins IX86_BUILTIN_RCPSS, IX86_BUILTIN_RSQRTPS, IX86_BUILTIN_RSQRTSS, + IX86_BUILTIN_RSQRTF, IX86_BUILTIN_SQRTPS, IX86_BUILTIN_SQRTSS, @@ -18039,6 +18040,10 @@ ix86_init_mmx_sse_builtins (void) def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS); + ftype = build_function_type_list (float_type_node, + float_type_node, + NULL_TREE); + def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); @@ -19133,6 +19138,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return 0; + case IX86_BUILTIN_RSQRTF: + return ix86_expand_unop1_builtin (CODE_FOR_rsqrtsf2, exp, target); + 
case IX86_BUILTIN_SQRTSS: return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target); case IX86_BUILTIN_RSQRTSS: @@ -19869,7 +19877,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out, input vector of type TYPE, or NULL_TREE if it is not available. */ static tree -ix86_builtin_conversion (unsigned int code, tree type) +ix86_vectorize_builtin_conversion (unsigned int code, tree type) { if (TREE_CODE (type) != VECTOR_TYPE) return NULL_TREE; @@ -19899,6 +19907,32 @@ ix86_builtin_conversion (unsigned int code, tree type) } } +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. */ + +static tree +ix86_builtin_reciprocal (unsigned int code, bool sqrt ATTRIBUTE_UNUSED) +{ + if (! (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations)) + return NULL_TREE; + + switch (code) + { + /* Sqrt to rsqrt conversion. */ + case BUILT_IN_SQRTF: + return ix86_builtins[IX86_BUILTIN_RSQRTF]; + + /* Vectorized version of sqrt to rsqrt conversion. */ + case IX86_BUILTIN_SQRTPS: + return ix86_builtins[IX86_BUILTIN_RSQRTPS]; + + default: + return NULL_TREE; + } +} + /* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx @@ -22501,6 +22535,100 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1) emit_label (label2); } +/* Output code to perform a Newton-Raphson approximation of a single precision + floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. 
*/ + +void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode) +{ + rtx x0, x1, e0, e1, two; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + x1 = gen_reg_rtx (mode); + + two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode); + + if (VECTOR_MODE_P (mode)) + two = ix86_build_const_vector (SFmode, true, two); + + two = force_reg (mode, two); + + /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */ + + /* x0 = 1./b estimate */ + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP))); + /* e0 = x0 * b */ + emit_insn (gen_rtx_SET (VOIDmode, e0, + gen_rtx_MULT (mode, x0, b))); + /* e1 = 2. - e0 */ + emit_insn (gen_rtx_SET (VOIDmode, e1, + gen_rtx_MINUS (mode, two, e0))); + /* x1 = x0 * e1 */ + emit_insn (gen_rtx_SET (VOIDmode, x1, + gen_rtx_MULT (mode, x0, e1))); + /* res = a * x1 */ + emit_insn (gen_rtx_SET (VOIDmode, res, + gen_rtx_MULT (mode, a, x1))); +} + +/* Output code to perform a Newton-Raphson approximation of a + single precision floating point [reciprocal] square root. 
*/ + +void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, + bool recip) +{ + rtx x0, e0, e1, e2, e3, three, half; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + e2 = gen_reg_rtx (mode); + e3 = gen_reg_rtx (mode); + + three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode); + half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode); + + if (VECTOR_MODE_P (mode)) + { + three = ix86_build_const_vector (SFmode, true, three); + half = ix86_build_const_vector (SFmode, true, half); + } + + three = force_reg (mode, three); + half = force_reg (mode, half); + + /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) + 1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */ + + /* x0 = 1./sqrt(a) estimate */ + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT))); + /* e0 = x0 * a */ + emit_insn (gen_rtx_SET (VOIDmode, e0, + gen_rtx_MULT (mode, x0, a))); + /* e1 = e0 * x0 */ + emit_insn (gen_rtx_SET (VOIDmode, e1, + gen_rtx_MULT (mode, e0, x0))); + /* e2 = 3. - e1 */ + emit_insn (gen_rtx_SET (VOIDmode, e2, + gen_rtx_MINUS (mode, three, e1))); + if (recip) + /* e3 = .5 * x0 */ + emit_insn (gen_rtx_SET (VOIDmode, e3, + gen_rtx_MULT (mode, half, x0))); + else + /* e3 = .5 * e0 */ + emit_insn (gen_rtx_SET (VOIDmode, e3, + gen_rtx_MULT (mode, half, e0))); + /* ret = e2 * e3 */ + emit_insn (gen_rtx_SET (VOIDmode, res, + gen_rtx_MULT (mode, e2, e3))); +} + /* Solaris implementation of TARGET_ASM_NAMED_SECTION. 
*/ static void ATTRIBUTE_UNUSED @@ -23205,9 +23333,14 @@ static const struct attribute_spec ix86_attribute_table[] = #define TARGET_EXPAND_BUILTIN ix86_expand_builtin #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function +#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ + ix86_builtin_vectorized_function + #undef TARGET_VECTORIZE_BUILTIN_CONVERSION -#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion +#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion + +#undef TARGET_BUILTIN_RECIPROCAL +#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 5089883dca5..ee1fbbc2765 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -365,6 +365,7 @@ extern int x86_prefetch_sse; #define TARGET_POPCNT x86_popcnt #define TARGET_PREFETCH_SSE x86_prefetch_sse #define TARGET_SAHF x86_sahf +#define TARGET_RECIP x86_recip #define ASSEMBLER_DIALECT (ix86_asm_dialect) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 0ecb9961d85..3f8f97b46cb 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -7470,7 +7470,16 @@ (div:SF (match_operand:SF 1 "register_operand" "") (match_operand:SF 2 "nonimmediate_operand" "")))] "TARGET_80387 || TARGET_SSE_MATH" - "") +{ + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], SFmode); + DONE; + } +}) ;; Remainder instructions. 
@@ -15516,6 +15525,15 @@ (const_string "fop"))) (set_attr "mode" "SF")]) +(define_insn "*rcpsf2_sse" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RCP))] + "TARGET_SSE_MATH" + "rcpss\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "mode" "SF")]) + (define_insn "*fop_sf_1_sse" [(set (match_operand:SF 0 "register_operand" "=x") (match_operator:SF 3 "binary_fp_operator" @@ -15980,6 +15998,27 @@ (set_attr "athlon_decode" "direct") (set_attr "amdfam10_decode" "direct")]) +(define_insn "*rsqrtsf2_sse" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT))] + "TARGET_SSE_MATH" + "rsqrtss\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "mode" "SF")]) + +(define_expand "rsqrtsf2" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT))] + "TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1); + DONE; +}) + (define_insn "*sqrt<mode>2_sse" [(set (match_operand:SSEMODEF 0 "register_operand" "=x") (sqrt:SSEMODEF @@ -15998,6 +16037,15 @@ "TARGET_USE_FANCY_MATH_387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)" { + if (<MODE>mode == SFmode + && TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0); + DONE; + } + if (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)) { rtx op0 = gen_reg_rtx (XFmode); diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 72b40c93987..1e36d0f0bab 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -258,3 +258,7 @@ Support code generation of popcnt instruction. 
msahf Target Report RejectNegative Var(x86_sahf) Support code generation of sahf instruction in 64bit x86-64 code. + +mrecip +Target Report RejectNegative Var(x86_recip) +Generate reciprocals instead of divss and sqrtss. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index c74c0f7fe69..65abbcf3b69 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -450,7 +450,18 @@ (div:V4SF (match_operand:V4SF 1 "register_operand" "") (match_operand:V4SF 2 "nonimmediate_operand" "")))] "TARGET_SSE" - "ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);") +{ + ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands); + + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], V4SFmode); + DONE; + } +}) (define_insn "*divv4sf3" [(set (match_operand:V4SF 0 "register_operand" "=x") @@ -494,7 +505,7 @@ [(set_attr "type" "sse") (set_attr "mode" "SF")]) -(define_insn "sse_rsqrtv4sf2" +(define_insn "*sse_rsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] @@ -503,6 +514,21 @@ [(set_attr "type" "sse") (set_attr "mode" "V4SF")]) +(define_expand "sse_rsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF + [(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))] + "TARGET_SSE" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 1); + DONE; + } +}) + (define_insn "sse_vmrsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -515,7 +541,7 @@ [(set_attr "type" "sse") (set_attr "mode" "SF")]) -(define_insn "sqrtv4sf2" +(define_insn "*sqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" 
"=x") (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE" @@ -523,6 +549,20 @@ [(set_attr "type" "sse") (set_attr "mode" "V4SF")]) +(define_expand "sqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=") + (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))] + "TARGET_SSE" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 0); + DONE; + } +}) + (define_insn "sse_vmsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index dddb37da84e..d12a6275735 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -548,7 +548,7 @@ Objective-C and Objective-C++ Dialects}. -masm=@var{dialect} -mno-fancy-math-387 @gol -mno-fp-ret-in-387 -msoft-float @gol -mno-wide-multiply -mrtd -malign-double @gol --mpreferred-stack-boundary=@var{num} -mcx16 -msahf @gol +-mpreferred-stack-boundary=@var{num} -mcx16 -msahf -mrecip @gol -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 @gol -msse4a -m3dnow -mpopcnt -mabm @gol -mthreads -mno-align-stringops -minline-all-stringops @gol @@ -10346,6 +10346,13 @@ SAHF are load and store instructions, respectively, for certain status flags. In 64-bit mode, SAHF instruction is used to optimize @code{fmod}, @code{drem} or @code{remainder} built-in functions: see @ref{Other Builtins} for details. +@item -mrecip +@opindex mrecip +This option will enable GCC to use RCPSS and RSQRTSS instructions (and their +vectorized variants RCPPS and RSQRTPS) instead of DIVSS and SQRTSS (and their +vectorized variants). These instructions will be generated only when +@option{-funsafe-math-optimizations} is enabled. 
+ @item -mpush-args @itemx -mno-push-args @opindex mpush-args diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 692082728c1..cd7ae6bdfae 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5345,6 +5345,15 @@ of @var{x}. The default version returns false for all constants. @end deftypefn +@deftypefn {Target Hook} tree TARGET_BUILTIN_RECIPROCAL (enum tree_code @var{code}, bool @var{sqrt}) +This hook should return the DECL of a function that implements reciprocal of +the builtin function with builtin function code @var{code}, or +@code{NULL_TREE} if such a function is not available. When @var{sqrt} is +true, additional optimizations that apply only to the reciprocal of a square +root function are performed, and only reciprocals of @code{sqrt} function +are valid. +@end deftypefn + @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD (void) This hook should return the DECL of a function @var{f} that given an address @var{addr} as an argument returns a mask @var{m} that can be diff --git a/gcc/hooks.c b/gcc/hooks.c index 18b17dc85f2..4c57a1687ee 100644 --- a/gcc/hooks.c +++ b/gcc/hooks.c @@ -266,7 +266,15 @@ hook_constcharptr_tree_null (tree t ATTRIBUTE_UNUSED) } tree -hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, tree t1 ATTRIBUTE_UNUSED, +hook_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, + bool ignore ATTRIBUTE_UNUSED) +{ + return NULL; +} + +tree +hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, + tree t1 ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED) { return NULL; diff --git a/gcc/hooks.h b/gcc/hooks.h index 02664c12803..15efef7ef22 100644 --- a/gcc/hooks.h +++ b/gcc/hooks.h @@ -58,6 +58,7 @@ extern int hook_int_void_no_regs (void); extern tree hook_tree_tree_tree_null (tree, tree); extern tree hook_tree_tree_tree_tree_3rd_identity (tree, tree, tree); +extern tree hook_tree_tree_bool_null (tree, bool); extern tree hook_tree_tree_tree_bool_null (tree, tree, bool); extern unsigned hook_uint_uint_constcharptrptr_0 (unsigned, 
const char **); diff --git a/gcc/passes.c b/gcc/passes.c index c4c94ff8806..c954847b82d 100644 --- a/gcc/passes.c +++ b/gcc/passes.c @@ -647,6 +647,7 @@ init_optimization_passes (void) NEXT_PASS (pass_tree_loop_done); } NEXT_PASS (pass_cse_reciprocals); + NEXT_PASS (pass_convert_to_rsqrt); NEXT_PASS (pass_reassoc); NEXT_PASS (pass_vrp); NEXT_PASS (pass_dominator); diff --git a/gcc/target-def.h b/gcc/target-def.h index 84532a6a04b..8acaa19bead 100644 --- a/gcc/target-def.h +++ b/gcc/target-def.h @@ -350,8 +350,10 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. TARGET_SCHED_SET_SCHED_FLAGS} #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0 -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION default_builtin_vectorized_function -#define TARGET_VECTORIZE_BUILTIN_CONVERSION default_builtin_vectorized_conversion +#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ + default_builtin_vectorized_function +#define TARGET_VECTORIZE_BUILTIN_CONVERSION \ + default_builtin_vectorized_conversion #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN 0 #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD 0 @@ -385,6 +387,9 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. #define TARGET_RESOLVE_OVERLOADED_BUILTIN NULL #define TARGET_FOLD_BUILTIN hook_tree_tree_tree_bool_null +/* In tree-ssa-math-opts.c */ +#define TARGET_BUILTIN_RECIPROCAL hook_tree_tree_bool_null + /* In varasm.c. */ #ifndef TARGET_SECTION_TYPE_FLAGS #define TARGET_SECTION_TYPE_FLAGS default_section_type_flags @@ -668,6 +673,7 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
TARGET_EXPAND_BUILTIN, \ TARGET_RESOLVE_OVERLOADED_BUILTIN, \ TARGET_FOLD_BUILTIN, \ + TARGET_BUILTIN_RECIPROCAL, \ TARGET_MANGLE_FUNDAMENTAL_TYPE, \ TARGET_INIT_LIBFUNCS, \ TARGET_SECTION_TYPE_FLAGS, \ diff --git a/gcc/target.h b/gcc/target.h index f769ae0938a..2d446a121dc 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -483,6 +483,10 @@ struct gcc_target /* Fold a target-specific builtin. */ tree (* fold_builtin) (tree fndecl, tree arglist, bool ignore); + /* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. */ + tree (* builtin_reciprocal) (unsigned, bool); + /* For a vendor-specific fundamental TYPE, return a pointer to a statically-allocated string containing the C++ mangling for TYPE. In all other cases, return NULL. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ba4707a541b..cc98594c016 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2007-06-16 Uros Bizjak <ubizjak@gmail.com> + + PR middle-end/31723 + * gcc.target/i386/recip-divf.c: New test. + * gcc.target/i386/recip-sqrtf.c: Ditto. + * gcc.target/i386/recip-vec-divf.c: Ditto. + * gcc.target/i386/recip-vec-sqrtf.c: Ditto. + * gcc.target/i386/sse-recip.c: Ditto. 
+ 2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com> PR tree-opt/32225 diff --git a/gcc/testsuite/gcc.target/i386/recip-divf.c b/gcc/testsuite/gcc.target/i386/recip-divf.c new file mode 100644 index 00000000000..0a2e9c8bd81 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/recip-divf.c @@ -0,0 +1,9 @@ +/* { dg-do compile { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */ + +float t1(float a, float b) +{ + return a / b; +} + +/* { dg-final { scan-assembler "rcpss" } } */ diff --git a/gcc/testsuite/gcc.target/i386/recip-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-sqrtf.c new file mode 100644 index 00000000000..c387077aae2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/recip-sqrtf.c @@ -0,0 +1,21 @@ +/* { dg-do compile { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */ + +extern float sqrtf (float); + +float t1(float a, float b) +{ + return a/sqrtf(b); +} + +float t2(float x, float a, float b) +{ + return sqrtf(a/b); +} + +float t3(float a) +{ + return sqrtf(a); +} + +/* { dg-final { scan-assembler-times "rsqrtss" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf.c b/gcc/testsuite/gcc.target/i386/recip-vec-divf.c new file mode 100644 index 00000000000..bf41e6c4fde --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf.c @@ -0,0 +1,16 @@ +/* { dg-do compile { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */ + +float a[16]; +float b[16]; +float r[16]; + +void t1(void) +{ + int i; + + for (i = 0; i < 16; i++) + r[i] = a[i] / b[i]; +} + +/* { dg-final { scan-assembler "rcpps" } } */ diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c new file mode 100644 index 00000000000..2eb3f861f42 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c @@ -0,0 +1,34 @@ +/* { dg-do compile { target 
i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */ + +float a[16]; +float b[16]; +float r[16]; + +extern float sqrtf (float); + +void t1(void) +{ + int i; + + for (i = 0; i < 16; i++) + r[i] = a[i] / sqrtf (b[i]); +} + +void t2(void) +{ + int i; + + for (i = 0; i < 16; i++) + r[i] = sqrtf (a[i] / b[i]); +} + +void t3(void) +{ + int i; + + for (i = 0; i < 16; i++) + r[i] = sqrtf (a[i]); +} + +/* { dg-final { scan-assembler-times "rsqrtps" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse-recip.c b/gcc/testsuite/gcc.target/i386/sse-recip.c new file mode 100644 index 00000000000..2d7dff91452 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-recip.c @@ -0,0 +1,51 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -msse -mfpmath=sse -mrecip" } */ + +#include "../../gcc.dg/i386-cpuid.h" + +extern float sqrtf (float); +extern void abort (void); + +#define N 8 + +int __attribute__((noinline)) +main1 () +{ + float a[N] = { 0.f, 18.f, 108.f, 324.f, 720.f, 1944.f, 3087.f, 5832.f }; + float b[N] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }; + float r[N]; + + float rc[N] = { 0.f, 3.f, 6.f, 9.f, 12.f, 18.f, 21.f, 27.f }; + + int i; + + for (i = 0; i < N; i++) + { + r[i] = sqrtf (a[i] / b[i]); + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (r[i] != rc[i]) + abort(); + } + + return 0; +} + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_cpuid (); + + if ((cpu_facilities & (bit_MMX | bit_SSE | bit_CMOV)) + != (bit_MMX | bit_SSE | bit_CMOV)) + /* If host has no vector support, pass. 
*/ + return 0; + + main1 (); + return 0; +} diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 6800edfdbc8..333ec41b853 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -293,6 +293,7 @@ extern struct tree_opt_pass pass_early_warn_uninitialized; extern struct tree_opt_pass pass_late_warn_uninitialized; extern struct tree_opt_pass pass_cse_reciprocals; extern struct tree_opt_pass pass_cse_sincos; +extern struct tree_opt_pass pass_convert_to_rsqrt; extern struct tree_opt_pass pass_warn_function_return; extern struct tree_opt_pass pass_warn_function_noreturn; extern struct tree_opt_pass pass_phiopt; diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index fe67993f8dc..0534dcf2f90 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -496,6 +496,46 @@ execute_cse_reciprocals (void) && TREE_CODE (def) == SSA_NAME) execute_cse_reciprocals_1 (&bsi, def); } + + /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */ + for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi)) + { + tree stmt = bsi_stmt (bsi); + tree fndecl; + + if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT + && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == RDIV_EXPR) + { + tree arg1 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt, 1), 1); + tree stmt1 = SSA_NAME_DEF_STMT (arg1); + + if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT + && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == CALL_EXPR + && (fndecl + = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt1, 1))) + && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL + || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)) + { + enum built_in_function code; + tree arg10; + tree tmp; + + code = DECL_FUNCTION_CODE (fndecl); + fndecl = targetm.builtin_reciprocal (code, false); + if (!fndecl) + continue; + + arg10 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt1, 1), 0); + tmp = build_call_expr (fndecl, 1, arg10); + GIMPLE_STMT_OPERAND (stmt1, 1) = tmp; + update_stmt (stmt1); + + TREE_SET_CODE (GIMPLE_STMT_OPERAND (stmt, 1), 
MULT_EXPR); + fold_stmt_inplace (stmt); + update_stmt (stmt); + } + } + } } free_dominance_info (CDI_DOMINATORS); @@ -726,3 +766,88 @@ struct tree_opt_pass pass_cse_sincos = | TODO_verify_stmts, /* todo_flags_finish */ 0 /* letter */ }; + +/* Find all expressions in the form of sqrt(a/b) and + convert them to rsqrt(b/a). */ + +static unsigned int +execute_convert_to_rsqrt (void) +{ + basic_block bb; + + FOR_EACH_BB (bb) + { + block_stmt_iterator bsi; + + for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi)) + { + tree stmt = bsi_stmt (bsi); + tree fndecl; + + if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT + && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == CALL_EXPR + && (fndecl = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt, 1))) + && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL + || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)) + { + enum built_in_function code; + tree arg1; + tree stmt1; + + code = DECL_FUNCTION_CODE (fndecl); + fndecl = targetm.builtin_reciprocal (code, true); + if (!fndecl) + continue; + + arg1 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt, 1), 0); + stmt1 = SSA_NAME_DEF_STMT (arg1); + + if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT + && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == RDIV_EXPR) + { + tree arg10, arg11; + tree tmp; + + arg10 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0); + arg11 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1); + + /* Swap operands of RDIV_EXPR. 
*/ + TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0) = arg11; + TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1) = arg10; + fold_stmt_inplace (stmt1); + update_stmt (stmt1); + + tmp = build_call_expr (fndecl, 1, arg1); + GIMPLE_STMT_OPERAND (stmt, 1) = tmp; + update_stmt (stmt); + } + } + } + } + + return 0; +} + +static bool +gate_convert_to_rsqrt (void) +{ + return flag_unsafe_math_optimizations && optimize; +} + +struct tree_opt_pass pass_convert_to_rsqrt = +{ + "rsqrt", /* name */ + gate_convert_to_rsqrt, /* gate */ + execute_convert_to_rsqrt, /* execute */ + NULL, /* sub */ + NULL, /* next */ + 0, /* static_pass_number */ + 0, /* tv_id */ + PROP_ssa, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_dump_func | TODO_update_ssa | TODO_verify_ssa + | TODO_verify_stmts, /* todo_flags_finish */ + 0 /* letter */ +}; |