summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoruros <uros@138bc75d-0d04-0410-961f-82ee72b054a4>2007-06-16 09:52:48 +0000
committeruros <uros@138bc75d-0d04-0410-961f-82ee72b054a4>2007-06-16 09:52:48 +0000
commite174638ff04a9c48151382a725c55994a8a8d558 (patch)
tree332aeacd4c176a2297138e64e399861ff84a0b59
parent0de36bdb2d7fa28a41b67b7ab3f8ccb92e1d2e03 (diff)
downloadgcc-e174638ff04a9c48151382a725c55994a8a8d558.tar.gz
PR middle-end/31723
* hooks.c (hook_tree_tree_bool_null): New hook. * hooks.h (hook_tree_tree_bool_null): Add prototype. * tree-pass.h (pass_convert_to_rsqrt): Declare. * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt. * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b) and convert it to reciprocal a*rfunc(b). (execute_convert_to_rsqrt): New function. (gate_convert_to_rsqrt): New function. (pass_convert_to_rsqrt): New pass definition. * target.h (struct gcc_target): Add builtin_reciprocal. * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define. (TARGET_INITIALIZER): Initialize builtin_reciprocal with TARGET_BUILTIN_RECIPROCAL. * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document. * config/i386/i386.h (TARGET_RECIP): New define. * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf for TARGET_SSE_MATH and TARGET_RECIP when flag_unsafe_math_optimizations is set and not optimizing for size. (*rcpsf2_sse): New insn pattern. (*rsqrtsf2_sse): Ditto. (rsqrtsf2): New expander. Expand by calling ix86_emit_swsqrtsf for TARGET_SSE_MATH and TARGET_RECIP when flag_unsafe_math_optimizations is set and not optimizing for size. (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf for TARGET_SSE_MATH and TARGET_RECIP when flag_unsafe_math_optimizations is set and not optimizing for size. * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf for TARGET_SSE_MATH and TARGET_RECIP when flag_unsafe_math_optimizations is set and not optimizing for size. (*sse_rsqrtv4sf2): Do not export. (sqrtv4sf2): Ditto. (sse_rsqrtv4sf2): New expander. Expand by calling ix86_emit_swsqrtsf for TARGET_SSE_MATH and TARGET_RECIP when flag_unsafe_math_optimizations is set and not optimizing for size. (sqrtv4sf2): Ditto. * config/i386/i386.opt (mrecip): New option. * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare. (ix86_emit_swsqrtsf): Ditto. * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant. 
(ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New builtin definition. (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using ix86_expand_unop1_builtin. (ix86_emit_swdivsf): New function. (ix86_emit_swsqrtsf): Ditto. (ix86_builtin_reciprocal): New function. (TARGET_BUILTIN_RECIPROCAL): Use it. (ix86_vectorize_builtin_conversion): Rename from ix86_builtin_conversion. (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function. * doc/invoke.texi (Machine Dependent Options): Add -mrecip to "i386 and x86_64 Options" section. (Intel 386 and AMD x86_64 Options): Document -mrecip. testsuite/ChangeLog: PR middle-end/31723 * gcc.target/i386/recip-divf.c: New test. * gcc.target/i386/recip-sqrtf.c: Ditto. * gcc.target/i386/recip-vec-divf.c: Ditto. * gcc.target/i386/recip-vec-sqrtf.c: Ditto. * gcc.target/i386/sse-recip.c: Ditto. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@125756 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/ChangeLog63
-rw-r--r--gcc/config/i386/i386-protos.h2
-rw-r--r--gcc/config/i386/i386.c139
-rw-r--r--gcc/config/i386/i386.h1
-rw-r--r--gcc/config/i386/i386.md50
-rw-r--r--gcc/config/i386/i386.opt4
-rw-r--r--gcc/config/i386/sse.md46
-rw-r--r--gcc/doc/invoke.texi9
-rw-r--r--gcc/doc/tm.texi9
-rw-r--r--gcc/hooks.c10
-rw-r--r--gcc/hooks.h1
-rw-r--r--gcc/passes.c1
-rw-r--r--gcc/target-def.h10
-rw-r--r--gcc/target.h4
-rw-r--r--gcc/testsuite/ChangeLog9
-rw-r--r--gcc/testsuite/gcc.target/i386/recip-divf.c9
-rw-r--r--gcc/testsuite/gcc.target/i386/recip-sqrtf.c21
-rw-r--r--gcc/testsuite/gcc.target/i386/recip-vec-divf.c16
-rw-r--r--gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c34
-rw-r--r--gcc/testsuite/gcc.target/i386/sse-recip.c51
-rw-r--r--gcc/tree-pass.h1
-rw-r--r--gcc/tree-ssa-math-opts.c125
22 files changed, 604 insertions, 11 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index abe28d49f45..f832b77e9a5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,66 @@
+2007-06-16 Uros Bizjak <ubizjak@gmail.com>
+
+ PR middle-end/31723
+ * hooks.c (hook_tree_tree_bool_null): New hook.
+ * hooks.h (hook_tree_tree_bool_null): Add prototype.
+ * tree-pass.h (pass_convert_to_rsqrt): Declare.
+ * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt.
+ * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b)
+ and convert it to reciprocal a*rfunc(b).
+ (execute_convert_to_rsqrt): New function.
+ (gate_convert_to_rsqrt): New function.
+ (pass_convert_to_rsqrt): New pass definition.
+ * target.h (struct gcc_target): Add builtin_reciprocal.
+ * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define.
+ (TARGET_INITIALIZER): Initialize builtin_reciprocal with
+ TARGET_BUILTIN_RECIPROCAL.
+ * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document.
+
+ * config/i386/i386.h (TARGET_RECIP): New define.
+ * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf
+ for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+ flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+ and not optimizing for size.
+ (*rcpsf2_sse): New insn pattern.
+ (*rsqrtsf2_sse): Ditto.
+ (rsqrtsf2): New expander. Expand by calling ix86_emit_swsqrtsf
+ for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+ flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+ and not optimizing for size.
+ (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf
+ for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+ flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+ and not optimizing for size.
+ * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf
+ for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+ flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+ and not optimizing for size.
+ (*sse_rsqrtv4sf2): Do not export.
+ (sqrtv4sf2): Ditto.
+ (sse_rsqrtv4sf2): New expander. Expand by calling ix86_emit_swsqrtsf
+ for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+ flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+ and not optimizing for size.
+ (sqrtv4sf2): Ditto.
+ * config/i386/i386.opt (mrecip): New option.
+ * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare.
+ (ix86_emit_swsqrtsf): Ditto.
+ * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant.
+ (ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New
+ builtin definition.
+ (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using
+ ix86_expand_unop1_builtin.
+ (ix86_emit_swdivsf): New function.
+ (ix86_emit_swsqrtsf): Ditto.
+ (ix86_builtin_reciprocal): New function.
+ (TARGET_BUILTIN_RECIPROCAL): Use it.
+ (ix86_vectorize_builtin_conversion): Rename from
+ ix86_builtin_conversion.
+ (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function.
+ * doc/invoke.texi (Machine Dependent Options): Add -mrecip to
+ "i386 and x86_64 Options" section.
+ (Intel 386 and AMD x86_64 Options): Document -mrecip.
+
2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com>
Zdenek Dvorak <dvorakz@suse.cz>
Richard Guenther <rguenther@suse.de>
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 60b495582aa..a0eab4852a8 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -163,6 +163,8 @@ extern void x86_emit_floatuns (rtx [2]);
extern void ix86_emit_fp_unordered_jump (rtx);
extern void ix86_emit_i387_log1p (rtx, rtx);
+extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode);
+extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool);
extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f9e67aa4af3..f4ae18bc353 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -16450,6 +16450,7 @@ enum ix86_builtins
IX86_BUILTIN_RCPSS,
IX86_BUILTIN_RSQRTPS,
IX86_BUILTIN_RSQRTSS,
+ IX86_BUILTIN_RSQRTF,
IX86_BUILTIN_SQRTPS,
IX86_BUILTIN_SQRTSS,
@@ -18039,6 +18040,10 @@ ix86_init_mmx_sse_builtins (void)
def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
+ ftype = build_function_type_list (float_type_node,
+ float_type_node,
+ NULL_TREE);
+ def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
@@ -19133,6 +19138,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
emit_insn (pat);
return 0;
+ case IX86_BUILTIN_RSQRTF:
+ return ix86_expand_unop1_builtin (CODE_FOR_rsqrtsf2, exp, target);
+
case IX86_BUILTIN_SQRTSS:
return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target);
case IX86_BUILTIN_RSQRTSS:
@@ -19869,7 +19877,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
input vector of type TYPE, or NULL_TREE if it is not available. */
static tree
-ix86_builtin_conversion (unsigned int code, tree type)
+ix86_vectorize_builtin_conversion (unsigned int code, tree type)
{
if (TREE_CODE (type) != VECTOR_TYPE)
return NULL_TREE;
@@ -19899,6 +19907,32 @@ ix86_builtin_conversion (unsigned int code, tree type)
}
}
+/* Returns the decl of a target-specific builtin that implements the
+ reciprocal of the function with code CODE, or NULL_TREE if not available. */
+
+static tree
+ix86_builtin_reciprocal (unsigned int code, bool sqrt ATTRIBUTE_UNUSED)
+{
+ if (! (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations))
+ return NULL_TREE;
+
+ switch (code)
+ {
+ /* Sqrt to rsqrt conversion. */
+ case BUILT_IN_SQRTF:
+ return ix86_builtins[IX86_BUILTIN_RSQRTF];
+
+ /* Vectorized version of sqrt to rsqrt conversion. */
+ case IX86_BUILTIN_SQRTPS:
+ return ix86_builtins[IX86_BUILTIN_RSQRTPS];
+
+ default:
+ return NULL_TREE;
+ }
+}
+
/* Store OPERAND to the memory after reload is completed. This means
that we can't easily use assign_stack_local. */
rtx
@@ -22501,6 +22535,100 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
emit_label (label2);
}
+/* Output code to perform a Newton-Raphson approximation of a single precision
+ floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
+
+void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
+{
+ rtx x0, x1, e0, e1, two;
+
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ e1 = gen_reg_rtx (mode);
+ x1 = gen_reg_rtx (mode);
+
+ two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
+
+ if (VECTOR_MODE_P (mode))
+ two = ix86_build_const_vector (SFmode, true, two);
+
+ two = force_reg (mode, two);
+
+ /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */
+
+ /* x0 = 1./b estimate */
+ emit_insn (gen_rtx_SET (VOIDmode, x0,
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ UNSPEC_RCP)));
+ /* e0 = x0 * b */
+ emit_insn (gen_rtx_SET (VOIDmode, e0,
+ gen_rtx_MULT (mode, x0, b)));
+ /* e1 = 2. - e0 */
+ emit_insn (gen_rtx_SET (VOIDmode, e1,
+ gen_rtx_MINUS (mode, two, e0)));
+ /* x1 = x0 * e1 */
+ emit_insn (gen_rtx_SET (VOIDmode, x1,
+ gen_rtx_MULT (mode, x0, e1)));
+ /* res = a * x1 */
+ emit_insn (gen_rtx_SET (VOIDmode, res,
+ gen_rtx_MULT (mode, a, x1)));
+}
+
+/* Output code to perform a Newton-Raphson approximation of a
+ single precision floating point [reciprocal] square root. */
+
+void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
+ bool recip)
+{
+ rtx x0, e0, e1, e2, e3, three, half;
+
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ e1 = gen_reg_rtx (mode);
+ e2 = gen_reg_rtx (mode);
+ e3 = gen_reg_rtx (mode);
+
+ three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode);
+ half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode);
+
+ if (VECTOR_MODE_P (mode))
+ {
+ three = ix86_build_const_vector (SFmode, true, three);
+ half = ix86_build_const_vector (SFmode, true, half);
+ }
+
+ three = force_reg (mode, three);
+ half = force_reg (mode, half);
+
+ /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))
+ 1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */
+
+ /* x0 = 1./sqrt(a) estimate */
+ emit_insn (gen_rtx_SET (VOIDmode, x0,
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+ UNSPEC_RSQRT)));
+ /* e0 = x0 * a */
+ emit_insn (gen_rtx_SET (VOIDmode, e0,
+ gen_rtx_MULT (mode, x0, a)));
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (VOIDmode, e1,
+ gen_rtx_MULT (mode, e0, x0)));
+ /* e2 = 3. - e1 */
+ emit_insn (gen_rtx_SET (VOIDmode, e2,
+ gen_rtx_MINUS (mode, three, e1)));
+ if (recip)
+ /* e3 = .5 * x0 */
+ emit_insn (gen_rtx_SET (VOIDmode, e3,
+ gen_rtx_MULT (mode, half, x0)));
+ else
+ /* e3 = .5 * e0 */
+ emit_insn (gen_rtx_SET (VOIDmode, e3,
+ gen_rtx_MULT (mode, half, e0)));
+ /* res = e2 * e3 */
+ emit_insn (gen_rtx_SET (VOIDmode, res,
+ gen_rtx_MULT (mode, e2, e3)));
+}
+
/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
static void ATTRIBUTE_UNUSED
@@ -23205,9 +23333,14 @@ static const struct attribute_spec ix86_attribute_table[] =
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+ ix86_builtin_vectorized_function
+
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
-#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion
+#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
+
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 5089883dca5..ee1fbbc2765 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -365,6 +365,7 @@ extern int x86_prefetch_sse;
#define TARGET_POPCNT x86_popcnt
#define TARGET_PREFETCH_SSE x86_prefetch_sse
#define TARGET_SAHF x86_sahf
+#define TARGET_RECIP x86_recip
#define ASSEMBLER_DIALECT (ix86_asm_dialect)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0ecb9961d85..3f8f97b46cb 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7470,7 +7470,16 @@
(div:SF (match_operand:SF 1 "register_operand" "")
(match_operand:SF 2 "nonimmediate_operand" "")))]
"TARGET_80387 || TARGET_SSE_MATH"
- "")
+{
+ if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ ix86_emit_swdivsf (operands[0], operands[1],
+ operands[2], SFmode);
+ DONE;
+ }
+})
;; Remainder instructions.
@@ -15516,6 +15525,15 @@
(const_string "fop")))
(set_attr "mode" "SF")])
+(define_insn "*rcpsf2_sse"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+ UNSPEC_RCP))]
+ "TARGET_SSE_MATH"
+ "rcpss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "mode" "SF")])
+
(define_insn "*fop_sf_1_sse"
[(set (match_operand:SF 0 "register_operand" "=x")
(match_operator:SF 3 "binary_fp_operator"
@@ -15980,6 +15998,27 @@
(set_attr "athlon_decode" "direct")
(set_attr "amdfam10_decode" "direct")])
+(define_insn "*rsqrtsf2_sse"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+ UNSPEC_RSQRT))]
+ "TARGET_SSE_MATH"
+ "rsqrtss\t{%1, %0|%0, %1}"
+ [(set_attr "type" "sse")
+ (set_attr "mode" "SF")])
+
+(define_expand "rsqrtsf2"
+ [(set (match_operand:SF 0 "register_operand" "=x")
+ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+ UNSPEC_RSQRT))]
+ "TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations"
+{
+ ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1);
+ DONE;
+})
+
(define_insn "*sqrt<mode>2_sse"
[(set (match_operand:SSEMODEF 0 "register_operand" "=x")
(sqrt:SSEMODEF
@@ -15998,6 +16037,15 @@
"TARGET_USE_FANCY_MATH_387
|| (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
{
+ if (<MODE>mode == SFmode
+ && TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0);
+ DONE;
+ }
+
if (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
{
rtx op0 = gen_reg_rtx (XFmode);
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 72b40c93987..1e36d0f0bab 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -258,3 +258,7 @@ Support code generation of popcnt instruction.
msahf
Target Report RejectNegative Var(x86_sahf)
Support code generation of sahf instruction in 64bit x86-64 code.
+
+mrecip
+Target Report RejectNegative Var(x86_recip)
+Generate reciprocals instead of divss and sqrtss.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c74c0f7fe69..65abbcf3b69 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -450,7 +450,18 @@
(div:V4SF (match_operand:V4SF 1 "register_operand" "")
(match_operand:V4SF 2 "nonimmediate_operand" "")))]
"TARGET_SSE"
- "ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);")
+{
+ ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);
+
+ if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ ix86_emit_swdivsf (operands[0], operands[1],
+ operands[2], V4SFmode);
+ DONE;
+ }
+})
(define_insn "*divv4sf3"
[(set (match_operand:V4SF 0 "register_operand" "=x")
@@ -494,7 +505,7 @@
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
-(define_insn "sse_rsqrtv4sf2"
+(define_insn "*sse_rsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(unspec:V4SF
[(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
@@ -503,6 +514,21 @@
[(set_attr "type" "sse")
(set_attr "mode" "V4SF")])
+(define_expand "sse_rsqrtv4sf2"
+ [(set (match_operand:V4SF 0 "register_operand" "")
+ (unspec:V4SF
+ [(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))]
+ "TARGET_SSE"
+{
+ if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 1);
+ DONE;
+ }
+})
+
(define_insn "sse_vmrsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(vec_merge:V4SF
@@ -515,7 +541,7 @@
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
-(define_insn "sqrtv4sf2"
+(define_insn "*sqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
"TARGET_SSE"
@@ -523,6 +549,20 @@
[(set_attr "type" "sse")
(set_attr "mode" "V4SF")])
+(define_expand "sqrtv4sf2"
+ [(set (match_operand:V4SF 0 "register_operand" "=")
+ (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))]
+ "TARGET_SSE"
+{
+ if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 0);
+ DONE;
+ }
+})
+
(define_insn "sse_vmsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(vec_merge:V4SF
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index dddb37da84e..d12a6275735 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -548,7 +548,7 @@ Objective-C and Objective-C++ Dialects}.
-masm=@var{dialect} -mno-fancy-math-387 @gol
-mno-fp-ret-in-387 -msoft-float @gol
-mno-wide-multiply -mrtd -malign-double @gol
--mpreferred-stack-boundary=@var{num} -mcx16 -msahf @gol
+-mpreferred-stack-boundary=@var{num} -mcx16 -msahf -mrecip @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 @gol
-msse4a -m3dnow -mpopcnt -mabm @gol
-mthreads -mno-align-stringops -minline-all-stringops @gol
@@ -10346,6 +10346,13 @@ SAHF are load and store instructions, respectively, for certain status flags.
In 64-bit mode, SAHF instruction is used to optimize @code{fmod}, @code{drem}
or @code{remainder} built-in functions: see @ref{Other Builtins} for details.
+@item -mrecip
+@opindex mrecip
+This option enables GCC to use the RCPSS and RSQRTSS instructions (and their
+vectorized variants RCPPS and RSQRTPS) instead of DIVSS and SQRTSS (and their
+vectorized variants). They are generated only when @option{-funsafe-math-optimizations}
+is enabled together with @option{-ffinite-math-only} and @option{-fno-trapping-math}.
+
@item -mpush-args
@itemx -mno-push-args
@opindex mpush-args
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 692082728c1..cd7ae6bdfae 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5345,6 +5345,15 @@ of @var{x}.
The default version returns false for all constants.
@end deftypefn
+@deftypefn {Target Hook} tree TARGET_BUILTIN_RECIPROCAL (unsigned int @var{code}, bool @var{sqrt})
+This hook should return the DECL of a function that implements reciprocal of
+the builtin function with builtin function code @var{code}, or
+@code{NULL_TREE} if such a function is not available. When @var{sqrt} is
+true, additional optimizations that apply only to the reciprocal of a square
+root function are performed, and only reciprocals of @code{sqrt} function
+are valid.
+@end deftypefn
+
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD (void)
This hook should return the DECL of a function @var{f} that given an
address @var{addr} as an argument returns a mask @var{m} that can be
diff --git a/gcc/hooks.c b/gcc/hooks.c
index 18b17dc85f2..4c57a1687ee 100644
--- a/gcc/hooks.c
+++ b/gcc/hooks.c
@@ -266,7 +266,15 @@ hook_constcharptr_tree_null (tree t ATTRIBUTE_UNUSED)
}
tree
-hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, tree t1 ATTRIBUTE_UNUSED,
+hook_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
+ bool ignore ATTRIBUTE_UNUSED)
+{
+ return NULL;
+}
+
+tree
+hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
+ tree t1 ATTRIBUTE_UNUSED,
bool ignore ATTRIBUTE_UNUSED)
{
return NULL;
diff --git a/gcc/hooks.h b/gcc/hooks.h
index 02664c12803..15efef7ef22 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -58,6 +58,7 @@ extern int hook_int_void_no_regs (void);
extern tree hook_tree_tree_tree_null (tree, tree);
extern tree hook_tree_tree_tree_tree_3rd_identity (tree, tree, tree);
+extern tree hook_tree_tree_bool_null (tree, bool);
extern tree hook_tree_tree_tree_bool_null (tree, tree, bool);
extern unsigned hook_uint_uint_constcharptrptr_0 (unsigned, const char **);
diff --git a/gcc/passes.c b/gcc/passes.c
index c4c94ff8806..c954847b82d 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -647,6 +647,7 @@ init_optimization_passes (void)
NEXT_PASS (pass_tree_loop_done);
}
NEXT_PASS (pass_cse_reciprocals);
+ NEXT_PASS (pass_convert_to_rsqrt);
NEXT_PASS (pass_reassoc);
NEXT_PASS (pass_vrp);
NEXT_PASS (pass_dominator);
diff --git a/gcc/target-def.h b/gcc/target-def.h
index 84532a6a04b..8acaa19bead 100644
--- a/gcc/target-def.h
+++ b/gcc/target-def.h
@@ -350,8 +350,10 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
TARGET_SCHED_SET_SCHED_FLAGS}
#define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION default_builtin_vectorized_function
-#define TARGET_VECTORIZE_BUILTIN_CONVERSION default_builtin_vectorized_conversion
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+ default_builtin_vectorized_function
+#define TARGET_VECTORIZE_BUILTIN_CONVERSION \
+ default_builtin_vectorized_conversion
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN 0
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD 0
@@ -385,6 +387,9 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define TARGET_RESOLVE_OVERLOADED_BUILTIN NULL
#define TARGET_FOLD_BUILTIN hook_tree_tree_tree_bool_null
+/* In tree-ssa-math-opts.c */
+#define TARGET_BUILTIN_RECIPROCAL hook_tree_tree_bool_null
+
/* In varasm.c. */
#ifndef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS default_section_type_flags
@@ -668,6 +673,7 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
TARGET_EXPAND_BUILTIN, \
TARGET_RESOLVE_OVERLOADED_BUILTIN, \
TARGET_FOLD_BUILTIN, \
+ TARGET_BUILTIN_RECIPROCAL, \
TARGET_MANGLE_FUNDAMENTAL_TYPE, \
TARGET_INIT_LIBFUNCS, \
TARGET_SECTION_TYPE_FLAGS, \
diff --git a/gcc/target.h b/gcc/target.h
index f769ae0938a..2d446a121dc 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -483,6 +483,10 @@ struct gcc_target
/* Fold a target-specific builtin. */
tree (* fold_builtin) (tree fndecl, tree arglist, bool ignore);
+ /* Returns the decl of a target-specific builtin that implements the
+ reciprocal of the function, or NULL_TREE if not available. */
+ tree (* builtin_reciprocal) (unsigned, bool);
+
/* For a vendor-specific fundamental TYPE, return a pointer to
a statically-allocated string containing the C++ mangling for
TYPE. In all other cases, return NULL. */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index ba4707a541b..cc98594c016 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,12 @@
+2007-06-16 Uros Bizjak <ubizjak@gmail.com>
+
+ PR middle-end/31723
+ * gcc.target/i386/recip-divf.c: New test.
+ * gcc.target/i386/recip-sqrtf.c: Ditto.
+ * gcc.target/i386/recip-vec-divf.c: Ditto.
+ * gcc.target/i386/recip-vec-sqrtf.c: Ditto.
+ * gcc.target/i386/sse-recip.c: Ditto.
+
2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com>
PR tree-opt/32225
diff --git a/gcc/testsuite/gcc.target/i386/recip-divf.c b/gcc/testsuite/gcc.target/i386/recip-divf.c
new file mode 100644
index 00000000000..0a2e9c8bd81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-divf.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
+
+float t1(float a, float b)
+{
+ return a / b;
+}
+
+/* { dg-final { scan-assembler "rcpss" } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-sqrtf.c
new file mode 100644
index 00000000000..c387077aae2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-sqrtf.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
+
+extern float sqrtf (float);
+
+float t1(float a, float b)
+{
+ return a/sqrtf(b);
+}
+
+float t2(float x, float a, float b)
+{
+ return sqrtf(a/b);
+}
+
+float t3(float a)
+{
+ return sqrtf(a);
+}
+
+/* { dg-final { scan-assembler-times "rsqrtss" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf.c b/gcc/testsuite/gcc.target/i386/recip-vec-divf.c
new file mode 100644
index 00000000000..bf41e6c4fde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
+
+float a[16];
+float b[16];
+float r[16];
+
+void t1(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ r[i] = a[i] / b[i];
+}
+
+/* { dg-final { scan-assembler "rcpps" } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c
new file mode 100644
index 00000000000..2eb3f861f42
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c
@@ -0,0 +1,34 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
+
+float a[16];
+float b[16];
+float r[16];
+
+extern float sqrtf (float);
+
+void t1(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ r[i] = a[i] / sqrtf (b[i]);
+}
+
+void t2(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ r[i] = sqrtf (a[i] / b[i]);
+}
+
+void t3(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ r[i] = sqrtf (a[i]);
+}
+
+/* { dg-final { scan-assembler-times "rsqrtps" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse-recip.c b/gcc/testsuite/gcc.target/i386/sse-recip.c
new file mode 100644
index 00000000000..2d7dff91452
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-recip.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -msse -mfpmath=sse -mrecip" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern float sqrtf (float);
+extern void abort (void);
+
+#define N 8
+
+int __attribute__((noinline))
+main1 ()
+{
+ float a[N] = { 0.f, 18.f, 108.f, 324.f, 720.f, 1944.f, 3087.f, 5832.f };
+ float b[N] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
+ float r[N];
+
+ float rc[N] = { 0.f, 3.f, 6.f, 9.f, 12.f, 18.f, 21.f, 27.f };
+
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ r[i] = sqrtf (a[i] / b[i]);
+ }
+
+ /* check results: */
+ for (i = 0; i < N; i++)
+ {
+ if (r[i] != rc[i])
+ abort();
+ }
+
+ return 0;
+}
+
+int
+main ()
+{
+ unsigned long cpu_facilities;
+
+ cpu_facilities = i386_cpuid ();
+
+ if ((cpu_facilities & (bit_MMX | bit_SSE | bit_CMOV))
+ != (bit_MMX | bit_SSE | bit_CMOV))
+ /* If host has no vector support, pass. */
+ return 0;
+
+ main1 ();
+ return 0;
+}
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 6800edfdbc8..333ec41b853 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -293,6 +293,7 @@ extern struct tree_opt_pass pass_early_warn_uninitialized;
extern struct tree_opt_pass pass_late_warn_uninitialized;
extern struct tree_opt_pass pass_cse_reciprocals;
extern struct tree_opt_pass pass_cse_sincos;
+extern struct tree_opt_pass pass_convert_to_rsqrt;
extern struct tree_opt_pass pass_warn_function_return;
extern struct tree_opt_pass pass_warn_function_noreturn;
extern struct tree_opt_pass pass_phiopt;
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index fe67993f8dc..0534dcf2f90 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -496,6 +496,46 @@ execute_cse_reciprocals (void)
&& TREE_CODE (def) == SSA_NAME)
execute_cse_reciprocals_1 (&bsi, def);
}
+
+ /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
+ for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ {
+ tree stmt = bsi_stmt (bsi);
+ tree fndecl;
+
+ if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
+ && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == RDIV_EXPR)
+ {
+ tree arg1 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt, 1), 1);
+ tree stmt1 = SSA_NAME_DEF_STMT (arg1);
+
+ if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
+ && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == CALL_EXPR
+ && (fndecl
+ = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt1, 1)))
+ && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+ || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
+ {
+ enum built_in_function code;
+ tree arg10;
+ tree tmp;
+
+ code = DECL_FUNCTION_CODE (fndecl);
+ fndecl = targetm.builtin_reciprocal (code, false);
+ if (!fndecl)
+ continue;
+
+ arg10 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
+ tmp = build_call_expr (fndecl, 1, arg10);
+ GIMPLE_STMT_OPERAND (stmt1, 1) = tmp;
+ update_stmt (stmt1);
+
+ TREE_SET_CODE (GIMPLE_STMT_OPERAND (stmt, 1), MULT_EXPR);
+ fold_stmt_inplace (stmt);
+ update_stmt (stmt);
+ }
+ }
+ }
}
free_dominance_info (CDI_DOMINATORS);
@@ -726,3 +766,88 @@ struct tree_opt_pass pass_cse_sincos =
| TODO_verify_stmts, /* todo_flags_finish */
0 /* letter */
};
+
+/* Find all expressions in the form of sqrt(a/b) and
+ convert them to rsqrt(b/a). */
+
+static unsigned int
+execute_convert_to_rsqrt (void)
+{
+ basic_block bb;
+
+ FOR_EACH_BB (bb)
+ {
+ block_stmt_iterator bsi;
+
+ for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+ {
+ tree stmt = bsi_stmt (bsi);
+ tree fndecl;
+
+ if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
+ && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == CALL_EXPR
+ && (fndecl = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt, 1)))
+ && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+ || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
+ {
+ enum built_in_function code;
+ tree arg1;
+ tree stmt1;
+
+ code = DECL_FUNCTION_CODE (fndecl);
+ fndecl = targetm.builtin_reciprocal (code, true);
+ if (!fndecl)
+ continue;
+
+ arg1 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt, 1), 0);
+ stmt1 = SSA_NAME_DEF_STMT (arg1);
+
+ if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
+ && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == RDIV_EXPR)
+ {
+ tree arg10, arg11;
+ tree tmp;
+
+ arg10 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
+ arg11 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1);
+
+ /* Swap operands of RDIV_EXPR. */
+ TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0) = arg11;
+ TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1) = arg10;
+ fold_stmt_inplace (stmt1);
+ update_stmt (stmt1);
+
+ tmp = build_call_expr (fndecl, 1, arg1);
+ GIMPLE_STMT_OPERAND (stmt, 1) = tmp;
+ update_stmt (stmt);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static bool
+gate_convert_to_rsqrt (void)
+{
+ return flag_unsafe_math_optimizations && optimize;
+}
+
+struct tree_opt_pass pass_convert_to_rsqrt =
+{
+ "rsqrt", /* name */
+ gate_convert_to_rsqrt, /* gate */
+ execute_convert_to_rsqrt, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ 0, /* tv_id */
+ PROP_ssa, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
+ | TODO_verify_stmts, /* todo_flags_finish */
+ 0 /* letter */
+};