summaryrefslogtreecommitdiff
path: root/gcc/config/gcn/gcn-valu.md
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/gcn/gcn-valu.md')
-rw-r--r--gcc/config/gcn/gcn-valu.md223
1 files changed, 128 insertions, 95 deletions
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index c0b43fcfb64..3628e190eb1 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -15,6 +15,7 @@
;; <http://www.gnu.org/licenses/>.
;; {{{ Vector iterators
+; SV iterators include both scalar and vector modes.
; Vector modes for specific types
(define_mode_iterator V_QI
@@ -126,6 +127,15 @@
V32SI V32DI
V64SI V64DI])
+(define_mode_iterator SV_SFDF
+ [SF DF
+ V2SF V2DF
+ V4SF V4DF
+ V8SF V8DF
+ V16SF V16DF
+ V32SF V32DF
+ V64SF V64DF])
+
; All of above
(define_mode_iterator V_ALL
[V2QI V2HI V2HF V2SI V2SF V2DI V2DF
@@ -156,9 +166,19 @@
V16HF V16SF V16DF
V32HF V32SF V32DF
V64HF V64SF V64DF])
+(define_mode_iterator SV_FP
+ [HF SF DF
+ V2HF V2SF V2DF
+ V4HF V4SF V4DF
+ V8HF V8SF V8DF
+ V16HF V16SF V16DF
+ V32HF V32SF V32DF
+ V64HF V64SF V64DF])
(define_mode_attr scalar_mode
- [(V2QI "qi") (V2HI "hi") (V2SI "si")
+ [(QI "qi") (HI "hi") (SI "si")
+ (HF "hf") (SF "sf") (DI "di") (DF "df")
+ (V2QI "qi") (V2HI "hi") (V2SI "si")
(V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
(V4QI "qi") (V4HI "hi") (V4SI "si")
(V4HF "hf") (V4SF "sf") (V4DI "di") (V4DF "df")
@@ -172,7 +192,9 @@
(V64HF "hf") (V64SF "sf") (V64DI "di") (V64DF "df")])
(define_mode_attr SCALAR_MODE
- [(V2QI "QI") (V2HI "HI") (V2SI "SI")
+ [(QI "QI") (HI "HI") (SI "SI")
+ (HF "HF") (SF "SF") (DI "DI") (DF "DF")
+ (V2QI "QI") (V2HI "HI") (V2SI "SI")
(V2HF "HF") (V2SF "SF") (V2DI "DI") (V2DF "DF")
(V4QI "QI") (V4HI "HI") (V4SI "SI")
(V4HF "HF") (V4SF "SF") (V4DI "DI") (V4DF "DF")
@@ -3247,113 +3269,124 @@
;; {{{ FP division
(define_insn "recip<mode>2<exec>"
- [(set (match_operand:V_FP 0 "register_operand" "= v")
- (unspec:V_FP
- [(match_operand:V_FP 1 "gcn_alu_operand" "vSvB")]
+ [(set (match_operand:SV_FP 0 "register_operand" "= v")
+ (unspec:SV_FP
+ [(match_operand:SV_FP 1 "gcn_alu_operand" "vSvB")]
UNSPEC_RCP))]
""
"v_rcp%i0\t%0, %1"
[(set_attr "type" "vop1")
(set_attr "length" "8")])
-(define_insn "recip<mode>2"
- [(set (match_operand:FP 0 "register_operand" "= v")
- (unspec:FP
- [(match_operand:FP 1 "gcn_alu_operand" "vSvB")]
- UNSPEC_RCP))]
- ""
- "v_rcp%i0\t%0, %1"
- [(set_attr "type" "vop1")
+;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
+;; one that matches op3 adjusted for best results in reciprocal division.
+;; It also emits a VCC mask that is intended for input to v_div_fmas.
+;; The caller is expected to call this twice, once for each input. The output
+;; VCC is the same in both cases, so the caller may discard one.
+(define_insn "div_scale<mode><exec_vcc>"
+ [(set (match_operand:SV_SFDF 0 "register_operand" "=v")
+ (unspec:SV_SFDF
+ [(match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+ (match_operand:SV_SFDF 2 "gcn_alu_operand" "v")
+ (match_operand:SV_SFDF 3 "gcn_alu_operand" "v")]
+ UNSPEC_DIV_SCALE))
+ (set (match_operand:DI 4 "register_operand" "=SvcV")
+ (unspec:DI
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+ UNSPEC_DIV_SCALE))]
+ ""
+ "v_div_scale%i0\t%0, %4, %3, %1, %2"
+ [(set_attr "type" "vop3b")
(set_attr "length" "8")])
-;; Do division via a = b * 1/c
-;; The v_rcp_* instructions are not sufficiently accurate on their own,
-;; so we use 2 v_fma_* instructions to do one round of Newton-Raphson
-;; which the ISA manual says is enough to improve the reciprocal accuracy.
-;;
-;; FIXME: This does not handle denormals, NaNs, division-by-zero etc.
+;; v_div_fmas is "FMA and Scale" that uses the VCC output from v_div_scale
+;; to conditionally scale the output of the whole division operation.
+;; This is necessary to counter the adjustments made by v_div_scale and
+;; replaces the last FMA instruction of the Newton Raphson algorithm.
+(define_insn "div_fmas<mode><exec>"
+ [(set (match_operand:SV_SFDF 0 "register_operand" "=v")
+ (unspec:SV_SFDF
+ [(plus:SV_SFDF
+ (mult:SV_SFDF
+ (match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
+ (match_operand:SV_SFDF 2 "gcn_alu_operand" "v"))
+ (match_operand:SV_SFDF 3 "gcn_alu_operand" "v"))
+ (match_operand:DI 4 "register_operand" "cV")]
+ UNSPEC_DIV_FMAS))]
+ ""
+ "v_div_fmas%i0\t%0, %1, %2, %3; %4"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")
+ (set_attr "vccwait" "5")])
+
+;; v_div_fixup takes the inputs and outputs of a division operation already
+;; completed and cleans up the floating-point sign bit, infinity, underflow,
+;; overflow, and NaN status. It will also emit any FP exceptions.
+;; op1: quotient, op2: denominator, op3: numerator
+(define_insn "div_fixup<mode><exec>"
+ [(set (match_operand:SV_FP 0 "register_operand" "=v")
+ (unspec:SV_FP
+ [(match_operand:SV_FP 1 "register_operand" "v")
+ (match_operand:SV_FP 2 "gcn_alu_operand" "v")
+ (match_operand:SV_FP 3 "gcn_alu_operand" "v")]
+ UNSPEC_DIV_FIXUP))]
+ ""
+ "v_div_fixup%i0\t%0, %1, %2, %3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
(define_expand "div<mode>3"
- [(match_operand:V_FP 0 "gcn_valu_dst_operand")
- (match_operand:V_FP 1 "gcn_valu_src0_operand")
- (match_operand:V_FP 2 "gcn_valu_src0_operand")]
- "flag_reciprocal_math"
+ [(match_operand:SV_SFDF 0 "register_operand")
+ (match_operand:SV_SFDF 1 "gcn_alu_operand")
+ (match_operand:SV_SFDF 2 "gcn_alu_operand")]
+ ""
{
+ rtx numerator = operands[1];
+ rtx denominator = operands[2];
+
+ /* Scale the inputs if they are close to the FP limits.
+ This will be reversed later. */
+ rtx vcc = gen_reg_rtx (DImode);
+ rtx discardedvcc = gen_reg_rtx (DImode);
+ rtx scaled_numerator = gen_reg_rtx (<MODE>mode);
+ rtx scaled_denominator = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_div_scale<mode> (scaled_denominator,
+ denominator, numerator,
+ denominator, discardedvcc));
+ emit_insn (gen_div_scale<mode> (scaled_numerator,
+ denominator, numerator,
+ numerator, vcc));
+
+ /* Find the reciprocal of the denominator, and use Newton-Raphson to
+ improve the accuracy over the basic hardware instruction. */
rtx one = gcn_vec_constant (<MODE>mode,
const_double_from_real_value (dconst1, <SCALAR_MODE>mode));
rtx initrcp = gen_reg_rtx (<MODE>mode);
- rtx fma = gen_reg_rtx (<MODE>mode);
- rtx rcp;
- rtx num = operands[1], denom = operands[2];
-
- bool is_rcp = (GET_CODE (num) == CONST_VECTOR
- && real_identical
- (CONST_DOUBLE_REAL_VALUE
- (CONST_VECTOR_ELT (num, 0)), &dconstm1));
-
- if (is_rcp)
- rcp = operands[0];
- else
- rcp = gen_reg_rtx (<MODE>mode);
-
- emit_insn (gen_recip<mode>2 (initrcp, denom));
- emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
- emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
- if (!is_rcp)
- {
- rtx div_est = gen_reg_rtx (<MODE>mode);
- rtx fma2 = gen_reg_rtx (<MODE>mode);
- rtx fma3 = gen_reg_rtx (<MODE>mode);
- rtx fma4 = gen_reg_rtx (<MODE>mode);
- emit_insn (gen_mul<mode>3 (div_est, num, rcp));
- emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
- emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
- emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
- emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
- }
-
- DONE;
- })
-
-(define_expand "div<mode>3"
- [(match_operand:FP 0 "gcn_valu_dst_operand")
- (match_operand:FP 1 "gcn_valu_src0_operand")
- (match_operand:FP 2 "gcn_valu_src0_operand")]
- "flag_reciprocal_math"
- {
- rtx one = const_double_from_real_value (dconst1, <MODE>mode);
- rtx initrcp = gen_reg_rtx (<MODE>mode);
- rtx fma = gen_reg_rtx (<MODE>mode);
- rtx rcp;
- rtx num = operands[1], denom = operands[2];
-
- bool is_rcp = (GET_CODE (operands[1]) == CONST_DOUBLE
- && real_identical (CONST_DOUBLE_REAL_VALUE (operands[1]),
- &dconstm1));
-
- if (is_rcp)
- rcp = operands[0];
- else
- rcp = gen_reg_rtx (<MODE>mode);
-
- emit_insn (gen_recip<mode>2 (initrcp, denom));
- emit_insn (gen_fma<mode>4_negop2 (fma, initrcp, denom, one));
- emit_insn (gen_fma<mode>4 (rcp, fma, initrcp, initrcp));
-
- if (!is_rcp)
- {
- rtx div_est = gen_reg_rtx (<MODE>mode);
- rtx fma2 = gen_reg_rtx (<MODE>mode);
- rtx fma3 = gen_reg_rtx (<MODE>mode);
- rtx fma4 = gen_reg_rtx (<MODE>mode);
- emit_insn (gen_mul<mode>3 (div_est, num, rcp));
- emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, denom, num));
- emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
- emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, denom, num));
- emit_insn (gen_fma<mode>4 (operands[0], fma4, rcp, fma3));
- }
-
+ rtx fma1 = gen_reg_rtx (<MODE>mode);
+ rtx rcp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_recip<mode>2 (initrcp, scaled_denominator));
+ emit_insn (gen_fma<mode>4_negop2 (fma1, initrcp, scaled_denominator, one));
+ emit_insn (gen_fma<mode>4 (rcp, fma1, initrcp, initrcp));
+
+ /* Do the division "a/b" via "a*1/b" and use Newton-Raphson to improve
+ the accuracy. The "div_fmas" instruction reverses any scaling
+ performed by "div_scale", above. */
+ rtx div_est = gen_reg_rtx (<MODE>mode);
+ rtx fma2 = gen_reg_rtx (<MODE>mode);
+ rtx fma3 = gen_reg_rtx (<MODE>mode);
+ rtx fma4 = gen_reg_rtx (<MODE>mode);
+ rtx fmas = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (div_est, scaled_numerator, rcp));
+ emit_insn (gen_fma<mode>4_negop2 (fma2, div_est, scaled_denominator,
+ scaled_numerator));
+ emit_insn (gen_fma<mode>4 (fma3, fma2, rcp, div_est));
+ emit_insn (gen_fma<mode>4_negop2 (fma4, fma3, scaled_denominator,
+ scaled_numerator));
+ emit_insn (gen_div_fmas<mode> (fmas, fma4, rcp, fma3, vcc));
+
+ /* Finally, use "div_fixup" to get the details right and find errors. */
+ emit_insn (gen_div_fixup<mode> (operands[0], fmas, denominator,
+ numerator));
DONE;
})