summary | refs | log | tree | commit | diff
path: root/compiler/GHC/CmmToAsm/X86
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/GHC/CmmToAsm/X86')
-rw-r--r-- compiler/GHC/CmmToAsm/X86/CodeGen.hs 92
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Instr.hs 22
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Ppr.hs 23
3 files changed, 123 insertions, 14 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index d6ef821c9f..859b27e248 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -901,14 +901,10 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
MO_U_Lt _ -> condIntReg LU x y
MO_U_Le _ -> condIntReg LEU x y
- MO_F_Add w -> trivialFCode_sse2 w ADD x y
-
- MO_F_Sub w -> trivialFCode_sse2 w SUB x y
-
- MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
-
- MO_F_Mul w -> trivialFCode_sse2 w MUL x y
-
+ MO_F_Add w -> trivialFCode_sse2 w ADD x y
+ MO_F_Sub w -> trivialFCode_sse2 w SUB x y
+ MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
+ MO_F_Mul w -> trivialFCode_sse2 w MUL x y
MO_Add rep -> add_code rep x y
MO_Sub rep -> sub_code rep x y
@@ -1113,6 +1109,13 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
return (Fixed format result code)
+getRegister' _plat _is32Bit (CmmMachOp mop [x, y, z]) = -- ternary MachOps
+  case mop of
+      -- Fused multiply-add family, i.e. expressions of the shape @± x*y ± z@.
+      MO_FMA signs width -> genFMA3Code width signs x y z
+
+      _ -> pprPanic "getRegister(x86) - ternary CmmMachOp (1)" $
+             pprMachOp mop
getRegister' _ _ (CmmLoad mem pk _)
| isFloatType pk
@@ -3151,12 +3154,12 @@ genTrivialCode rep instr a b = do
a_code <- getAnyReg a
tmp <- getNewRegNat rep
let
- -- We want the value of b to stay alive across the computation of a.
- -- But, we want to calculate a straight into the destination register,
+ -- We want the value of 'b' to stay alive across the computation of 'a'.
+ -- But, we want to calculate 'a' straight into the destination register,
-- because the instruction only has two operands (dst := dst `op` src).
- -- The troublesome case is when the result of b is in the same register
- -- as the destination reg. In this case, we have to save b in a
- -- new temporary across the computation of a.
+ -- The troublesome case is when the result of 'b' is in the same register
+ -- as the destination 'reg'. In this case, we have to save 'b' in a
+ -- new temporary across the computation of 'a'.
code dst
| dst `regClashesWithOp` b_op =
b_code `appOL`
@@ -3174,6 +3177,69 @@ reg `regClashesWithOp` OpReg reg2 = reg == reg2
reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
_ `regClashesWithOp` _ = False
+-- | Generate code for a fused multiply-add operation, of the form @± x * y ± z@,
+-- with 3 operands (FMA3 instruction set).
+--
+-- The result is an 'Any' register: the caller picks the destination, and the
+-- generated code computes @x@ directly into it before issuing the FMA.
+genFMA3Code :: Width
+            -> FMASign
+            -> CmmExpr -> CmmExpr -> CmmExpr -> NatM Register
+genFMA3Code w signs x y z = do
+
+  -- For the FMA instruction, we want to compute x * y + z
+  --
+  -- There are three possible instructions we could emit:
+  --
+  --   - fmadd213 z y x, result in x, z can be a memory address
+  --   - fmadd132 x z y, result in y, x can be a memory address
+  --   - fmadd231 y x z, result in z, y can be a memory address
+  --
+  -- This suggests two possible optimisations:
+  --
+  --   - OPTIMISATION 1
+  --     If one argument is an address, use the instruction that allows
+  --     a memory address in that position.
+  --
+  --   - OPTIMISATION 2
+  --     If one argument is in a fixed register, use the instruction that puts
+  --     the result in that same register.
+  --
+  -- Currently we follow neither of these optimisations,
+  -- opting to always use fmadd213 for simplicity.
+  let rep = floatFormat w
+  (y_reg, y_code) <- getNonClobberedReg y
+  (z_reg, z_code) <- getNonClobberedReg z
+  x_code <- getAnyReg x
+  y_tmp <- getNewRegNat rep
+  z_tmp <- getNewRegNat rep
+  let
+     fma213 = FMA3 rep signs FMA213
+     -- 'x' is computed straight into 'dst' (it is both an input and the
+     -- result of fmadd213). If 'y' and/or 'z' already live in 'dst', they
+     -- must be copied to a fresh temporary before 'x_code dst' overwrites
+     -- them, and the FMA must then read from that temporary.
+     code dst
+         | dst == y_reg
+         , dst == z_reg
+         = y_code `appOL`
+           unitOL (MOV rep (OpReg y_reg) (OpReg y_tmp)) `appOL`
+           z_code `appOL`
+           unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_tmp) y_tmp dst
+         | dst == y_reg
+         = y_code `appOL`
+           -- Save 'y' into 'y_tmp' (not 'z_tmp'): the FMA below reads 'y_tmp'.
+           unitOL (MOV rep (OpReg y_reg) (OpReg y_tmp)) `appOL`
+           z_code `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_reg) y_tmp dst
+         | dst == z_reg
+         = y_code `appOL`
+           z_code `appOL`
+           unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_tmp) y_reg dst
+         | otherwise
+         = y_code `appOL`
+           z_code `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_reg) y_reg dst
+  return (Any rep code)
+
-----------
trivialUCode :: Format -> (Operand -> Instr)
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index ccb3ce09ba..b4e93a1c5d 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -12,6 +12,7 @@ module GHC.CmmToAsm.X86.Instr
( Instr(..)
, Operand(..)
, PrefetchVariant(..)
+ , FMAPermutation(..)
, JumpDest(..)
, getJumpDestBlockId
, canShortcut
@@ -272,6 +273,10 @@ data Instr
| CVTSI2SS Format Operand Reg -- I32/I64 to F32
| CVTSI2SD Format Operand Reg -- I32/I64 to F64
+ -- | FMA3 fused multiply-add operations.
+ | FMA3 Format FMASign FMAPermutation Operand Reg Reg
+ -- src1 (r/m), src2 (r), dst (r)
+
-- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
-- are Operand Reg.
@@ -351,7 +356,7 @@ data Operand
| OpImm Imm -- immediate value
| OpAddr AddrMode -- memory reference
-
+-- | The operand ordering of an FMA3 instruction; each constructor is
+-- printed as the matching @132@ / @213@ / @231@ mnemonic suffix.
+data FMAPermutation
+  = FMA132
+  | FMA213
+  | FMA231
-- | Returns which registers are read and written as a (read, written)
-- pair.
@@ -438,6 +443,8 @@ regUsageOfInstr platform instr
PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ FMA3 _ _ _ src1 src2 dst -> usageFMA src1 src2 dst
+
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
LOCK i -> regUsageOfInstr platform i
@@ -482,6 +489,15 @@ regUsageOfInstr platform instr
usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
+    -- Register usage of the 3 operand FMA instructions: the destination
+    -- is both read and written; the first source is a register or an address.
+    usageFMA :: Operand -> Reg -> Reg -> RegUsage
+    usageFMA src1 src2 dst =
+      case src1 of
+        OpReg r   -> mkRU [r, src2, dst] [dst]
+        OpAddr ea -> mkRU (use_EA ea [src2, dst]) [dst]
+        _         -> panic "X86.RegInfo.usageFMA: no match"
+
-- 1 operand form; operand Modified
usageM :: Operand -> RegUsage
usageM (OpReg reg) = mkRU [reg] [reg]
@@ -561,6 +577,8 @@ patchRegsOfInstr instr env
JMP op regs -> JMP (patchOp op) regs
JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
+    FMA3 fmt var perm x1 x2 x3 -> patch3 (FMA3 fmt var perm) x1 x2 x3
+
-- literally only support storing the top x87 stack value st(0)
X87Store fmt dst -> X87Store fmt (lookupAddr dst)
@@ -612,6 +630,8 @@ patchRegsOfInstr instr env
patch1 insn op = insn $! patchOp op
patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
+    -- Patch an (operand, register, register) instruction, forcing each
+    -- patched argument with '$!' before the instruction is rebuilt.
+    patch3 :: (Operand -> Reg -> Reg -> a) -> Operand -> Reg -> Reg -> a
+    patch3 mk op r1 r2 = withSources $! env r2
+      where withSources = (mk $! patchOp op) $! env r1
patchOp (OpReg reg) = OpReg $! env reg
patchOp (OpImm imm) = OpImm imm
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
index 4a8f55fdf0..0d649f2efb 100644
--- a/compiler/GHC/CmmToAsm/X86/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -838,6 +838,14 @@ pprInstr platform i = case i of
FDIV format op1 op2
-> pprFormatOpOp (text "div") format op1 op2
+ FMA3 format var perm op1 op2 op3
+ -> let mnemo = case var of
+ FMAdd -> text "vfmadd"
+ FMSub -> text "vfmsub"
+ FNMAdd -> text "vfnmadd"
+ FNMSub -> text "vfnmsub"
+ in pprFormatOpRegReg (mnemo <> pprFMAPermutation perm) format op1 op2 op3
+
SQRT format op1 op2
-> pprFormatOpReg (text "sqrt") format op1 op2
@@ -968,6 +976,21 @@ pprInstr platform i = case i of
pprOperand platform format op2
]
+   -- Print @mnemonic op1,op2,op3@ for an (operand, register, register) instruction.
+   pprFormatOpRegReg :: Line doc -> Format -> Operand -> Reg -> Reg -> doc
+   pprFormatOpRegReg name format op1 op2 op3
+     = line $
+         pprMnemonic name format
+           <> pprOperand platform format op1
+           <> comma
+           <> pprReg platform format op2
+           <> comma
+           <> pprReg platform format op3
+
+   -- Mnemonic suffix selecting the operand ordering of an FMA3 instruction.
+   pprFMAPermutation :: FMAPermutation -> Line doc
+   pprFMAPermutation perm = case perm of
+     FMA132 -> text "132"
+     FMA213 -> text "213"
+     FMA231 -> text "231"
pprOpOp :: Line doc -> Format -> Operand -> Operand -> doc
pprOpOp name format op1 op2