10 files changed, 254 insertions, 18 deletions
diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
index 8ebccaf093..c0e9a7e8d5 100644
--- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
@@ -783,7 +783,7 @@ getRegister' config plat expr
       where w' = formatToWidth (cmmTypeFormat (cmmRegType reg))
             r' = getRegisterReg plat reg
 
-    -- Generic case.
+    -- Generic binary case.
     CmmMachOp op [x, y] -> do
       -- alright, so we have an operation, and two expressions. And we want to essentially do
       -- ensure we get float regs (TODO(Ben): What?)
@@ -956,7 +956,44 @@ getRegister' config plat expr
 
         -- TODO
 
-        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $
+                (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+
+    -- Generic ternary case.
+    CmmMachOp op [x, y, z] ->
+      case op of
+
+        -- Floating-point fused multiply-add operations
+        --
+        -- x86 fmadd    x * y + z <=> AArch64 fmadd : d =   r1 * r2 + r3
+        -- x86 fmsub    x * y - z <=> AArch64 fnmsub: d =   r1 * r2 - r3
+        -- x86 fnmadd - x * y + z <=> AArch64 fmsub : d = - r1 * r2 + r3
+        -- x86 fnmsub - x * y - z <=> AArch64 fnmadd: d = - r1 * r2 - r3
+        MO_FMA var w ->
+          float3Op w $ \d n m a -> unitOL (FMA (nativeVariant var) d n m a)
+
+        _ -> pprPanic "getRegister' (unhandled ternary CmmMachOp): " $
+                (pprMachOp op) <+> text "in" <+> (pdoc plat expr)
+
+      where
+          -- Select the AArch64 instruction for a variant, per the table above.
+          nativeVariant FMAdd  = FMAdd
+          nativeVariant FMSub  = FNMSub
+          nativeVariant FNMAdd = FMSub
+          nativeVariant FNMSub = FNMAdd
+
+          -- Evaluate the operands, then emit the instruction built by 'mkInstr'.
+          float3Op w mkInstr = do
+            (reg_fx, format_x, code_fx) <- getFloatReg x
+            (reg_fy, format_y, code_fy) <- getFloatReg y
+            (reg_fz, format_z, code_fz) <- getFloatReg z
+            massertPpr (all isFloatFormat [format_x, format_y, format_z]) $
+              text "float3Op: non-float"
+            let operands = code_fx `appOL` code_fy `appOL` code_fz
+            return $ Any (floatFormat w) $ \dst ->
+              operands `appOL`
+              mkInstr (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) (OpReg w reg_fz)
+
     CmmMachOp _op _xs
       -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)
 
diff --git a/compiler/GHC/CmmToAsm/AArch64/Instr.hs b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
index 7bf78becb6..166ab2ca17 100644
--- a/compiler/GHC/CmmToAsm/AArch64/Instr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
@@ -142,6 +142,8 @@ regUsageOfInstr platform instr = case instr of
   SCVTF dst src            -> usage (regOp src, regOp dst)
   FCVTZS dst src           -> usage (regOp src, regOp dst)
   FABS dst src             -> usage (regOp src, regOp dst)
+  FMA _ dst src1 src2 src3 ->
+    usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
 
   _ -> panic $ "regUsageOfInstr: " ++ instrCon instr
 
@@ -280,6 +282,9 @@ patchRegsOfInstr instr env = case instr of
     SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
     FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
     FABS o1 o2     -> FABS (patchOp o1) (patchOp o2)
+    FMA s o1 o2 o3 o4 ->
+      FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
+
     _              -> panic $ "patchRegsOfInstr: " ++ instrCon instr
     where
         patchOp :: Operand -> Operand
@@ -650,6 +655,14 @@ data Instr
     -- Float ABSolute value
     | FABS Operand Operand
 
+    -- | Floating-point fused multiply-add instructions
+    --
+    -- - fmadd : d =   r1 * r2 + r3
+    -- - fnmsub: d =   r1 * r2 - r3
+    -- - fmsub : d = - r1 * r2 + r3
+    -- - fnmadd: d = - r1 * r2 - r3
+    | FMA FMASign Operand Operand Operand Operand
+
 instrCon :: Instr -> String
 instrCon i =
     case i of
@@ -715,6 +728,12 @@ instrCon i =
       SCVTF{} -> "SCVTF"
       FCVTZS{} -> "FCVTZS"
       FABS{} -> "FABS"
+      FMA variant _ _ _ _ ->
+        case variant of
+          FMAdd  -> "FMADD"
+          FMSub  -> "FMSUB"
+          FNMAdd -> "FNMADD"
+          FNMSub -> "FNMSUB"
 
 data Target
     = TBlock BlockId
diff --git a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
index 475324afce..646f914c8d 100644
--- a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
@@ -546,6 +546,13 @@ pprInstr platform instr = case instr of
   SCVTF o1 o2 -> op2 (text "\tscvtf") o1 o2
   FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
   FABS o1 o2 -> op2 (text "\tfabs") o1 o2
+  FMA variant d r1 r2 r3 ->
+    let fma = case variant of
+                FMAdd  -> text "\tfmadd"
+                FMSub  -> text "\tfmsub"
+                FNMAdd -> text "\tfnmadd"
+                FNMSub -> text "\tfnmsub"
+    in op4 fma d r1 r2 r3
  where op2 op o1 o2        = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2
        op3 op o1 o2 o3     = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3
        op4 op o1 o2 o3 o4  = line $ op <+> pprOp platform o1 <> comma <+> pprOp platform o2 <> comma <+> pprOp platform o3 <> comma <+> pprOp platform o4
diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
index 7dac4f221b..f8a726da6c 100644
--- a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
@@ -649,6 +649,21 @@ getRegister' _ _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
       code <- remainderCode rep sgn tmp x y
       return (Any fmt code)
 
+getRegister' _ _ (CmmMachOp mop [x, y, z]) -- ternary PrimOps
+  = case mop of
+      -- x86 fmadd    x * y + z <> PPC fmadd  rt =   ra * rc + rb
+      -- x86 fmsub    x * y - z <> PPC fmsub  rt =   ra * rc - rb
+      -- x86 fnmadd - x * y + z ~~ PPC fnmsub rt = -(ra * rc - rb)
+      -- x86 fnmsub - x * y - z ~~ PPC fnmadd rt = -(ra * rc + rb)
+      -- NOTE(review): the variant is handed to FMADD unchanged although the
+      -- table pairs x86 fnmadd/fnmsub with PPC fnmsub/fnmadd; confirm printer.
+      MO_FMA variant w ->
+        case variant of
+          FMAdd  -> fma_code w (FMADD FMAdd) x y z
+          FMSub  -> fma_code w (FMADD FMSub) x y z
+          FNMAdd -> fma_code w (FMADD FNMAdd) x y z
+          FNMSub -> fma_code w (FMADD FNMSub) x y z
+      _ -> panic "PPC.CodeGen.getRegister: no match"
 
 getRegister' _ _ (CmmLit (CmmInt i rep))
   | Just imm <- makeImmediate rep True i
@@ -2358,10 +2373,28 @@ trivialUCode rep instr x = do
     let code' dst = code `snocOL` instr dst src
     return (Any rep code')
 
+-- | Generate code for an FMA-style instruction taking four registers,
+-- e.g. @fmadd rt ra rc rb@, which computes @rt <- ra * rc + rb@.
+fma_code :: Width
+         -> (Format -> Reg -> Reg -> Reg -> Reg -> Instr)
+         -> CmmExpr
+         -> CmmExpr
+         -> CmmExpr
+         -> NatM Register
+fma_code w instr ra rc rb = do
+    (reg_a, code_a) <- getSomeReg ra
+    (reg_c, code_c) <- getSomeReg rc
+    (reg_b, code_b) <- getSomeReg rb
+    -- Operand code runs in argument order, followed by the instruction.
+    let operands = code_a `appOL` code_c `appOL` code_b
+    return $ Any fmt $ \rt ->
+      operands `snocOL` instr fmt rt reg_a reg_c reg_b
+  where
+    fmt = floatFormat w
+
 -- There is no "remainder" instruction on the PPC, so we have to do
 -- it the hard way.
 -- The "sgn" parameter is the signedness for the division instruction
-
 remainderCode :: Width -> Bool -> Reg -> CmmExpr -> CmmExpr
                -> NatM (Reg -> InstrBlock)
 remainderCode rep sgn reg_q arg_x arg_y = do
diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs
index 639ae979f8..3fedcc1fc4 100644
--- a/compiler/GHC/CmmToAsm/PPC/Instr.hs
+++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs
@@ -280,6 +280,14 @@ data Instr
     | FABS    Reg Reg               -- abs is the same for single and double
     | FNEG    Reg Reg               -- negate is the same for single and double prec.
 
+    -- | Fused multiply-add instructions, with operands @rt ra rc rb@.
+    --
+    --   - FMADD:  @rt =   ra * rc + rb@
+    --   - FMSUB:  @rt =   ra * rc - rb@
+    --   - FNMADD: @rt = -(ra * rc + rb)@
+    --   - FNMSUB: @rt = -(ra * rc - rb)@
+    | FMADD FMASign Format Reg Reg Reg Reg
+
     | FCMP    Reg Reg
 
     | FCTIWZ  Reg Reg           -- convert to integer word
@@ -380,6 +388,7 @@ regUsageOfInstr platform instr
     MFCR    reg             -> usage ([], [reg])
     MFLR    reg             -> usage ([], [reg])
     FETCHPC reg             -> usage ([], [reg])
+    FMADD _ _ rt ra rc rb   -> usage ([ra, rc, rb], [rt])
     _                       -> noUsage
   where
     usage (src, dst) = RU (filter (interesting platform) src)
@@ -467,6 +476,8 @@ patchRegsOfInstr instr env
     FDIV    fmt r1 r2 r3    -> FDIV fmt (env r1) (env r2) (env r3)
     FABS    r1 r2           -> FABS (env r1) (env r2)
     FNEG    r1 r2           -> FNEG (env r1) (env r2)
+    FMADD   sgn fmt r1 r2 r3 r4
+                            -> FMADD sgn fmt (env r1) (env r2) (env r3) (env r4)
     FCMP    r1 r2           -> FCMP (env r1) (env r2)
     FCTIWZ  r1 r2           -> FCTIWZ (env r1) (env r2)
     FCTIDZ  r1 r2           -> FCTIDZ (env r1) (env r2)
diff --git a/compiler/GHC/CmmToAsm/PPC/Ppr.hs b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
index ba364df1b0..f1d6733327 100644
--- a/compiler/GHC/CmmToAsm/PPC/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
@@ -934,6 +934,9 @@ pprInstr platform instr = case instr of
    FNEG reg1 reg2
       -> pprUnary (text "fneg") reg1 reg2
 
+   FMADD signs fmt dst ra rc rb
+     -> pprTernaryF (pprFMASign signs) fmt dst ra rc rb
+
    FCMP reg1 reg2
       -> line $ hcat [
            char '\t',
@@ -1083,6 +1086,21 @@ pprBinaryF op fmt reg1 reg2 reg3 = line $ hcat [
         pprReg reg3
     ]
 
+-- | Pretty-print a floating-point instruction that takes a destination
+-- register followed by three source registers.
+pprTernaryF :: IsDoc doc => Line doc -> Format -> Reg -> Reg -> Reg -> Reg -> doc
+pprTernaryF op fmt rt ra rc rb = line (hcat (mnemonic ++ operands))
+  where
+    -- Tab, opcode, the output of 'pprFFormat', then another tab.
+    mnemonic = [char '\t', op, pprFFormat fmt, char '\t']
+    -- Destination first, then the three sources, separated by ", ".
+    operands =
+      [ pprReg rt, text ", "
+      , pprReg ra, text ", "
+      , pprReg rc, text ", "
+      , pprReg rb
+      ]
+
 pprRI :: IsLine doc => Platform -> RI -> doc
 pprRI _        (RIReg r) = pprReg r
 pprRI platform (RIImm r) = pprImm platform r
diff --git a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
index 7ca323d72d..9a4c3f34c2 100644
--- a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
+++ b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
@@ -816,7 +816,9 @@ lower_CmmMachOp lbl (MO_SS_Conv w0 w1) xs = lower_MO_SS_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_UU_Conv w0 w1) xs = lower_MO_UU_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_XX_Conv w0 w1) xs = lower_MO_UU_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_FF_Conv w0 w1) xs = lower_MO_FF_Conv lbl w0 w1 xs
-lower_CmmMachOp _ _ _ = panic "lower_CmmMachOp: unreachable"
+lower_CmmMachOp _ mop _ =
+  pprPanic "lower_CmmMachOp: unreachable"
+           (text "offending MachOp:" <+> pprMachOp mop)
 
 -- | Lower a 'CmmLit'. Note that we don't emit 'f32.const' or
 -- 'f64.const' for the time being, and instead emit their relative bit
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index d6ef821c9f..859b27e248 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -901,14 +901,10 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
       MO_U_Lt _ -> condIntReg LU  x y
       MO_U_Le _ -> condIntReg LEU x y
 
-      MO_F_Add w   -> trivialFCode_sse2 w ADD  x y
-
-      MO_F_Sub w   -> trivialFCode_sse2 w SUB  x y
-
-      MO_F_Quot w  -> trivialFCode_sse2 w FDIV x y
-
-      MO_F_Mul w   -> trivialFCode_sse2 w MUL x y
-
+      MO_F_Add  w -> trivialFCode_sse2 w ADD  x y
+      MO_F_Sub  w -> trivialFCode_sse2 w SUB  x y
+      MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
+      MO_F_Mul  w -> trivialFCode_sse2 w MUL  x y
 
       MO_Add rep -> add_code rep x y
       MO_Sub rep -> sub_code rep x y
@@ -1113,6 +1109,13 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = -- dyadic MachOps
 
            return (Fixed format result code)
 
+-- Ternary MachOps.  Only the fused multiply-add family @ ± x*y ± z@ is
+-- handled; any other three-operand MachOp is reported via a panic.
+getRegister' _plat _is32Bit (CmmMachOp mop [x, y, z])
+  | MO_FMA signs width <- mop
+  = genFMA3Code width signs x y z
+  | otherwise
+  = pprPanic "getRegister(x86) - ternary CmmMachOp (1)" (pprMachOp mop)
 
 getRegister' _ _ (CmmLoad mem pk _)
   | isFloatType pk
@@ -3151,12 +3154,12 @@ genTrivialCode rep instr a b = do
   a_code <- getAnyReg a
   tmp <- getNewRegNat rep
   let
-     -- We want the value of b to stay alive across the computation of a.
-     -- But, we want to calculate a straight into the destination register,
+     -- We want the value of 'b' to stay alive across the computation of 'a'.
+     -- But, we want to calculate 'a' straight into the destination register,
      -- because the instruction only has two operands (dst := dst `op` src).
-     -- The troublesome case is when the result of b is in the same register
-     -- as the destination reg.  In this case, we have to save b in a
-     -- new temporary across the computation of a.
+     -- The troublesome case is when the result of 'b' is in the same register
+     -- as the destination 'reg'.  In this case, we have to save 'b' in a
+     -- new temporary across the computation of 'a'.
      code dst
         | dst `regClashesWithOp` b_op =
                 b_code `appOL`
@@ -3174,6 +3177,69 @@ reg `regClashesWithOp` OpReg reg2   = reg == reg2
 reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
 _   `regClashesWithOp` _            = False
 
+-- | Generate code for a fused multiply-add operation, of the form @± x * y ± z@,
+-- with 3 operands (FMA3 instruction set).
+genFMA3Code :: Width
+            -> FMASign
+            -> CmmExpr -> CmmExpr -> CmmExpr -> NatM Register
+genFMA3Code w signs x y z = do
+  -- For the FMA instruction, we want to compute x * y + z
+  --
+  -- There are three possible instructions we could emit:
+  --
+  --   - fmadd213 z y x, result in x, z can be a memory address
+  --   - fmadd132 x z y, result in y, x can be a memory address
+  --   - fmadd231 y x z, result in z, y can be a memory address
+  --
+  -- This suggests two possible optimisations:
+  --
+  --   - OPTIMISATION 1
+  --     If one argument is an address, use the instruction that allows
+  --     a memory address in that position.
+  --
+  --   - OPTIMISATION 2
+  --     If one argument is in a fixed register, use the instruction that puts
+  --     the result in that same register.
+  --
+  -- Currently we follow neither of these optimisations,
+  -- opting to always use fmadd213 for simplicity.
+  let rep = floatFormat w
+  (y_reg, y_code) <- getNonClobberedReg y
+  (z_reg, z_code) <- getNonClobberedReg z
+  x_code <- getAnyReg x
+  y_tmp <- getNewRegNat rep
+  z_tmp <- getNewRegNat rep
+  let
+     fma213 = FMA3 rep signs FMA213
+     -- 'x' is computed into 'dst'; save any other operand living in 'dst' first.
+     code dst
+         | dst == y_reg
+         , dst == z_reg
+         = y_code `appOL`
+           unitOL (MOV rep (OpReg y_reg) (OpReg y_tmp)) `appOL`
+           z_code `appOL`
+           unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_tmp) y_tmp dst
+         | dst == y_reg
+         = y_code `appOL`
+           unitOL (MOV rep (OpReg y_reg) (OpReg y_tmp)) `appOL`
+           z_code `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_reg) y_tmp dst
+         | dst == z_reg
+         = y_code `appOL`
+           z_code `appOL`
+           unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_tmp) y_reg dst
+         | otherwise
+         = y_code `appOL`
+           z_code `appOL`
+           x_code dst `snocOL`
+           fma213 (OpReg z_reg) y_reg dst
+  return (Any rep code)
+
 -----------
 
 trivialUCode :: Format -> (Operand -> Instr)
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index ccb3ce09ba..b4e93a1c5d 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -12,6 +12,7 @@ module GHC.CmmToAsm.X86.Instr
    ( Instr(..)
    , Operand(..)
    , PrefetchVariant(..)
+   , FMAPermutation(..)
    , JumpDest(..)
    , getJumpDestBlockId
    , canShortcut
@@ -272,6 +273,10 @@ data Instr
         | CVTSI2SS      Format Operand Reg -- I32/I64 to F32
         | CVTSI2SD      Format Operand Reg -- I32/I64 to F64
 
+        -- | FMA3 fused multiply-add operations.
+        | FMA3         Format FMASign FMAPermutation Operand Reg Reg
+          -- src1 (r/m), src2 (r), dst (r)
+
         -- use ADD, SUB, and SQRT for arithmetic.  In both cases, operands
         -- are  Operand Reg.
 
@@ -351,7 +356,7 @@ data Operand
         | OpImm  Imm            -- immediate value
         | OpAddr AddrMode       -- memory reference
 
-
+data FMAPermutation = FMA132 | FMA213 | FMA231 -- FMA3 operand orders (mnemonic suffix)
 
 -- | Returns which registers are read and written as a (read, written)
 -- pair.
@@ -438,6 +443,8 @@ regUsageOfInstr platform instr
     PDEP   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
     PEXT   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
 
+    FMA3 _ _ _ src1 src2 dst -> usageFMA src1 src2 dst
+
     -- note: might be a better way to do this
     PREFETCH _  _ src -> mkRU (use_R src []) []
     LOCK i              -> regUsageOfInstr platform i
@@ -482,6 +489,15 @@ regUsageOfInstr platform instr
     usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
     usageRMM _ _ _                               = panic "X86.RegInfo.usageRMM: no match"
 
+    -- FMA3 (3 operand form): 'dst' is read as well as written.
+    usageFMA :: Operand -> Reg -> Reg -> RegUsage
+    usageFMA src1 src2 dst =
+      let regRegs = [src2, dst] in
+      case src1 of
+        OpReg r   -> mkRU (r : regRegs) [dst]
+        OpAddr ea -> mkRU (use_EA ea regRegs) [dst]
+        _         -> panic "X86.RegInfo.usageFMA: no match"
+
     -- 1 operand form; operand Modified
     usageM :: Operand -> RegUsage
     usageM (OpReg reg)          = mkRU [reg] [reg]
@@ -561,6 +577,8 @@ patchRegsOfInstr instr env
     JMP op regs          -> JMP (patchOp op) regs
     JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
 
+    FMA3 fmt var perm x1 x2 x3 -> patch3 (FMA3 fmt var perm) x1 x2 x3
+
     -- literally only support storing the top x87 stack value st(0)
     X87Store  fmt  dst     -> X87Store fmt  (lookupAddr dst)
 
@@ -612,6 +630,8 @@ patchRegsOfInstr instr env
     patch1 insn op      = insn $! patchOp op
     patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
     patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
+    patch3 :: (Operand -> Reg -> Reg -> a) -> Operand -> Reg -> Reg -> a
+    patch3 mk op r1 r2 = ((mk $! patchOp op) $! env r1) $! env r2
 
     patchOp (OpReg  reg) = OpReg $! env reg
     patchOp (OpImm  imm) = OpImm imm
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
index 4a8f55fdf0..0d649f2efb 100644
--- a/compiler/GHC/CmmToAsm/X86/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -838,6 +838,14 @@ pprInstr platform i = case i of
    FDIV format op1 op2
       -> pprFormatOpOp (text "div") format op1 op2
 
+   FMA3 format var perm op1 op2 op3
+      -> let mnemo = case var of
+               FMAdd  -> text "vfmadd"
+               FMSub  -> text "vfmsub"
+               FNMAdd -> text "vfnmadd"
+               FNMSub -> text "vfnmsub"
+         in pprFormatOpRegReg (mnemo <> pprFMAPermutation perm) format op1 op2 op3
+
    SQRT format op1 op2
       -> pprFormatOpReg (text "sqrt") format op1 op2
 
@@ -968,6 +976,21 @@ pprInstr platform i = case i of
            pprOperand platform format op2
        ]
 
+   -- Print an instruction taking one general operand and two registers.
+   pprFormatOpRegReg :: Line doc -> Format -> Operand -> Reg -> Reg -> doc
+   pprFormatOpRegReg name format op1 op2 op3 =
+     line $  pprMnemonic name format
+          <> pprOperand platform format op1
+          <> comma
+          <> pprReg platform format op2
+          <> comma
+          <> pprReg platform format op3
+
+   pprFMAPermutation :: FMAPermutation -> Line doc
+   pprFMAPermutation perm = case perm of
+     FMA132 -> text "132"
+     FMA213 -> text "213"
+     FMA231 -> text "231"
 
    pprOpOp :: Line doc -> Format -> Operand -> Operand -> doc
    pprOpOp name format op1 op2