Generate better fp abs for X86 and llvm with default cmm otherwise

Currently we have this in libraries/base/GHC/Float.hs: ``` abs x | x == 0 = 0 -- handles (-0.0) | x > 0 = x | otherwise = negateFloat x ``` But 3-4 years ago it was noted that this was inefficient: https://mail.haskell.org/pipermail/libraries/2013-April/019690.html We can generate better code for X86 and llvm and for others generate some custom cmm code which is similar to what the compiler generates now. Reviewers: austin, simonmar, hvr, bgamari Reviewed By: bgamari Subscribers: dfeuer, thomie Differential Revision: https://phabricator.haskell.org/D3265
author: Dominic Steinitz <dominic@steinitz.org> 2017-03-07 09:26:16 -0500
committer: Ben Gamari <ben@smart-cactus.org> 2017-03-07 13:32:33 -0500
commit: 12ccf767af3373e319b75d5d61fe79df4a389e45 (patch)
tree: ec4bf27785be88d67617ddddd5968e4c128fb6a1
parent: 48759c0ef0e7ce718b52557599ebbb884c19a2ad (diff)
download: haskell-12ccf767af3373e319b75d5d61fe79df4a389e45.tar.gz
10 files changed, 94 insertions, 12 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index a8cbd682e6..d736f14bfc 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -528,6 +528,7 @@ data CallishMachOp
   | MO_F64_Atan
   | MO_F64_Log
   | MO_F64_Exp
+  | MO_F64_Fabs
   | MO_F64_Sqrt
   | MO_F32_Pwr
   | MO_F32_Sin
@@ -541,6 +542,7 @@ data CallishMachOp
   | MO_F32_Atan
   | MO_F32_Log
   | MO_F32_Exp
+  | MO_F32_Fabs
   | MO_F32_Sqrt
 
   | MO_UF_Conv Width
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index dba8ca6e8c..6a84e30abb 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -754,6 +754,7 @@ pprCallishMachOp_for_C mop
         MO_F64_Log      -> text "log"
         MO_F64_Exp      -> text "exp"
         MO_F64_Sqrt     -> text "sqrt"
+        MO_F64_Fabs     -> unsupported
         MO_F32_Pwr      -> text "powf"
         MO_F32_Sin      -> text "sinf"
         MO_F32_Cos      -> text "cosf"
@@ -767,6 +768,7 @@ pprCallishMachOp_for_C mop
         MO_F32_Log      -> text "logf"
         MO_F32_Exp      -> text "expf"
         MO_F32_Sqrt     -> text "sqrtf"
+        MO_F32_Fabs     -> unsupported
         MO_WriteBarrier -> text "write_barrier"
         MO_Memcpy _     -> text "memcpy"
         MO_Memset _     -> text "memset"
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 14eb4258de..0edde0659f 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -844,6 +844,12 @@ callishPrimOpSupported dflags op
       WordMul2Op     | ncg && x86ish
                          || llvm      -> Left (MO_U_Mul2     (wordWidth dflags))
                      | otherwise      -> Right genericWordMul2Op
+      FloatFabsOp    | (ncg && x86ish)
+                         || llvm      -> Left MO_F32_Fabs
+                     | otherwise      -> Right $ genericFabsOp W32
+      DoubleFabsOp   | (ncg && x86ish)
+                         || llvm      -> Left MO_F64_Fabs
+                     | otherwise      -> Right $ genericFabsOp W64
 
       _ -> pprPanic "emitPrimOp: can't translate PrimOp " (ppr op)
  where
@@ -1064,6 +1070,34 @@ genericWordMul2Op [res_h, res_l] [arg_x, arg_y]
                         topHalf (CmmReg r)])]
 genericWordMul2Op _ _ = panic "genericWordMul2Op"
 
+-- This replicates what we had in libraries/base/GHC/Float.hs:
+--
+--    abs x    | x == 0    = 0 -- handles (-0.0)
+--             | x >  0    = x
+--             | otherwise = negateFloat x
+genericFabsOp :: Width -> GenericOp
+genericFabsOp w [res_r] [aa]
+ = do dflags <- getDynFlags
+      let zero   = CmmLit (CmmFloat 0 w)
+
+          eq x y = CmmMachOp (MO_F_Eq w) [x, y]
+          gt x y = CmmMachOp (MO_F_Gt w) [x, y]
+
+          neg x  = CmmMachOp (MO_F_Neg w) [x]
+
+          g1 = catAGraphs [mkAssign (CmmLocal res_r) zero]
+          g2 = catAGraphs [mkAssign (CmmLocal res_r) aa]
+
+      res_t <- CmmLocal <$> newTemp (cmmExprType dflags aa)
+      let g3 = catAGraphs [mkAssign res_t aa,
+                           mkAssign (CmmLocal res_r) (neg (CmmReg res_t))]
+
+      g4 <- mkCmmIfThenElse (gt aa zero) g2 g3
+
+      emit =<< mkCmmIfThenElse (eq aa zero) g1 g4
+
+genericFabsOp _ _ _ = panic "genericFabsOp"
+
 -- These PrimOps are NOPs in Cmm
 
 nopOp :: PrimOp -> Bool
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index d88d234057..40c5498255 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -690,6 +690,7 @@ cmmPrimOpFunctions mop = do
     MO_F32_Exp    -> fsLit "expf"
     MO_F32_Log    -> fsLit "logf"
     MO_F32_Sqrt   -> fsLit "llvm.sqrt.f32"
+    MO_F32_Fabs   -> fsLit "llvm.fabs.f32"
     MO_F32_Pwr    -> fsLit "llvm.pow.f32"
 
     MO_F32_Sin    -> fsLit "llvm.sin.f32"
@@ -707,6 +708,7 @@ cmmPrimOpFunctions mop = do
     MO_F64_Exp    -> fsLit "exp"
     MO_F64_Log    -> fsLit "log"
     MO_F64_Sqrt   -> fsLit "llvm.sqrt.f64"
+    MO_F64_Fabs   -> fsLit "llvm.fabs.f64"
     MO_F64_Pwr    -> fsLit "llvm.pow.f64"
 
     MO_F64_Sin    -> fsLit "llvm.sin.f64"
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 849516f229..1f06c7bd3e 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1525,6 +1525,7 @@ genCCall' dflags gcp target dest_regs args
                     MO_F32_Exp   -> (fsLit "exp", True)
                     MO_F32_Log   -> (fsLit "log", True)
                     MO_F32_Sqrt  -> (fsLit "sqrt", True)
+                    MO_F32_Fabs  -> unsupported
 
                     MO_F32_Sin   -> (fsLit "sin", True)
                     MO_F32_Cos   -> (fsLit "cos", True)
@@ -1542,6 +1543,7 @@ genCCall' dflags gcp target dest_regs args
                     MO_F64_Exp   -> (fsLit "exp", False)
                     MO_F64_Log   -> (fsLit "log", False)
                     MO_F64_Sqrt  -> (fsLit "sqrt", False)
+                    MO_F64_Fabs  -> unsupported
 
                     MO_F64_Sin   -> (fsLit "sin", False)
                     MO_F64_Cos   -> (fsLit "cos", False)
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index a6d3f9484e..3e9058bdfd 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -610,6 +610,7 @@ outOfLineMachOp_table mop
         MO_F32_Exp    -> fsLit "expf"
         MO_F32_Log    -> fsLit "logf"
         MO_F32_Sqrt   -> fsLit "sqrtf"
+        MO_F32_Fabs   -> unsupported
         MO_F32_Pwr    -> fsLit "powf"
 
         MO_F32_Sin    -> fsLit "sinf"
@@ -627,6 +628,7 @@ outOfLineMachOp_table mop
         MO_F64_Exp    -> fsLit "exp"
         MO_F64_Log    -> fsLit "log"
         MO_F64_Sqrt   -> fsLit "sqrt"
+        MO_F64_Fabs   -> unsupported
         MO_F64_Pwr    -> fsLit "pow"
 
         MO_F64_Sin    -> fsLit "sin"
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 877d822023..704514e324 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -2043,17 +2043,24 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] =
 genCCall _ is32Bit target dest_regs args = do
   dflags <- getDynFlags
   let platform = targetPlatform dflags
+      sse2     = isSse2Enabled dflags
   case (target, dest_regs) of
     -- void return type prim op
     (PrimTarget op, []) ->
         outOfLineCmmOp op Nothing args
     -- we only cope with a single result for foreign calls
     (PrimTarget op, [r])
-      | not is32Bit -> outOfLineCmmOp op (Just r) args
+      | sse2 -> case op of
+          MO_F32_Fabs -> case args of
+            [x] -> sse2FabsCode W32 x
+            _ -> panic "genCCall: Wrong number of arguments for fabs"
+          MO_F64_Fabs -> case args of
+            [x] -> sse2FabsCode W64 x
+            _ -> panic "genCCall: Wrong number of arguments for fabs"
+          _other_op -> outOfLineCmmOp op (Just r) args
       | otherwise -> do
         l1 <- getNewLabelNat
         l2 <- getNewLabelNat
-        sse2 <- sse2Enabled
         if sse2
           then
             outOfLineCmmOp op (Just r) args
@@ -2082,6 +2089,23 @@ genCCall _ is32Bit target dest_regs args = do
               = panic $ "genCCall.actuallyInlineFloatOp: bad number of arguments! ("
                       ++ show (length args) ++ ")"
 
+        sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
+        sse2FabsCode w x = do
+          let fmt = floatFormat w
+          x_code <- getAnyReg x
+          let
+            const | FF32 <- fmt = CmmInt 0x7fffffff W32
+                  | otherwise   = CmmInt 0x7fffffffffffffff W64
+          Amode amode amode_code <- memConstant (widthInBytes w) const
+          tmp <- getNewRegNat fmt
+          let
+            code dst = x_code dst `appOL` amode_code `appOL` toOL [
+                MOV fmt (OpAddr amode) (OpReg tmp),
+                AND fmt (OpReg tmp) (OpReg dst)
+                ]
+
+          return $ code (getRegisterReg platform True (CmmLocal r))
+
     (PrimTarget (MO_S_QuotRem  width), _) -> divOp1 platform True  width dest_regs args
     (PrimTarget (MO_U_QuotRem  width), _) -> divOp1 platform False width dest_regs args
     (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
@@ -2599,6 +2623,7 @@ outOfLineCmmOp mop res args
 
         fn = case mop of
               MO_F32_Sqrt  -> fsLit "sqrtf"
+              MO_F32_Fabs  -> unsupported
               MO_F32_Sin   -> fsLit "sinf"
               MO_F32_Cos   -> fsLit "cosf"
               MO_F32_Tan   -> fsLit "tanf"
@@ -2615,6 +2640,7 @@ outOfLineCmmOp mop res args
               MO_F32_Pwr   -> fsLit "powf"
 
               MO_F64_Sqrt  -> fsLit "sqrt"
+              MO_F64_Fabs  -> unsupported
               MO_F64_Sin   -> fsLit "sin"
               MO_F64_Cos   -> fsLit "cos"
               MO_F64_Tan   -> fsLit "tan"
@@ -3050,8 +3076,16 @@ sse2NegCode w x = do
   x_code <- getAnyReg x
   -- This is how gcc does it, so it can't be that bad:
   let
-    const | FF32 <- fmt = CmmInt 0x80000000 W32
-          | otherwise   = CmmInt 0x8000000000000000 W64
+    const = case fmt of
+      FF32 -> CmmInt 0x80000000 W32
+      FF64 -> CmmInt 0x8000000000000000 W64
+      x@II8  -> wrongFmt x
+      x@II16 -> wrongFmt x
+      x@II32 -> wrongFmt x
+      x@II64 -> wrongFmt x
+      x@FF80 -> wrongFmt x
+      where
+        wrongFmt x = panic $ "sse2NegCode: " ++ show x
   Amode amode amode_code <- memConstant (widthInBytes w) const
   tmp <- getNewRegNat fmt
   let
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 223ea13994..7d19e990ce 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -631,6 +631,8 @@ pprInstr (SUB_CC format src dst)
 pprInstr (AND II64 src@(OpImm (ImmInteger mask)) dst)
   | 0 <= mask && mask < 0xffffffff
     = pprInstr (AND II32 src dst)
+pprInstr (AND FF32 src dst) = pprOpOp (sLit "andps") FF32 src dst
+pprInstr (AND FF64 src dst) = pprOpOp (sLit "andpd") FF64 src dst
 pprInstr (AND format src dst) = pprFormatOpOp (sLit "and") format src dst
 pprInstr (OR  format src dst) = pprFormatOpOp (sLit "or")  format src dst
 
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 11928b6f69..76cfe671c1 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -531,6 +531,8 @@ primop   DoubleDivOp   "/##"   Dyadic
 
 primop   DoubleNegOp   "negateDouble#"  Monadic   Double# -> Double#
 
+primop   DoubleFabsOp  "fabsDouble#"    Monadic   Double# -> Double#
+
 primop   Double2IntOp   "double2Int#"          GenPrimOp  Double# -> Int#
    {Truncates a {\tt Double#} value to the nearest {\tt Int#}.
     Results are undefined if the truncation if truncation yields
@@ -657,6 +659,8 @@ primop   FloatDivOp   "divideFloat#"      Dyadic
 
 primop   FloatNegOp   "negateFloat#"      Monadic    Float# -> Float#
 
+primop   FloatFabsOp  "fabsFloat#"        Monadic    Float# -> Float#
+
 primop   Float2IntOp   "float2Int#"      GenPrimOp  Float# -> Int#
    {Truncates a {\tt Float#} value to the nearest {\tt Int#}.
     Results are undefined if the truncation if truncation yields
diff --git a/libraries/base/GHC/Float.hs b/libraries/base/GHC/Float.hs
index 18dd288dd6..64467b338e 100644
--- a/libraries/base/GHC/Float.hs
+++ b/libraries/base/GHC/Float.hs
@@ -245,9 +245,7 @@ instance  Num Float  where
     (-)         x y     =  minusFloat x y
     negate      x       =  negateFloat x
     (*)         x y     =  timesFloat x y
-    abs x    | x == 0    = 0 -- handles (-0.0)
-             | x >  0    = x
-             | otherwise = negateFloat x
+    abs         x       =  fabsFloat x
     signum x | x > 0     = 1
              | x < 0     = negateFloat 1
              | otherwise = x -- handles 0.0, (-0.0), and NaN
@@ -427,9 +425,7 @@ instance  Num Double  where
     (-)         x y     =  minusDouble x y
     negate      x       =  negateDouble x
     (*)         x y     =  timesDouble x y
-    abs x    | x == 0    = 0 -- handles (-0.0)
-             | x >  0    = x
-             | otherwise = negateDouble x
+    abs         x       =  fabsDouble x
     signum x | x > 0     = 1
              | x < 0     = negateDouble 1
              | otherwise = x -- handles 0.0, (-0.0), and NaN
@@ -1087,13 +1083,14 @@ geFloat     (F# x) (F# y) = isTrue# (geFloat# x y)
 ltFloat     (F# x) (F# y) = isTrue# (ltFloat# x y)
 leFloat     (F# x) (F# y) = isTrue# (leFloat# x y)
 
-expFloat, logFloat, sqrtFloat :: Float -> Float
+expFloat, logFloat, sqrtFloat, fabsFloat :: Float -> Float
 sinFloat, cosFloat, tanFloat  :: Float -> Float
 asinFloat, acosFloat, atanFloat  :: Float -> Float
 sinhFloat, coshFloat, tanhFloat  :: Float -> Float
 expFloat    (F# x) = F# (expFloat# x)
 logFloat    (F# x) = F# (logFloat# x)
 sqrtFloat   (F# x) = F# (sqrtFloat# x)
+fabsFloat   (F# x) = F# (fabsFloat# x)
 sinFloat    (F# x) = F# (sinFloat# x)
 cosFloat    (F# x) = F# (cosFloat# x)
 tanFloat    (F# x) = F# (tanFloat# x)
@@ -1131,13 +1128,14 @@ double2Float (D# x) = F# (double2Float# x)
 float2Double :: Float -> Double
 float2Double (F# x) = D# (float2Double# x)
 
-expDouble, logDouble, sqrtDouble :: Double -> Double
+expDouble, logDouble, sqrtDouble, fabsDouble :: Double -> Double
 sinDouble, cosDouble, tanDouble  :: Double -> Double
 asinDouble, acosDouble, atanDouble  :: Double -> Double
 sinhDouble, coshDouble, tanhDouble  :: Double -> Double
 expDouble    (D# x) = D# (expDouble# x)
 logDouble    (D# x) = D# (logDouble# x)
 sqrtDouble   (D# x) = D# (sqrtDouble# x)
+fabsDouble   (D# x) = D# (fabsDouble# x)
 sinDouble    (D# x) = D# (sinDouble# x)
 cosDouble    (D# x) = D# (cosDouble# x)
 tanDouble    (D# x) = D# (tanDouble# x)
author	Dominic Steinitz <dominic@steinitz.org>	2017-03-07 09:26:16 -0500
committer	Ben Gamari <ben@smart-cactus.org>	2017-03-07 13:32:33 -0500
commit	12ccf767af3373e319b75d5d61fe79df4a389e45 (patch)
tree	ec4bf27785be88d67617ddddd5968e4c128fb6a1
parent	48759c0ef0e7ce718b52557599ebbb884c19a2ad (diff)
download	haskell-12ccf767af3373e319b75d5d61fe79df4a389e45.tar.gz