11 files changed, 174 insertions, 0 deletions
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index e309d061a8..3728c0cac2 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -1426,10 +1426,17 @@ dispatchPrimop dflags = \case
     if ncg && (x86ish || ppc) || llvm
     then Left (MO_U_Mul2     (wordWidth dflags))
     else Right genericWordMul2Op
+
+  IntMul2Op  -> \_ -> OpDest_CallishHandledLater $
+    if ncg && x86ish
+    then Left (MO_S_Mul2     (wordWidth dflags))
+    else Right genericIntMul2Op
+
   FloatFabsOp -> \_ -> OpDest_CallishHandledLater $
     if (ncg && x86ish || ppc) || llvm
     then Left MO_F32_Fabs
     else Right $ genericFabsOp W32
+
   DoubleFabsOp -> \_ -> OpDest_CallishHandledLater $
     if (ncg && x86ish || ppc) || llvm
     then Left MO_F64_Fabs
@@ -1870,6 +1877,31 @@ genericWordMul2Op [res_h, res_l] [arg_x, arg_y]
                         topHalf (CmmReg r)])]
 genericWordMul2Op _ _ = panic "genericWordMul2Op"
 
+genericIntMul2Op :: GenericOp
+genericIntMul2Op [res_c, res_h, res_l] [arg_x, arg_y]
+ = do dflags <- getDynFlags
+      -- Implement algorithm from Hacker's Delight, 2nd edition, p.174
+      let t = cmmExprType dflags arg_x
+      p   <- newTemp t
+      -- 1) compute the multiplication as if numbers were unsigned
+      let wordMul2 = fromMaybe (panic "Unsupported out-of-line WordMul2Op")
+                               (emitPrimOp dflags WordMul2Op [arg_x,arg_y])
+      wordMul2 [p,res_l]
+      -- 2) correct the high bits of the unsigned result
+      let carryFill x = CmmMachOp (MO_S_Shr ww) [x, wwm1]
+          sub x y     = CmmMachOp (MO_Sub   ww) [x, y]
+          and x y     = CmmMachOp (MO_And   ww) [x, y]
+          neq x y     = CmmMachOp (MO_Ne    ww) [x, y]
+          f   x y     = (carryFill x) `and` y
+          wwm1        = CmmLit (CmmInt (fromIntegral (widthInBits ww - 1)) ww)
+          rl x        = CmmReg (CmmLocal x)
+          ww          = wordWidth dflags
+      emit $ catAGraphs
+             [ mkAssign (CmmLocal res_h) (rl p `sub` f arg_x arg_y `sub` f arg_y arg_x)
+             , mkAssign (CmmLocal res_c) (rl res_h `neq` carryFill (rl res_l))
+             ]
+genericIntMul2Op _ _ = panic "genericIntMul2Op"
+
 -- This replicates what we had in libraries/base/GHC/Float.hs:
 --
 --    abs x    | x == 0    = 0 -- handles (-0.0)
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 9740d21bef..f8b7d4fb74 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -583,6 +583,7 @@ data CallishMachOp
 
   | MO_UF_Conv Width
 
+  | MO_S_Mul2    Width
   | MO_S_QuotRem Width
   | MO_U_QuotRem Width
   | MO_U_QuotRem2 Width
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 506116c584..d94bc01e03 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -825,6 +825,7 @@ pprCallishMachOp_for_C mop
         (MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w)
         (MO_UF_Conv w)  -> ptext (sLit $ word2FloatLabel w)
 
+        MO_S_Mul2     {} -> unsupported
         MO_S_QuotRem  {} -> unsupported
         MO_U_QuotRem  {} -> unsupported
         MO_U_QuotRem2 {} -> unsupported
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index f86207e081..c8d88a8c2a 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -833,6 +833,7 @@ cmmPrimOpFunctions mop = do
     MO_SubWordC w   -> fsLit $ "llvm.usub.with.overflow."
                              ++ showSDoc dflags (ppr $ widthToLlvmInt w)
 
+    MO_S_Mul2    {}  -> unsupported
     MO_S_QuotRem {}  -> unsupported
     MO_U_QuotRem {}  -> unsupported
     MO_U_QuotRem2 {} -> unsupported
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 11759fb1a5..05883d0e5a 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -2021,6 +2021,7 @@ genCCall' dflags gcp target dest_regs args
                     MO_AtomicRead _  -> unsupported
                     MO_AtomicWrite _ -> unsupported
 
+                    MO_S_Mul2    {}  -> unsupported
                     MO_S_QuotRem {}  -> unsupported
                     MO_U_QuotRem {}  -> unsupported
                     MO_U_QuotRem2 {} -> unsupported
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index e24180e535..46b29d0a03 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -681,6 +681,7 @@ outOfLineMachOp_table mop
         MO_AtomicRead w -> fsLit $ atomicReadLabel w
         MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
 
+        MO_S_Mul2    {}  -> unsupported
         MO_S_QuotRem {}  -> unsupported
         MO_U_QuotRem {}  -> unsupported
         MO_U_QuotRem2 {} -> unsupported
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 1807bdcea1..59a1e4115b 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -2613,6 +2613,26 @@ genCCall' _ is32Bit target dest_regs args bid = do
                                 MOV format (OpReg rax) (OpReg reg_l)]
                return code
         _ -> panic "genCCall: Wrong number of arguments/results for mul2"
+    (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
+        case args of
+        [arg_x, arg_y] ->
+            do (y_reg, y_code) <- getRegOrMem arg_y
+               x_code <- getAnyReg arg_x
+               reg_tmp <- getNewRegNat II8
+               let format = intFormat width
+                   reg_h = getRegisterReg platform (CmmLocal res_h)
+                   reg_l = getRegisterReg platform (CmmLocal res_l)
+                   reg_c = getRegisterReg platform (CmmLocal res_c)
+                   code = y_code `appOL`
+                          x_code rax `appOL`
+                          toOL [ IMUL2 format y_reg
+                               , MOV format (OpReg rdx) (OpReg reg_h)
+                               , MOV format (OpReg rax) (OpReg reg_l)
+                               , SETCC CARRY (OpReg reg_tmp)
+                               , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+                               ]
+               return code
+        _ -> panic "genCCall: Wrong number of arguments/results for imul2"
 
     _ -> if is32Bit
          then genCCall32' dflags target dest_regs args
@@ -3204,6 +3224,7 @@ outOfLineCmmOp bid mop res args
 
               MO_UF_Conv _ -> unsupported
 
+              MO_S_Mul2    {}  -> unsupported
               MO_S_QuotRem {}  -> unsupported
               MO_U_QuotRem {}  -> unsupported
               MO_U_QuotRem2 {} -> unsupported
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 076854b4d2..0faf180061 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -251,6 +251,13 @@ primop   IntMulOp    "*#"
    with commutable = True
         fixity = infixl 7
 
+primop   IntMul2Op    "timesInt2#" GenPrimOp
+   Int# -> Int# -> (# Int#, Int#, Int# #)
+   {Return a triple (isHighNeeded,high,low) where high and low are respectively
+   the high and low bits of the double-word result. isHighNeeded is a cheap way
+   to test if the high word is a sign-extension of the low word (isHighNeeded =
+   0#) or not (isHighNeeded = 1#).}
+
 primop   IntMulMayOfloOp  "mulIntMayOflo#"
    Dyadic   Int# -> Int# -> Int#
    {Return non-zero if there is any possibility that the upper word of a
diff --git a/libraries/ghc-prim/changelog.md b/libraries/ghc-prim/changelog.md
index 411d118aa1..cf14d21c81 100644
--- a/libraries/ghc-prim/changelog.md
+++ b/libraries/ghc-prim/changelog.md
@@ -27,6 +27,16 @@
   reverses the order of its bits e.g. `0b110001` becomes `0b100011`.
   These primitives use optimized machine instructions when available.
 
+- Add Int# multiplication primop:
+
+      timesInt2# :: Int# -> Int# -> (# Int#, Int#, Int# #)
+
+   `timesInt2#` computes the multiplication of its two parameters and returns a
+   triple (isHighNeeded,high,low) where high and low are respectively the high
+   and low bits of the double-word result. isHighNeeded is a cheap way to test
+   if the high word is a sign-extension of the low word (isHighNeeded = 0#) or
+   not (isHighNeeded = 1#).
+
 ## 0.6.0
 
 - Shipped with GHC 8.8.1
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index f96820de81..01516136c2 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -88,6 +88,7 @@ test('cgrun075', normal, compile_and_run, [''])
 test('cgrun076', normal, compile_and_run, [''])
 test('cgrun077', [when(have_cpu_feature('bmi2'), extra_hc_opts('-mbmi2'))], compile_and_run, [''])
 test('cgrun078', omit_ways(['ghci']), compile_and_run, [''])
+test('cgrun079', normal, compile_and_run, [''])
 
 test('T1852', normal, compile_and_run, [''])
 test('T1861', extra_run_opts('0'), compile_and_run, [''])
diff --git a/testsuite/tests/codeGen/should_run/cgrun079.hs b/testsuite/tests/codeGen/should_run/cgrun079.hs
new file mode 100644
index 0000000000..e299c860c3
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun079.hs
@@ -0,0 +1,98 @@
+{-# LANGUAGE CPP, MagicHash, BlockArguments, UnboxedTuples #-}
+
+-- Tests for the timesInt2# primop
+module Main ( main ) where
+
+import Data.Bits
+import GHC.Int
+import GHC.Prim
+import GHC.Word
+import Control.Monad
+
+#include "MachDeps.h"
+
+
+imul2 :: Int -> Int -> (Int,Int,Int)
+imul2 (I# x) (I# y) = case timesInt2# x y of
+   (# c, h, l #) -> (I# c, I# h, I# l)
+
+checkImul2 :: Int -> Int -> IO ()
+checkImul2 x y = do
+   -- First we compare against Integer result. Note that this test will become
+   -- moot when Integer implementation will use this primitive
+   let
+      w2 = fromIntegral x * (fromIntegral y :: Integer)
+      (c,h,l) = imul2 x y
+      w = case c of
+            0 -> fromIntegral l
+            _ -> int2ToInteger h l
+
+   unless (w == w2) do
+      putStrLn $ mconcat
+       [ "Failed: "
+       , show x
+       , " * "
+       , show y
+       , "\n    Got: "
+       , show w
+       , "\n    Expected: "
+       , show w2
+       ]
+
+   -- Now we compare with a generic version using unsigned multiply.
+   -- This reimplements the fallback generic version that the compiler uses when
+   -- the mach-op isn't available so it'd better be correct too.
+   let (c',h',l') = genericIMul2 x y
+
+   unless ((c,h,l) == (c',h',l')) do
+      putStrLn $ mconcat
+       [ "Failed: "
+       , show x
+       , " * "
+       , show y
+       , "\n    Got: "
+       , show (c,h,l)
+       , "\n    Expected: "
+       , show (c',h',l')
+       ]
+
+addWordC :: Word -> Word -> (Word,Word)
+addWordC (W# x) (W# y) = case addWordC# x y of
+   (# l,c #) -> (W# (int2Word# c), W# l)
+
+int2ToInteger :: Int -> Int -> Integer
+int2ToInteger h l
+  | h < 0     = case addWordC (complement (fromIntegral l)) 1 of
+                  (c,w) -> -1 * word2ToInteger (c + complement (fromIntegral h)) w
+  | otherwise = word2ToInteger (fromIntegral h) (fromIntegral l)
+  where
+   word2ToInteger :: Word -> Word -> Integer
+   word2ToInteger x y = (fromIntegral x) `shiftL` WORD_SIZE_IN_BITS + fromIntegral y
+
+timesWord2 :: Word -> Word -> (Int,Int)
+timesWord2 (W# x) (W# y) = case timesWord2# x y of
+   (# h, l #) -> (I# (word2Int# h), I# (word2Int# l))
+
+genericIMul2 :: Int -> Int -> (Int,Int,Int)
+genericIMul2 x y = (c,h,l)
+   where
+      (p,l) = timesWord2 (fromIntegral x) (fromIntegral y)
+      h = p - f x y - f y x
+      c = if h == carryFill l then 0 else 1
+      f u v = carryFill u .&. v
+
+      -- Return either 00..00 or FF..FF depending on the carry
+      carryFill :: Int -> Int
+      carryFill x = x `shiftR` (WORD_SIZE_IN_BITS - 1)
+
+
+main = do
+   checkImul2 10 10
+   checkImul2 10 (-10)
+   checkImul2 minBound (-1)
+   checkImul2 maxBound (-1)
+   checkImul2 minBound 0
+   checkImul2 maxBound 0
+   checkImul2 minBound minBound
+   checkImul2 minBound maxBound
+   checkImul2 maxBound maxBound