Compile count{Leading,Trailing}Zeros to corresponding x86_64 instructions under -mbmi2

This works similarly to existing implementation for popCount. Trac ticket: #16086.
author: Dmitry Ivanov <ethercrow@gmail.com> 2018-12-08 16:45:02 +0100
committer: Ben Gamari <ben@smart-cactus.org> 2019-01-30 10:06:31 -0500
commit: c1d9416f2672b8d844141c0393fe773676749777 (patch)
tree: 30d388f8070e3316e40397293ce4150c24351536
parent: cc2261d42f6a954d88e355aaad41f001f65c95da (diff)
download: haskell-c1d9416f2672b8d844141c0393fe773676749777.tar.gz
7 files changed, 107 insertions, 28 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 37080b990e..9591c42ede 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -2045,25 +2045,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b
 
   | otherwise = do
     code_src <- getAnyReg src
-    src_r <- getNewRegNat format
-    tmp_r <- getNewRegNat format
     let dst_r = getRegisterReg platform False (CmmLocal dst)
-
-    -- The following insn sequence makes sure 'clz 0' has a defined value.
-    -- starting with Haswell, one could use the LZCNT insn instead.
-    return $ code_src src_r `appOL` toOL
-             ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-              [ BSR     format (OpReg src_r) tmp_r
-              , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
-              , CMOV NE format (OpReg tmp_r) dst_r
-              , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
-              ]) -- NB: We don't need to zero-extend the result for the
-                 -- W8/W16 cases because the 'MOV' insn already
-                 -- took care of implicitly clearing the upper bits
+    if isBmi2Enabled dflags
+        then do
+            src_r <- getNewRegNat (intFormat width)
+            return $ appOL (code_src src_r) $ case width of
+                W8 -> toOL
+                    [ MOVZxL II8  (OpReg src_r)       (OpReg src_r) -- zero-extend to 32 bit
+                    , LZCNT  II32 (OpReg src_r)       dst_r         -- lzcnt with extra 24 zeros
+                    , SUB    II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
+                    ]
+                W16 -> toOL
+                    [ LZCNT  II16 (OpReg src_r) dst_r
+                    , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
+                    ]
+                _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
+        else do
+            let format = if width == W8 then II16 else intFormat width
+            src_r <- getNewRegNat format
+            tmp_r <- getNewRegNat format
+            return $ code_src src_r `appOL` toOL
+                     ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                      [ BSR     format (OpReg src_r) tmp_r
+                      , MOV     II32   (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+                      , CMOV NE format (OpReg tmp_r) dst_r
+                      , XOR     format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+                      ]) -- NB: We don't need to zero-extend the result for the
+                         -- W8/W16 cases because the 'MOV' insn already
+                         -- took care of implicitly clearing the upper bits
   where
     bw = widthInBits width
     platform = targetPlatform dflags
-    format = if width == W8 then II16 else intFormat width
     lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
 
 genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
@@ -2073,6 +2085,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
           dst_r   = getRegisterReg platform False (CmmLocal dst)
       lbl1 <- getBlockIdNat
       lbl2 <- getBlockIdNat
+      let format = if width == W8 then II16 else intFormat width
       tmp_r <- getNewRegNat format
 
       -- New CFG Edges:
@@ -2109,24 +2122,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
 
   | otherwise = do
     code_src <- getAnyReg src
-    src_r <- getNewRegNat format
-    tmp_r <- getNewRegNat format
     let dst_r = getRegisterReg platform False (CmmLocal dst)
 
-    -- The following insn sequence makes sure 'ctz 0' has a defined value.
-    -- starting with Haswell, one could use the TZCNT insn instead.
-    return $ code_src src_r `appOL` toOL
-             ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
-              [ BSF     format (OpReg src_r) tmp_r
-              , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
-              , CMOV NE format (OpReg tmp_r) dst_r
-              ]) -- NB: We don't need to zero-extend the result for the
-                 -- W8/W16 cases because the 'MOV' insn already
-                 -- took care of implicitly clearing the upper bits
+    if isBmi2Enabled dflags
+    then do
+        src_r <- getNewRegNat (intFormat width)
+        return $ appOL (code_src src_r) $ case width of
+            W8 -> toOL
+                [ OR    II32 (OpImm (ImmInt 0xFFFFFF00)) (OpReg src_r)
+                , TZCNT II32 (OpReg src_r)        dst_r
+                ]
+            W16 -> toOL
+                [ TZCNT  II16 (OpReg src_r) dst_r
+                , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
+                ]
+            _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
+    else do
+        -- The following insn sequence makes sure 'ctz 0' has a defined value.
+        -- starting with Haswell, one could use the TZCNT insn instead.
+        let format = if width == W8 then II16 else intFormat width
+        src_r <- getNewRegNat format
+        tmp_r <- getNewRegNat format
+        return $ code_src src_r `appOL` toOL
+                 ([ MOVZxL  II8    (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+                  [ BSF     format (OpReg src_r) tmp_r
+                  , MOV     II32   (OpImm (ImmInt bw)) (OpReg dst_r)
+                  , CMOV NE format (OpReg tmp_r) dst_r
+                  ]) -- NB: We don't need to zero-extend the result for the
+                     -- W8/W16 cases because the 'MOV' insn already
+                     -- took care of implicitly clearing the upper bits
   where
     bw = widthInBits width
     platform = targetPlatform dflags
-    format = if width == W8 then II16 else intFormat width
 
 genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
     targetExpr <- cmmMakeDynamicReference dflags
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index c47e1fae83..5e790e481e 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -342,6 +342,8 @@ data Instr
 
     -- bit counting instructions
         | POPCNT      Format Operand Reg -- [SSE4.2] count number of bits set to 1
+        | LZCNT       Format Operand Reg -- [BMI2] count number of leading zeros
+        | TZCNT       Format Operand Reg -- [BMI2] count number of trailing zeros
         | BSF         Format Operand Reg -- bit scan forward
         | BSR         Format Operand Reg -- bit scan reverse
 
@@ -471,6 +473,8 @@ x86_regUsageOfInstr platform instr
     DELTA   _           -> noUsage
 
     POPCNT _ src dst -> mkRU (use_R src []) [dst]
+    LZCNT  _ src dst -> mkRU (use_R src []) [dst]
+    TZCNT  _ src dst -> mkRU (use_R src []) [dst]
     BSF    _ src dst -> mkRU (use_R src []) [dst]
     BSR    _ src dst -> mkRU (use_R src []) [dst]
 
@@ -653,6 +657,8 @@ x86_patchRegsOfInstr instr env
     CLTD _              -> instr
 
     POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+    LZCNT  fmt src dst -> LZCNT  fmt (patchOp src) (env dst)
+    TZCNT  fmt src dst -> TZCNT  fmt (patchOp src) (env dst)
     PDEP   fmt src mask dst -> PDEP   fmt (patchOp src) (patchOp mask) (env dst)
     PEXT   fmt src mask dst -> PEXT   fmt (patchOp src) (patchOp mask) (env dst)
     BSF    fmt src dst -> BSF    fmt (patchOp src) (env dst)
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index bf449d044e..075bb26337 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -693,6 +693,8 @@ pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst
 pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor")  format src dst
 
 pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
+pprInstr (LZCNT format src dst)  = pprOpOp (sLit "lzcnt")  format src (OpReg dst)
+pprInstr (TZCNT format src dst)  = pprOpOp (sLit "tzcnt")  format src (OpReg dst)
 pprInstr (BSF format src dst)    = pprOpOp (sLit "bsf")    format src (OpReg dst)
 pprInstr (BSR format src dst)    = pprOpOp (sLit "bsr")    format src (OpReg dst)
 
diff --git a/docs/users_guide/using.rst b/docs/users_guide/using.rst
index 420e2d3bfc..83eeb51eed 100644
--- a/docs/users_guide/using.rst
+++ b/docs/users_guide/using.rst
@@ -1088,6 +1088,16 @@ Some flags only make sense for particular target platforms.
     SSE4.2 if your processor supports it but detects this automatically
     so no flag is required.
 
+.. ghc-flag:: -mbmi2
+    :shortdesc: (x86 only) Use BMI2 for bit manipulation operations
+    :type: dynamic
+    :category: platform-options
+
+    (x86 only, added in GHC 7.4.1) Use the BMI2 instruction set to
+    implement some bit operations when using the
+    :ref:`native code generator <native-code-gen>`. The resulting compiled
+    code will only run on processors that support BMI2 (Intel Haswell and newer, AMD Excavator, Zen and newer).
+
 Miscellaneous flags
 -------------------
 
diff --git a/libraries/base/Data/Bits.hs b/libraries/base/Data/Bits.hs
index 000e663b83..69446f9adc 100644
--- a/libraries/base/Data/Bits.hs
+++ b/libraries/base/Data/Bits.hs
@@ -438,6 +438,9 @@ instance Bits Int where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    -- We want popCnt# to be inlined in user code so that `ghc -msse4.2`
+    -- can compile it down to a popcnt instruction without an extra function call
+    {-# INLINE popCount #-}
 
     zeroBits = 0
 
@@ -478,13 +481,16 @@ instance Bits Int where
 instance FiniteBits Int where
     finiteBitSize _ = WORD_SIZE_IN_BITS
     countLeadingZeros  (I# x#) = I# (word2Int# (clz# (int2Word# x#)))
+    {-# INLINE countLeadingZeros #-}
     countTrailingZeros (I# x#) = I# (word2Int# (ctz# (int2Word# x#)))
+    {-# INLINE countTrailingZeros #-}
 
 -- | @since 2.01
 instance Bits Word where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W# x#) .&.   (W# y#)    = W# (x# `and#` y#)
     (W# x#) .|.   (W# y#)    = W# (x# `or#`  y#)
@@ -519,7 +525,9 @@ instance Bits Word where
 instance FiniteBits Word where
     finiteBitSize _ = WORD_SIZE_IN_BITS
     countLeadingZeros  (W# x#) = I# (word2Int# (clz# x#))
+    {-# INLINE countLeadingZeros #-}
     countTrailingZeros (W# x#) = I# (word2Int# (ctz# x#))
+    {-# INLINE countTrailingZeros #-}
 
 -- | @since 2.01
 instance Bits Integer where
diff --git a/libraries/base/GHC/Int.hs b/libraries/base/GHC/Int.hs
index 2c5ca9d5a8..d87d352cb7 100644
--- a/libraries/base/GHC/Int.hs
+++ b/libraries/base/GHC/Int.hs
@@ -177,6 +177,7 @@ instance Bits Int8 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (I8# x#) .&.   (I8# y#)   = I8# (word2Int# (int2Word# x# `and#` int2Word# y#))
     (I8# x#) .|.   (I8# y#)   = I8# (word2Int# (int2Word# x# `or#`  int2Word# y#))
@@ -211,6 +212,8 @@ instance Bits Int8 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Int8 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 8
     countLeadingZeros  (I8# x#) = I# (word2Int# (clz8# (int2Word# x#)))
     countTrailingZeros (I8# x#) = I# (word2Int# (ctz8# (int2Word# x#)))
@@ -381,6 +384,7 @@ instance Bits Int16 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (I16# x#) .&.   (I16# y#)  = I16# (word2Int# (int2Word# x# `and#` int2Word# y#))
     (I16# x#) .|.   (I16# y#)  = I16# (word2Int# (int2Word# x# `or#`  int2Word# y#))
@@ -415,6 +419,8 @@ instance Bits Int16 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Int16 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 16
     countLeadingZeros  (I16# x#) = I# (word2Int# (clz16# (int2Word# x#)))
     countTrailingZeros (I16# x#) = I# (word2Int# (ctz16# (int2Word# x#)))
@@ -587,6 +593,7 @@ instance Bits Int32 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (I32# x#) .&.   (I32# y#)  = I32# (word2Int# (int2Word# x# `and#` int2Word# y#))
     (I32# x#) .|.   (I32# y#)  = I32# (word2Int# (int2Word# x# `or#`  int2Word# y#))
@@ -622,6 +629,8 @@ instance Bits Int32 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Int32 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 32
     countLeadingZeros  (I32# x#) = I# (word2Int# (clz32# (int2Word# x#)))
     countTrailingZeros (I32# x#) = I# (word2Int# (ctz32# (int2Word# x#)))
@@ -825,6 +834,7 @@ instance Bits Int64 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (I64# x#) .&.   (I64# y#)  = I64# (word64ToInt64# (int64ToWord64# x# `and64#` int64ToWord64# y#))
     (I64# x#) .|.   (I64# y#)  = I64# (word64ToInt64# (int64ToWord64# x# `or64#`  int64ToWord64# y#))
@@ -1002,6 +1012,7 @@ instance Bits Int64 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (I64# x#) .&.   (I64# y#)  = I64# (word2Int# (int2Word# x# `and#` int2Word# y#))
     (I64# x#) .|.   (I64# y#)  = I64# (word2Int# (int2Word# x# `or#`  int2Word# y#))
@@ -1078,6 +1089,8 @@ uncheckedIShiftRA64# = uncheckedIShiftRA#
 
 -- | @since 4.6.0.0
 instance FiniteBits Int64 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 64
 #if WORD_SIZE_IN_BITS < 64
     countLeadingZeros  (I64# x#) = I# (word2Int# (clz64# (int64ToWord64# x#)))
diff --git a/libraries/base/GHC/Word.hs b/libraries/base/GHC/Word.hs
index d19a31dfb2..e714392e9c 100644
--- a/libraries/base/GHC/Word.hs
+++ b/libraries/base/GHC/Word.hs
@@ -168,6 +168,7 @@ instance Bits Word8 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W8# x#) .&.   (W8# y#)   = W8# (x# `and#` y#)
     (W8# x#) .|.   (W8# y#)   = W8# (x# `or#`  y#)
@@ -201,6 +202,8 @@ instance Bits Word8 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Word8 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 8
     countLeadingZeros  (W8# x#) = I# (word2Int# (clz8# x#))
     countTrailingZeros (W8# x#) = I# (word2Int# (ctz8# x#))
@@ -356,6 +359,7 @@ instance Bits Word16 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W16# x#) .&.   (W16# y#)  = W16# (x# `and#` y#)
     (W16# x#) .|.   (W16# y#)  = W16# (x# `or#`  y#)
@@ -389,6 +393,8 @@ instance Bits Word16 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Word16 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 16
     countLeadingZeros  (W16# x#) = I# (word2Int# (clz16# x#))
     countTrailingZeros (W16# x#) = I# (word2Int# (ctz16# x#))
@@ -590,6 +596,7 @@ instance Bits Word32 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W32# x#) .&.   (W32# y#)  = W32# (x# `and#` y#)
     (W32# x#) .|.   (W32# y#)  = W32# (x# `or#`  y#)
@@ -623,6 +630,8 @@ instance Bits Word32 where
 
 -- | @since 4.6.0.0
 instance FiniteBits Word32 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 32
     countLeadingZeros  (W32# x#) = I# (word2Int# (clz32# x#))
     countTrailingZeros (W32# x#) = I# (word2Int# (ctz32# x#))
@@ -762,6 +771,7 @@ instance Bits Word64 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W64# x#) .&.   (W64# y#)  = W64# (x# `and64#` y#)
     (W64# x#) .|.   (W64# y#)  = W64# (x# `or64#`  y#)
@@ -914,6 +924,7 @@ instance Bits Word64 where
     {-# INLINE shift #-}
     {-# INLINE bit #-}
     {-# INLINE testBit #-}
+    {-# INLINE popCount #-}
 
     (W64# x#) .&.   (W64# y#)  = W64# (x# `and#` y#)
     (W64# x#) .|.   (W64# y#)  = W64# (x# `or#`  y#)
@@ -959,6 +970,8 @@ uncheckedShiftRL64# = uncheckedShiftRL#
 
 -- | @since 4.6.0.0
 instance FiniteBits Word64 where
+    {-# INLINE countLeadingZeros #-}
+    {-# INLINE countTrailingZeros #-}
     finiteBitSize _ = 64
     countLeadingZeros  (W64# x#) = I# (word2Int# (clz64# x#))
     countTrailingZeros (W64# x#) = I# (word2Int# (ctz64# x#))
author	Dmitry Ivanov <ethercrow@gmail.com>	2018-12-08 16:45:02 +0100
committer	Ben Gamari <ben@smart-cactus.org>	2019-01-30 10:06:31 -0500
commit	c1d9416f2672b8d844141c0393fe773676749777 (patch)
tree	30d388f8070e3316e40397293ce4150c24351536
parent	cc2261d42f6a954d88e355aaad41f001f65c95da (diff)
download	haskell-c1d9416f2672b8d844141c0393fe773676749777.tar.gz