diff options
author | Dmitry Ivanov <ethercrow@gmail.com> | 2018-12-08 16:45:02 +0100 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2019-01-30 10:06:31 -0500 |
commit | c1d9416f2672b8d844141c0393fe773676749777 (patch) | |
tree | 30d388f8070e3316e40397293ce4150c24351536 | |
parent | cc2261d42f6a954d88e355aaad41f001f65c95da (diff) | |
download | haskell-c1d9416f2672b8d844141c0393fe773676749777.tar.gz |
Compile count{Leading,Trailing}Zeros to corresponding x86_64 instructions under -mbmi2
This works similarly to existing implementation for popCount.
Trac ticket: #16086.
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 83 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs | 6 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Ppr.hs | 2 | ||||
-rw-r--r-- | docs/users_guide/using.rst | 10 | ||||
-rw-r--r-- | libraries/base/Data/Bits.hs | 8 | ||||
-rw-r--r-- | libraries/base/GHC/Int.hs | 13 | ||||
-rw-r--r-- | libraries/base/GHC/Word.hs | 13 |
7 files changed, 107 insertions, 28 deletions
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 37080b990e..9591c42ede 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -2045,25 +2045,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b | otherwise = do code_src <- getAnyReg src - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format let dst_r = getRegisterReg platform False (CmmLocal dst) - - -- The following insn sequence makes sure 'clz 0' has a defined value. - -- starting with Haswell, one could use the LZCNT insn instead. - return $ code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSR format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r) - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits + if isBmi2Enabled dflags + then do + src_r <- getNewRegNat (intFormat width) + return $ appOL (code_src src_r) $ case width of + W8 -> toOL + [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit + , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros + , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros + ] + W16 -> toOL + [ LZCNT II16 (OpReg src_r) dst_r + , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit + ] + _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r) + else do + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + return $ code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSR format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r) + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits where bw = widthInBits width platform = targetPlatform dflags - format = if width == W8 then II16 else intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width)) genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid @@ -2073,6 +2085,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid dst_r = getRegisterReg platform False (CmmLocal dst) lbl1 <- getBlockIdNat lbl2 <- getBlockIdNat + let format = if width == W8 then II16 else intFormat width tmp_r <- getNewRegNat format -- New CFG Edges: @@ -2109,24 +2122,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid | otherwise = do code_src <- getAnyReg src - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format let dst_r = getRegisterReg platform False (CmmLocal dst) - -- The following insn sequence makes sure 'ctz 0' has a defined value. - -- starting with Haswell, one could use the TZCNT insn instead. - return $ code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSF format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits + if isBmi2Enabled dflags + then do + src_r <- getNewRegNat (intFormat width) + return $ appOL (code_src src_r) $ case width of + W8 -> toOL + [ OR II32 (OpImm (ImmInt 0xFFFFFF00)) (OpReg src_r) + , TZCNT II32 (OpReg src_r) dst_r + ] + W16 -> toOL + [ TZCNT II16 (OpReg src_r) dst_r + , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) + ] + _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r + else do + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + return $ code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits where bw = widthInBits width platform = targetPlatform dflags - format = if width == W8 then II16 else intFormat width genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do targetExpr <- cmmMakeDynamicReference dflags diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index c47e1fae83..5e790e481e 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -342,6 +342,8 @@ data Instr -- bit counting instructions | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1 + | LZCNT Format Operand Reg -- [BMI2] count number of leading zeros + | TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros | BSF Format Operand Reg -- bit scan forward | BSR Format Operand Reg -- bit scan reverse @@ -471,6 +473,8 @@ x86_regUsageOfInstr platform instr DELTA _ -> noUsage POPCNT _ src dst -> mkRU (use_R src []) [dst] + LZCNT _ src dst -> mkRU (use_R src []) [dst] + TZCNT _ src dst -> mkRU (use_R src []) [dst] BSF _ src dst -> mkRU (use_R src []) [dst] BSR _ src dst -> mkRU (use_R src []) [dst] @@ -653,6 +657,8 @@ x86_patchRegsOfInstr instr env CLTD _ -> instr POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst) + LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst) + TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst) PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst) PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst) BSF fmt src dst -> BSF fmt (patchOp src) (env dst) diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index bf449d044e..075bb26337 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -693,6 +693,8 @@ pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst) +pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst) +pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst) pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst) pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst) diff --git a/docs/users_guide/using.rst b/docs/users_guide/using.rst index 420e2d3bfc..83eeb51eed 100644 --- a/docs/users_guide/using.rst +++ b/docs/users_guide/using.rst @@ -1088,6 +1088,16 @@ Some flags only make sense for particular target platforms. SSE4.2 if your processor supports it but detects this automatically so no flag is required. +.. ghc-flag:: -mbmi2 + :shortdesc: (x86 only) Use BMI2 for bit manipulation operations + :type: dynamic + :category: platform-options + + (x86 only, added in GHC 7.4.1) Use the BMI2 instruction set to + implement some bit operations when using the + :ref:`native code generator <native-code-gen>`. The resulting compiled + code will only run on processors that support BMI2 (Intel Haswell and newer, AMD Excavator, Zen and newer). + Miscellaneous flags ------------------- diff --git a/libraries/base/Data/Bits.hs b/libraries/base/Data/Bits.hs index 000e663b83..69446f9adc 100644 --- a/libraries/base/Data/Bits.hs +++ b/libraries/base/Data/Bits.hs @@ -438,6 +438,9 @@ instance Bits Int where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + -- We want popCnt# to be inlined in user code so that `ghc -msse4.2` + -- can compile it down to a popcnt instruction without an extra function call + {-# INLINE popCount #-} zeroBits = 0 @@ -478,13 +481,16 @@ instance Bits Int where instance FiniteBits Int where finiteBitSize _ = WORD_SIZE_IN_BITS countLeadingZeros (I# x#) = I# (word2Int# (clz# (int2Word# x#))) + {-# INLINE countLeadingZeros #-} countTrailingZeros (I# x#) = I# (word2Int# (ctz# (int2Word# x#))) + {-# INLINE countTrailingZeros #-} -- | @since 2.01 instance Bits Word where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W# x#) .&. (W# y#) = W# (x# `and#` y#) (W# x#) .|. (W# y#) = W# (x# `or#` y#) @@ -519,7 +525,9 @@ instance Bits Word where instance FiniteBits Word where finiteBitSize _ = WORD_SIZE_IN_BITS countLeadingZeros (W# x#) = I# (word2Int# (clz# x#)) + {-# INLINE countLeadingZeros #-} countTrailingZeros (W# x#) = I# (word2Int# (ctz# x#)) + {-# INLINE countTrailingZeros #-} -- | @since 2.01 instance Bits Integer where diff --git a/libraries/base/GHC/Int.hs b/libraries/base/GHC/Int.hs index 2c5ca9d5a8..d87d352cb7 100644 --- a/libraries/base/GHC/Int.hs +++ b/libraries/base/GHC/Int.hs @@ -177,6 +177,7 @@ instance Bits Int8 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (I8# x#) .&. (I8# y#) = I8# (word2Int# (int2Word# x# `and#` int2Word# y#)) (I8# x#) .|. (I8# y#) = I8# (word2Int# (int2Word# x# `or#` int2Word# y#)) @@ -211,6 +212,8 @@ instance Bits Int8 where -- | @since 4.6.0.0 instance FiniteBits Int8 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 8 countLeadingZeros (I8# x#) = I# (word2Int# (clz8# (int2Word# x#))) countTrailingZeros (I8# x#) = I# (word2Int# (ctz8# (int2Word# x#))) @@ -381,6 +384,7 @@ instance Bits Int16 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (I16# x#) .&. (I16# y#) = I16# (word2Int# (int2Word# x# `and#` int2Word# y#)) (I16# x#) .|. (I16# y#) = I16# (word2Int# (int2Word# x# `or#` int2Word# y#)) @@ -415,6 +419,8 @@ instance Bits Int16 where -- | @since 4.6.0.0 instance FiniteBits Int16 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 16 countLeadingZeros (I16# x#) = I# (word2Int# (clz16# (int2Word# x#))) countTrailingZeros (I16# x#) = I# (word2Int# (ctz16# (int2Word# x#))) @@ -587,6 +593,7 @@ instance Bits Int32 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (I32# x#) .&. (I32# y#) = I32# (word2Int# (int2Word# x# `and#` int2Word# y#)) (I32# x#) .|. (I32# y#) = I32# (word2Int# (int2Word# x# `or#` int2Word# y#)) @@ -622,6 +629,8 @@ instance Bits Int32 where -- | @since 4.6.0.0 instance FiniteBits Int32 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 32 countLeadingZeros (I32# x#) = I# (word2Int# (clz32# (int2Word# x#))) countTrailingZeros (I32# x#) = I# (word2Int# (ctz32# (int2Word# x#))) @@ -825,6 +834,7 @@ instance Bits Int64 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (I64# x#) .&. (I64# y#) = I64# (word64ToInt64# (int64ToWord64# x# `and64#` int64ToWord64# y#)) (I64# x#) .|. (I64# y#) = I64# (word64ToInt64# (int64ToWord64# x# `or64#` int64ToWord64# y#)) @@ -1002,6 +1012,7 @@ instance Bits Int64 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (I64# x#) .&. (I64# y#) = I64# (word2Int# (int2Word# x# `and#` int2Word# y#)) (I64# x#) .|. (I64# y#) = I64# (word2Int# (int2Word# x# `or#` int2Word# y#)) @@ -1078,6 +1089,8 @@ uncheckedIShiftRA64# = uncheckedIShiftRA# -- | @since 4.6.0.0 instance FiniteBits Int64 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 64 #if WORD_SIZE_IN_BITS < 64 countLeadingZeros (I64# x#) = I# (word2Int# (clz64# (int64ToWord64# x#))) diff --git a/libraries/base/GHC/Word.hs b/libraries/base/GHC/Word.hs index d19a31dfb2..e714392e9c 100644 --- a/libraries/base/GHC/Word.hs +++ b/libraries/base/GHC/Word.hs @@ -168,6 +168,7 @@ instance Bits Word8 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W8# x#) .&. (W8# y#) = W8# (x# `and#` y#) (W8# x#) .|. (W8# y#) = W8# (x# `or#` y#) @@ -201,6 +202,8 @@ instance Bits Word8 where -- | @since 4.6.0.0 instance FiniteBits Word8 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 8 countLeadingZeros (W8# x#) = I# (word2Int# (clz8# x#)) countTrailingZeros (W8# x#) = I# (word2Int# (ctz8# x#)) @@ -356,6 +359,7 @@ instance Bits Word16 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W16# x#) .&. (W16# y#) = W16# (x# `and#` y#) (W16# x#) .|. (W16# y#) = W16# (x# `or#` y#) @@ -389,6 +393,8 @@ instance Bits Word16 where -- | @since 4.6.0.0 instance FiniteBits Word16 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 16 countLeadingZeros (W16# x#) = I# (word2Int# (clz16# x#)) countTrailingZeros (W16# x#) = I# (word2Int# (ctz16# x#)) @@ -590,6 +596,7 @@ instance Bits Word32 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W32# x#) .&. (W32# y#) = W32# (x# `and#` y#) (W32# x#) .|. (W32# y#) = W32# (x# `or#` y#) @@ -623,6 +630,8 @@ instance Bits Word32 where -- | @since 4.6.0.0 instance FiniteBits Word32 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 32 countLeadingZeros (W32# x#) = I# (word2Int# (clz32# x#)) countTrailingZeros (W32# x#) = I# (word2Int# (ctz32# x#)) @@ -762,6 +771,7 @@ instance Bits Word64 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W64# x#) .&. (W64# y#) = W64# (x# `and64#` y#) (W64# x#) .|. (W64# y#) = W64# (x# `or64#` y#) @@ -914,6 +924,7 @@ instance Bits Word64 where {-# INLINE shift #-} {-# INLINE bit #-} {-# INLINE testBit #-} + {-# INLINE popCount #-} (W64# x#) .&. (W64# y#) = W64# (x# `and#` y#) (W64# x#) .|. (W64# y#) = W64# (x# `or#` y#) @@ -959,6 +970,8 @@ uncheckedShiftRL64# = uncheckedShiftRL# -- | @since 4.6.0.0 instance FiniteBits Word64 where + {-# INLINE countLeadingZeros #-} + {-# INLINE countTrailingZeros #-} finiteBitSize _ = 64 countLeadingZeros (W64# x#) = I# (word2Int# (clz64# x#)) countTrailingZeros (W64# x#) = I# (word2Int# (ctz64# x#)) |