diff options
author | Herbert Valerio Riedel <hvr@gnu.org> | 2014-08-11 18:56:57 +0200 |
---|---|---|
committer | Herbert Valerio Riedel <hvr@gnu.org> | 2014-08-14 11:34:23 +0200 |
commit | e0c1767d0ea8d12e0a4badf43682a08784e379c6 (patch) | |
tree | 6662fe33cd7e803253458f91307b1b5826e30b0f /compiler | |
parent | 6b5ea617dcd162e682886d5843df51a2866218d3 (diff) | |
download | haskell-e0c1767d0ea8d12e0a4badf43682a08784e379c6.tar.gz |
Implement new CLZ and CTZ primops (re #9340)
This implements the new primops
clz#, clz32#, clz64#,
ctz#, ctz32#, ctz64#
which provide efficient implementations of the popular
count-leading-zero and count-trailing-zero respectively
(see testcase for a pure Haskell reference implementation).
On x86, NCG as well as LLVM generates code based on the BSF/BSR
instructions (which need extra logic to make the 0-case well-defined).
Test Plan: validate and succesful tests on i686 and amd64
Reviewers: rwbarton, simonmar, ezyang, austin
Subscribers: simonmar, relrod, ezyang, carter
Differential Revision: https://phabricator.haskell.org/D144
GHC Trac Issues: #9340
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/cmm/CmmMachOp.hs | 3 | ||||
-rw-r--r-- | compiler/cmm/PprC.hs | 2 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 28 | ||||
-rw-r--r-- | compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 9 | ||||
-rw-r--r-- | compiler/nativeGen/CPrim.hs | 20 | ||||
-rw-r--r-- | compiler/nativeGen/PPC/CodeGen.hs | 2 | ||||
-rw-r--r-- | compiler/nativeGen/SPARC/CodeGen.hs | 2 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 65 | ||||
-rw-r--r-- | compiler/prelude/primops.txt.pp | 22 |
9 files changed, 152 insertions, 1 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index d8ce492de1..a7b2c85175 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -549,6 +549,9 @@ data CallishMachOp | MO_Memmove | MO_PopCnt Width + | MO_Clz Width + | MO_Ctz Width + | MO_BSwap Width -- Atomic read-modify-write. diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index 455c79ba02..93a5d06259 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -753,6 +753,8 @@ pprCallishMachOp_for_C mop MO_Memmove -> ptext (sLit "memmove") (MO_BSwap w) -> ptext (sLit $ bSwapLabel w) (MO_PopCnt w) -> ptext (sLit $ popCntLabel w) + (MO_Clz w) -> ptext (sLit $ clzLabel w) + (MO_Ctz w) -> ptext (sLit $ ctzLabel w) (MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop) (MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w) (MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w) diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 77739fe0fb..9e12427355 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -563,6 +563,20 @@ emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32 emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64 emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags) +-- count leading zeros +emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8 +emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16 +emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32 +emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64 +emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags) + +-- count trailing zeros +emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8 +emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16 +emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32 +emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64 +emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags) + -- Unsigned int to floating point conversions emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res] (MO_UF_Conv W32) [w] @@ -2096,3 +2110,17 @@ emitPopCntCall res x width = do [ res ] (MO_PopCnt width) [ x ] + +emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitClzCall res x width = do + emitPrimCall + [ res ] + (MO_Clz width) + [ x ] + +emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode () +emitCtzCall res x width = do + emitPrimCall + [ res ] + (MO_Ctz width) + [ x ] diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index 4a56600937..2673eed6b8 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -224,9 +224,14 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args return (stmts, top1 ++ top2) | otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt) --- Handle PopCnt and BSwap that need to only convert arg and return types +-- Handle PopCnt, Clz, Ctz, and BSwap that need to only convert arg +-- and return types genCall t@(PrimTarget (MO_PopCnt w)) dsts args = genCallSimpleCast w t dsts args +genCall t@(PrimTarget (MO_Clz w)) dsts args = + genCallSimpleCast w t dsts args +genCall t@(PrimTarget (MO_Ctz w)) dsts args = + genCallSimpleCast w t dsts args genCall t@(PrimTarget (MO_BSwap w)) dsts args = genCallSimpleCast w t dsts args @@ -558,6 +563,8 @@ cmmPrimOpFunctions mop = do (MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w) (MO_BSwap w) -> fsLit $ "llvm.bswap." ++ showSDoc dflags (ppr $ widthToLlvmInt w) + (MO_Clz w) -> fsLit $ "llvm.ctlz." ++ showSDoc dflags (ppr $ widthToLlvmInt w) + (MO_Ctz w) -> fsLit $ "llvm.cttz." ++ showSDoc dflags (ppr $ widthToLlvmInt w) (MO_Prefetch_Data _ )-> fsLit "llvm.prefetch" diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs index 34782dfc1c..c52fe10b13 100644 --- a/compiler/nativeGen/CPrim.hs +++ b/compiler/nativeGen/CPrim.hs @@ -6,6 +6,8 @@ module CPrim , cmpxchgLabel , popCntLabel , bSwapLabel + , clzLabel + , ctzLabel , word2FloatLabel ) where @@ -30,6 +32,24 @@ bSwapLabel w = "hs_bswap" ++ pprWidth w pprWidth W64 = "64" pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w) +clzLabel :: Width -> String +clzLabel w = "hs_clz" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w) + +ctzLabel :: Width -> String +ctzLabel w = "hs_ctz" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w) + word2FloatLabel :: Width -> String word2FloatLabel w = "hs_word2float" ++ pprWidth w where diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs index 014117dd4c..3d3dff2e73 100644 --- a/compiler/nativeGen/PPC/CodeGen.hs +++ b/compiler/nativeGen/PPC/CodeGen.hs @@ -1151,6 +1151,8 @@ genCCall' dflags gcp target dest_regs args0 MO_BSwap w -> (fsLit $ bSwapLabel w, False) MO_PopCnt w -> (fsLit $ popCntLabel w, False) + MO_Clz w -> (fsLit $ clzLabel w, False) + MO_Ctz w -> (fsLit $ ctzLabel w, False) MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False) MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False) MO_AtomicRead w -> (fsLit $ atomicReadLabel w, False) diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs index 51f89d629f..c192b8bda6 100644 --- a/compiler/nativeGen/SPARC/CodeGen.hs +++ b/compiler/nativeGen/SPARC/CodeGen.hs @@ -654,6 +654,8 @@ outOfLineMachOp_table mop MO_BSwap w -> fsLit $ bSwapLabel w MO_PopCnt w -> fsLit $ popCntLabel w + MO_Clz w -> fsLit $ clzLabel w + MO_Ctz w -> fsLit $ ctzLabel w MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop MO_Cmpxchg w -> fsLit $ cmpxchgLabel w MO_AtomicRead w -> fsLit $ atomicReadLabel w diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index ce7120e24b..bc79e5e264 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1767,6 +1767,69 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] size = intSize width lbl = mkCmmCodeLabel primPackageKey (fsLit (popCntLabel width)) +genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] + | is32Bit && width == W64 = do + -- Fallback to `hs_clz64` on i386 + targetExpr <- cmmMakeDynamicReference dflags CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall dflags is32Bit target dest_regs args + + | otherwise = do + code_src <- getAnyReg src + src_r <- getNewRegNat size + tmp_r <- getNewRegNat size + let dst_r = getRegisterReg platform False (CmmLocal dst) + + -- The following insn sequence makes sure 'clz 0' has a defined value. + -- starting with Haswell, one could use the LZCNT insn instead. + return $ code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSR size (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r) + , CMOV NE size (OpReg tmp_r) dst_r + , XOR size (OpImm (ImmInt (bw-1))) (OpReg dst_r) + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + where + bw = widthInBits width + platform = targetPlatform dflags + size = if width == W8 then II16 else intSize width + lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width)) + +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src] + | is32Bit, width == W64 = do + -- Fallback to `hs_ctz64` on i386 + targetExpr <- cmmMakeDynamicReference dflags CallReference lbl + let target = ForeignTarget targetExpr (ForeignConvention CCallConv + [NoHint] [NoHint] + CmmMayReturn) + genCCall dflags is32Bit target dest_regs args + + | otherwise = do + code_src <- getAnyReg src + src_r <- getNewRegNat size + tmp_r <- getNewRegNat size + let dst_r = getRegisterReg platform False (CmmLocal dst) + + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. + return $ code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF size (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE size (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + where + bw = widthInBits width + platform = targetPlatform dflags + size = if width == W8 then II16 else intSize width + lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width)) + genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do targetExpr <- cmmMakeDynamicReference dflags CallReference lbl @@ -2403,6 +2466,8 @@ outOfLineCmmOp mop res args MO_PopCnt _ -> fsLit "popcnt" MO_BSwap _ -> fsLit "bswap" + MO_Clz w -> fsLit $ clzLabel w + MO_Ctz w -> fsLit $ ctzLabel w MO_AtomicRMW _ _ -> fsLit "atomicrmw" MO_AtomicRead _ -> fsLit "atomicread" diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp index 0c33233e88..6844f42a5c 100644 --- a/compiler/prelude/primops.txt.pp +++ b/compiler/prelude/primops.txt.pp @@ -386,6 +386,28 @@ primop PopCnt64Op "popCnt64#" GenPrimOp WORD64 -> Word# primop PopCntOp "popCnt#" Monadic Word# -> Word# {Count the number of set bits in a word.} +primop Clz8Op "clz8#" Monadic Word# -> Word# + {Count leading zeros in the lower 8 bits of a word.} +primop Clz16Op "clz16#" Monadic Word# -> Word# + {Count leading zeros in the lower 16 bits of a word.} +primop Clz32Op "clz32#" Monadic Word# -> Word# + {Count leading zeros in the lower 32 bits of a word.} +primop Clz64Op "clz64#" GenPrimOp WORD64 -> Word# + {Count leading zeros in a 64-bit word.} +primop ClzOp "clz#" Monadic Word# -> Word# + {Count leading zeros in a word.} + +primop Ctz8Op "ctz8#" Monadic Word# -> Word# + {Count trailing zeros in the lower 8 bits of a word.} +primop Ctz16Op "ctz16#" Monadic Word# -> Word# + {Count trailing zeros in the lower 16 bits of a word.} +primop Ctz32Op "ctz32#" Monadic Word# -> Word# + {Count trailing zeros in the lower 32 bits of a word.} +primop Ctz64Op "ctz64#" GenPrimOp WORD64 -> Word# + {Count trailing zeros in a 64-bit word.} +primop CtzOp "ctz#" Monadic Word# -> Word# + {Count trailing zeros in a word.} + primop BSwap16Op "byteSwap16#" Monadic Word# -> Word# {Swap bytes in the lower 16 bits of a word. The higher bytes are undefined. } primop BSwap32Op "byteSwap32#" Monadic Word# -> Word# |