summaryrefslogtreecommitdiff
path: root/compiler
diff options
context:
space:
mode:
authorHerbert Valerio Riedel <hvr@gnu.org>2014-08-11 18:56:57 +0200
committerHerbert Valerio Riedel <hvr@gnu.org>2014-08-14 11:34:23 +0200
commite0c1767d0ea8d12e0a4badf43682a08784e379c6 (patch)
tree6662fe33cd7e803253458f91307b1b5826e30b0f /compiler
parent6b5ea617dcd162e682886d5843df51a2866218d3 (diff)
downloadhaskell-e0c1767d0ea8d12e0a4badf43682a08784e379c6.tar.gz
Implement new CLZ and CTZ primops (re #9340)
This implements the new primops clz#, clz32#, clz64#, ctz#, ctz32#, ctz64# which provide efficient implementations of the popular count-leading-zero and count-trailing-zero respectively (see testcase for a pure Haskell reference implementation). On x86, NCG as well as LLVM generates code based on the BSF/BSR instructions (which need extra logic to make the 0-case well-defined). Test Plan: validate and succesful tests on i686 and amd64 Reviewers: rwbarton, simonmar, ezyang, austin Subscribers: simonmar, relrod, ezyang, carter Differential Revision: https://phabricator.haskell.org/D144 GHC Trac Issues: #9340
Diffstat (limited to 'compiler')
-rw-r--r--compiler/cmm/CmmMachOp.hs3
-rw-r--r--compiler/cmm/PprC.hs2
-rw-r--r--compiler/codeGen/StgCmmPrim.hs28
-rw-r--r--compiler/llvmGen/LlvmCodeGen/CodeGen.hs9
-rw-r--r--compiler/nativeGen/CPrim.hs20
-rw-r--r--compiler/nativeGen/PPC/CodeGen.hs2
-rw-r--r--compiler/nativeGen/SPARC/CodeGen.hs2
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs65
-rw-r--r--compiler/prelude/primops.txt.pp22
9 files changed, 152 insertions, 1 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index d8ce492de1..a7b2c85175 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -549,6 +549,9 @@ data CallishMachOp
| MO_Memmove
| MO_PopCnt Width
+ | MO_Clz Width
+ | MO_Ctz Width
+
| MO_BSwap Width
-- Atomic read-modify-write.
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 455c79ba02..93a5d06259 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -753,6 +753,8 @@ pprCallishMachOp_for_C mop
MO_Memmove -> ptext (sLit "memmove")
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
+ (MO_Clz w) -> ptext (sLit $ clzLabel w)
+ (MO_Ctz w) -> ptext (sLit $ ctzLabel w)
(MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
(MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w)
(MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w)
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 77739fe0fb..9e12427355 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -563,6 +563,20 @@ emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32
emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64
emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags)
+-- count leading zeros
+emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8
+emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16
+emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32
+emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64
+emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags)
+
+-- count trailing zeros
+emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8
+emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16
+emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32
+emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64
+emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags)
+
-- Unsigned int to floating point conversions
emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res]
(MO_UF_Conv W32) [w]
@@ -2096,3 +2110,17 @@ emitPopCntCall res x width = do
[ res ]
(MO_PopCnt width)
[ x ]
+
+emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
+emitClzCall res x width = do
+ emitPrimCall
+ [ res ]
+ (MO_Clz width)
+ [ x ]
+
+emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
+emitCtzCall res x width = do
+ emitPrimCall
+ [ res ]
+ (MO_Ctz width)
+ [ x ]
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 4a56600937..2673eed6b8 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -224,9 +224,14 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
return (stmts, top1 ++ top2)
| otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt)
--- Handle PopCnt and BSwap that need to only convert arg and return types
+-- Handle PopCnt, Clz, Ctz, and BSwap that need to only convert arg
+-- and return types
genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCallSimpleCast w t dsts args
+genCall t@(PrimTarget (MO_Clz w)) dsts args =
+ genCallSimpleCast w t dsts args
+genCall t@(PrimTarget (MO_Ctz w)) dsts args =
+ genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_BSwap w)) dsts args =
genCallSimpleCast w t dsts args
@@ -558,6 +563,8 @@ cmmPrimOpFunctions mop = do
(MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_BSwap w) -> fsLit $ "llvm.bswap." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
+ (MO_Clz w) -> fsLit $ "llvm.ctlz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
+ (MO_Ctz w) -> fsLit $ "llvm.cttz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs
index 34782dfc1c..c52fe10b13 100644
--- a/compiler/nativeGen/CPrim.hs
+++ b/compiler/nativeGen/CPrim.hs
@@ -6,6 +6,8 @@ module CPrim
, cmpxchgLabel
, popCntLabel
, bSwapLabel
+ , clzLabel
+ , ctzLabel
, word2FloatLabel
) where
@@ -30,6 +32,24 @@ bSwapLabel w = "hs_bswap" ++ pprWidth w
pprWidth W64 = "64"
pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w)
+clzLabel :: Width -> String
+clzLabel w = "hs_clz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w)
+
+ctzLabel :: Width -> String
+ctzLabel w = "hs_ctz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w)
+
word2FloatLabel :: Width -> String
word2FloatLabel w = "hs_word2float" ++ pprWidth w
where
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 014117dd4c..3d3dff2e73 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1151,6 +1151,8 @@ genCCall' dflags gcp target dest_regs args0
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
+ MO_Clz w -> (fsLit $ clzLabel w, False)
+ MO_Ctz w -> (fsLit $ ctzLabel w, False)
MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False)
MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
MO_AtomicRead w -> (fsLit $ atomicReadLabel w, False)
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index 51f89d629f..c192b8bda6 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -654,6 +654,8 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
MO_AtomicRead w -> fsLit $ atomicReadLabel w
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index ce7120e24b..bc79e5e264 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1767,6 +1767,69 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
size = intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (popCntLabel width))
+genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
+ | is32Bit && width == W64 = do
+ -- Fallback to `hs_clz64` on i386
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat size
+ tmp_r <- getNewRegNat size
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+
+ -- The following insn sequence makes sure 'clz 0' has a defined value.
+ -- starting with Haswell, one could use the LZCNT insn instead.
+ return $ code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSR size (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+ , CMOV NE size (OpReg tmp_r) dst_r
+ , XOR size (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+ size = if width == W8 then II16 else intSize width
+ lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width))
+
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
+ | is32Bit, width == W64 = do
+ -- Fallback to `hs_ctz64` on i386
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat size
+ tmp_r <- getNewRegNat size
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+
+ -- The following insn sequence makes sure 'ctz 0' has a defined value.
+ -- starting with Haswell, one could use the TZCNT insn instead.
+ return $ code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF size (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
+ , CMOV NE size (OpReg tmp_r) dst_r
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+ size = if width == W8 then II16 else intSize width
+ lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))
+
genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
@@ -2403,6 +2466,8 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 0c33233e88..6844f42a5c 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -386,6 +386,28 @@ primop PopCnt64Op "popCnt64#" GenPrimOp WORD64 -> Word#
primop PopCntOp "popCnt#" Monadic Word# -> Word#
{Count the number of set bits in a word.}
+primop Clz8Op "clz8#" Monadic Word# -> Word#
+ {Count leading zeros in the lower 8 bits of a word.}
+primop Clz16Op "clz16#" Monadic Word# -> Word#
+ {Count leading zeros in the lower 16 bits of a word.}
+primop Clz32Op "clz32#" Monadic Word# -> Word#
+ {Count leading zeros in the lower 32 bits of a word.}
+primop Clz64Op "clz64#" GenPrimOp WORD64 -> Word#
+ {Count leading zeros in a 64-bit word.}
+primop ClzOp "clz#" Monadic Word# -> Word#
+ {Count leading zeros in a word.}
+
+primop Ctz8Op "ctz8#" Monadic Word# -> Word#
+ {Count trailing zeros in the lower 8 bits of a word.}
+primop Ctz16Op "ctz16#" Monadic Word# -> Word#
+ {Count trailing zeros in the lower 16 bits of a word.}
+primop Ctz32Op "ctz32#" Monadic Word# -> Word#
+ {Count trailing zeros in the lower 32 bits of a word.}
+primop Ctz64Op "ctz64#" GenPrimOp WORD64 -> Word#
+ {Count trailing zeros in a 64-bit word.}
+primop CtzOp "ctz#" Monadic Word# -> Word#
+ {Count trailing zeros in a word.}
+
primop BSwap16Op "byteSwap16#" Monadic Word# -> Word#
{Swap bytes in the lower 16 bits of a word. The higher bytes are undefined. }
primop BSwap32Op "byteSwap32#" Monadic Word# -> Word#