summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Ky <newhoggy@gmail.com>2017-11-15 11:35:42 -0500
committerBen Gamari <ben@smart-cactus.org>2017-11-15 11:37:00 -0500
commitf5dc8ccc29429d0a1d011f62b6b430f6ae50290c (patch)
tree3b3e3d9cf1a1276efd977523b5ea18fa65ecf812
parent47ad6578ea460999b53eb4293c3a3b3017a56d65 (diff)
downloadhaskell-f5dc8ccc29429d0a1d011f62b6b430f6ae50290c.tar.gz
Add new mbmi and mbmi2 compiler flags
This adds support for the bit deposit and extraction operations provided by the BMI and BMI2 instruction set extensions on modern amd64 machines. Test Plan: Validate Reviewers: austin, simonmar, bgamari, hvr, goldfire, erikd Reviewed By: bgamari Subscribers: goldfire, erikd, trommler, newhoggy, rwbarton, thomie GHC Trac Issues: #14206 Differential Revision: https://phabricator.haskell.org/D4063
-rw-r--r--compiler/cmm/CmmMachOp.hs2
-rw-r--r--compiler/cmm/CmmParse.y10
-rw-r--r--compiler/cmm/PprC.hs2
-rw-r--r--compiler/codeGen/StgCmmPrim.hs78
-rw-r--r--compiler/coreSyn/MkCore.hs1
-rw-r--r--compiler/llvmGen/LlvmCodeGen/CodeGen.hs6
-rw-r--r--compiler/main/DynFlags.hs27
-rw-r--r--compiler/nativeGen/CPrim.hs20
-rw-r--r--compiler/nativeGen/PPC/CodeGen.hs2
-rw-r--r--compiler/nativeGen/SPARC/CodeGen.hs2
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs69
-rw-r--r--compiler/nativeGen/X86/Instr.hs9
-rw-r--r--compiler/nativeGen/X86/Ppr.hs13
-rw-r--r--compiler/prelude/primops.txt.pp22
-rw-r--r--libraries/ghc-prim/cbits/pdep.c71
-rw-r--r--libraries/ghc-prim/cbits/pext.c67
-rw-r--r--libraries/ghc-prim/ghc-prim.cabal2
-rw-r--r--testsuite/tests/codeGen/should_run/all.T2
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun075.hs115
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun075.stdout6
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun076.hs115
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun076.stdout6
22 files changed, 646 insertions, 1 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index fdbfd6e857..8ac4a6fa7b 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -587,6 +587,8 @@ data CallishMachOp
| MO_Memcmp Int
| MO_PopCnt Width
+ | MO_Pdep Width
+ | MO_Pext Width
| MO_Clz Width
| MO_Ctz Width
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 7ffb4fbe42..8afbd2f9d9 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -1006,6 +1006,16 @@ callishMachOps = listToUFM $
( "popcnt32", (,) $ MO_PopCnt W32 ),
( "popcnt64", (,) $ MO_PopCnt W64 ),
+ ( "pdep8", (,) $ MO_Pdep W8 ),
+ ( "pdep16", (,) $ MO_Pdep W16 ),
+ ( "pdep32", (,) $ MO_Pdep W32 ),
+ ( "pdep64", (,) $ MO_Pdep W64 ),
+
+ ( "pext8", (,) $ MO_Pext W8 ),
+ ( "pext16", (,) $ MO_Pext W16 ),
+ ( "pext32", (,) $ MO_Pext W32 ),
+ ( "pext64", (,) $ MO_Pext W64 ),
+
( "cmpxchg8", (,) $ MO_Cmpxchg W8 ),
( "cmpxchg16", (,) $ MO_Cmpxchg W16 ),
( "cmpxchg32", (,) $ MO_Cmpxchg W32 ),
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 1ddd1cd266..76e4d4cb94 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -789,6 +789,8 @@ pprCallishMachOp_for_C mop
MO_Memcmp _ -> text "memcmp"
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
+ (MO_Pext w) -> ptext (sLit $ pextLabel w)
+ (MO_Pdep w) -> ptext (sLit $ pdepLabel w)
(MO_Clz w) -> ptext (sLit $ clzLabel w)
(MO_Ctz w) -> ptext (sLit $ ctzLabel w)
(MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index da652bf1b0..18074991b3 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -584,6 +584,20 @@ emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32
emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64
emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags)
+-- Parallel bit deposit
+emitPrimOp _ [res] Pdep8Op [src, mask] = emitPdepCall res src mask W8
+emitPrimOp _ [res] Pdep16Op [src, mask] = emitPdepCall res src mask W16
+emitPrimOp _ [res] Pdep32Op [src, mask] = emitPdepCall res src mask W32
+emitPrimOp _ [res] Pdep64Op [src, mask] = emitPdepCall res src mask W64
+emitPrimOp dflags [res] PdepOp [src, mask] = emitPdepCall res src mask (wordWidth dflags)
+
+-- Parallel bit extract
+emitPrimOp _ [res] Pext8Op [src, mask] = emitPextCall res src mask W8
+emitPrimOp _ [res] Pext16Op [src, mask] = emitPextCall res src mask W16
+emitPrimOp _ [res] Pext32Op [src, mask] = emitPextCall res src mask W32
+emitPrimOp _ [res] Pext64Op [src, mask] = emitPextCall res src mask W64
+emitPrimOp dflags [res] PextOp [src, mask] = emitPextCall res src mask (wordWidth dflags)
+
-- count leading zeros
emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8
emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16
@@ -865,6 +879,56 @@ callishPrimOpSupported dflags op
|| llvm -> Left MO_F64_Fabs
| otherwise -> Right $ genericFabsOp W64
+ -- Pdep8Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pdep (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPdep8Op)"
+
+ -- Pdep16Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pdep (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPdep16Op)"
+
+ -- Pdep32Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pdep (wordWidth dflags))
+
+ -- | otherwise -> error "TODO: Implement (Right genericPdep32Op)"
+ -- Pdep64Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pdep (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPdep64Op)"
+
+ -- PdepOp | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pdep (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPdepOp)"
+
+ -- Pext8Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pext (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPext8Op)"
+
+ -- Pext16Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pext (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPext16Op)"
+
+ -- Pext32Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pext (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPext32Op)"
+
+ -- Pext64Op | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pext (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPext64Op)"
+
+ -- PextOp | (ncg && (x86ish
+ -- || ppc))
+ -- || llvm -> Left (MO_Pext (wordWidth dflags))
+ -- | otherwise -> error "TODO: Implement (Right genericPextOp)"
+
_ -> pprPanic "emitPrimOp: can't translate PrimOp " (ppr op)
where
ncg = case hscTarget dflags of
@@ -2266,6 +2330,20 @@ emitPopCntCall res x width = do
(MO_PopCnt width)
[ x ]
+emitPdepCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
+emitPdepCall res x y width = do
+ emitPrimCall
+ [ res ]
+ (MO_Pdep width)
+ [ x, y ]
+
+emitPextCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
+emitPextCall res x y width = do
+ emitPrimCall
+ [ res ]
+ (MO_Pext width)
+ [ x, y ]
+
emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitClzCall res x width = do
emitPrimCall
diff --git a/compiler/coreSyn/MkCore.hs b/compiler/coreSyn/MkCore.hs
index c8f7366288..93b767e0a4 100644
--- a/compiler/coreSyn/MkCore.hs
+++ b/compiler/coreSyn/MkCore.hs
@@ -870,4 +870,3 @@ mkAbsentErrorApp res_ty err_msg
= mkApps (Var aBSENT_ERROR_ID) [ Type res_ty, err_string ]
where
err_string = Lit (mkMachString err_msg)
-
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index a88642b531..8ee9a6737f 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -218,6 +218,10 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
-- and return types
genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCallSimpleCast w t dsts args
+genCall t@(PrimTarget (MO_Pdep w)) dsts args =
+ genCallSimpleCast w t dsts args
+genCall t@(PrimTarget (MO_Pext w)) dsts args =
+ genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Clz w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Ctz w)) dsts args =
@@ -731,6 +735,8 @@ cmmPrimOpFunctions mop = do
MO_Memcmp _ -> fsLit $ "memcmp"
(MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
+ (MO_Pdep w) -> fsLit $ "llvm.pdep." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
+ (MO_Pext w) -> fsLit $ "llvm.pext." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_BSwap w) -> fsLit $ "llvm.bswap." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Clz w) -> fsLit $ "llvm.ctlz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Ctz w) -> fsLit $ "llvm.cttz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
diff --git a/compiler/main/DynFlags.hs b/compiler/main/DynFlags.hs
index 0e6310e4d9..53a4033db7 100644
--- a/compiler/main/DynFlags.hs
+++ b/compiler/main/DynFlags.hs
@@ -149,6 +149,8 @@ module DynFlags (
isSseEnabled,
isSse2Enabled,
isSse4_2Enabled,
+ isBmiEnabled,
+ isBmi2Enabled,
isAvxEnabled,
isAvx2Enabled,
isAvx512cdEnabled,
@@ -937,6 +939,7 @@ data DynFlags = DynFlags {
-- | Machine dependent flags (-m<blah> stuff)
sseVersion :: Maybe SseVersion,
+ bmiVersion :: Maybe BmiVersion,
avx :: Bool,
avx2 :: Bool,
avx512cd :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
@@ -1737,6 +1740,7 @@ defaultDynFlags mySettings myLlvmTargets =
interactivePrint = Nothing,
nextWrapperNum = panic "defaultDynFlags: No nextWrapperNum",
sseVersion = Nothing,
+ bmiVersion = Nothing,
avx = False,
avx2 = False,
avx512cd = False,
@@ -3126,6 +3130,10 @@ dynamic_flags_deps = [
d { sseVersion = Just SSE4 }))
, make_ord_flag defGhcFlag "msse4.2" (noArg (\d ->
d { sseVersion = Just SSE42 }))
+ , make_ord_flag defGhcFlag "mbmi" (noArg (\d ->
+ d { bmiVersion = Just BMI1 }))
+ , make_ord_flag defGhcFlag "mbmi2" (noArg (\d ->
+ d { bmiVersion = Just BMI2 }))
, make_ord_flag defGhcFlag "mavx" (noArg (\d -> d { avx = True }))
, make_ord_flag defGhcFlag "mavx2" (noArg (\d -> d { avx2 = True }))
, make_ord_flag defGhcFlag "mavx512cd" (noArg (\d ->
@@ -5368,6 +5376,25 @@ isAvx512pfEnabled :: DynFlags -> Bool
isAvx512pfEnabled dflags = avx512pf dflags
-- -----------------------------------------------------------------------------
+-- BMI2
+
+data BmiVersion = BMI1
+ | BMI2
+ deriving (Eq, Ord)
+
+isBmiEnabled :: DynFlags -> Bool
+isBmiEnabled dflags = case platformArch (targetPlatform dflags) of
+ ArchX86_64 -> bmiVersion dflags >= Just BMI1
+ ArchX86 -> bmiVersion dflags >= Just BMI1
+ _ -> False
+
+isBmi2Enabled :: DynFlags -> Bool
+isBmi2Enabled dflags = case platformArch (targetPlatform dflags) of
+ ArchX86_64 -> bmiVersion dflags >= Just BMI2
+ ArchX86 -> bmiVersion dflags >= Just BMI2
+ _ -> False
+
+-- -----------------------------------------------------------------------------
-- Linker/compiler information
-- LinkerInfo contains any extra options needed by the system linker.
diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs
index ad61a002d3..399d646000 100644
--- a/compiler/nativeGen/CPrim.hs
+++ b/compiler/nativeGen/CPrim.hs
@@ -5,6 +5,8 @@ module CPrim
, atomicRMWLabel
, cmpxchgLabel
, popCntLabel
+ , pdepLabel
+ , pextLabel
, bSwapLabel
, clzLabel
, ctzLabel
@@ -26,6 +28,24 @@ popCntLabel w = "hs_popcnt" ++ pprWidth w
pprWidth W64 = "64"
pprWidth w = pprPanic "popCntLabel: Unsupported word width " (ppr w)
+pdepLabel :: Width -> String
+pdepLabel w = "hs_pdep" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pdepLabel: Unsupported word width " (ppr w)
+
+pextLabel :: Width -> String
+pextLabel w = "hs_pext" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pextLabel: Unsupported word width " (ppr w)
+
bSwapLabel :: Width -> String
bSwapLabel w = "hs_bswap" ++ pprWidth w
where
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 898a31a657..e2c568c836 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -2004,6 +2004,8 @@ genCCall' dflags gcp target dest_regs args
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
+ MO_Pdep w -> (fsLit $ pdepLabel w, False)
+ MO_Pext w -> (fsLit $ pextLabel w, False)
MO_Clz _ -> unsupported
MO_Ctz _ -> unsupported
MO_AtomicRMW {} -> unsupported
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index 55c1d1531d..6dfd58950e 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -654,6 +654,8 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 6c0e0ac783..62ed72163d 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1872,6 +1872,72 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
+genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+ args@[src, mask] = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PDEP instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+ args@[src, mask] = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform False (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PEXT instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall dflags is32Bit target dest_regs args
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
| is32Bit && width == W64 = do
-- Fallback to `hs_clz64` on i386
@@ -2689,6 +2755,9 @@ outOfLineCmmOp mop res args
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz _ -> unsupported
+ MO_Pdep _ -> fsLit "hs_pdep"
+ MO_Pext _ -> fsLit "hs_pext"
+
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
MO_AtomicWrite _ -> fsLit "atomicwrite"
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 1bb682ad87..fbe7383187 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -345,6 +345,10 @@ data Instr
| BSF Format Operand Reg -- bit scan forward
| BSR Format Operand Reg -- bit scan reverse
+ -- bit manipulation instructions
+ | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+ | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
-- prefetch
| PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
@@ -464,6 +468,9 @@ x86_regUsageOfInstr platform instr
BSF _ src dst -> mkRU (use_R src []) [dst]
BSR _ src dst -> mkRU (use_R src []) [dst]
+ PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
LOCK i -> x86_regUsageOfInstr platform i
@@ -640,6 +647,8 @@ x86_patchRegsOfInstr instr env
CLTD _ -> instr
POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+ PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+ PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 84ce7516b5..f5011b2a95 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -648,6 +648,9 @@ pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst
pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
@@ -1262,6 +1265,16 @@ pprFormatRegRegReg name format reg1 reg2 reg3
pprReg format reg3
]
+pprFormatOpOpReg :: LitString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
pprFormatAddrReg :: LitString -> Format -> AddrMode -> Reg -> SDoc
pprFormatAddrReg name format op dst
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index ce72036387..952d4746ff 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -403,6 +403,28 @@ primop PopCnt64Op "popCnt64#" GenPrimOp WORD64 -> Word#
primop PopCntOp "popCnt#" Monadic Word# -> Word#
{Count the number of set bits in a word.}
+primop Pdep8Op "pdep8#" Dyadic Word# -> Word# -> Word#
+ {Deposit bits to lower 8 bits of a word at locations specified by a mask.}
+primop Pdep16Op "pdep16#" Dyadic Word# -> Word# -> Word#
+ {Deposit bits to lower 16 bits of a word at locations specified by a mask.}
+primop Pdep32Op "pdep32#" Dyadic Word# -> Word# -> Word#
+ {Deposit bits to lower 32 bits of a word at locations specified by a mask.}
+primop Pdep64Op "pdep64#" GenPrimOp WORD64 -> WORD64 -> Word#
+ {Deposit bits to a word at locations specified by a mask.}
+primop PdepOp "pdep#" Dyadic Word# -> Word# -> Word#
+ {Deposit bits to a word at locations specified by a mask.}
+
+primop Pext8Op "pext8#" Dyadic Word# -> Word# -> Word#
+ {Extract bits from lower 8 bits of a word at locations specified by a mask.}
+primop Pext16Op "pext16#" Dyadic Word# -> Word# -> Word#
+ {Extract bits from lower 16 bits of a word at locations specified by a mask.}
+primop Pext32Op "pext32#" Dyadic Word# -> Word# -> Word#
+ {Extract bits from lower 32 bits of a word at locations specified by a mask.}
+primop Pext64Op "pext64#" GenPrimOp WORD64 -> WORD64 -> Word#
+ {Extract bits from a word at locations specified by a mask.}
+primop PextOp "pext#" Dyadic Word# -> Word# -> Word#
+ {Extract bits from a word at locations specified by a mask.}
+
primop Clz8Op "clz8#" Monadic Word# -> Word#
{Count leading zeros in the lower 8 bits of a word.}
primop Clz16Op "clz16#" Monadic Word# -> Word#
diff --git a/libraries/ghc-prim/cbits/pdep.c b/libraries/ghc-prim/cbits/pdep.c
new file mode 100644
index 0000000000..a3b7da3367
--- /dev/null
+++ b/libraries/ghc-prim/cbits/pdep.c
@@ -0,0 +1,71 @@
+#include "Rts.h"
+#include "MachDeps.h"
+
+extern StgWord hs_pdep64(StgWord64 src, StgWord mask);
+StgWord
+hs_pdep64(StgWord src, StgWord mask)
+{
+ uint64_t result = 0;
+
+ while (1) {
+ // Mask out all but the lowest bit
+ const uint64_t lowest = (-mask & mask);
+
+ if (lowest == 0) {
+ break;
+ }
+
+ const uint64_t lsb = (uint64_t)((int64_t)(src << 63) >> 63);
+
+ result |= lsb & lowest;
+ mask &= ~lowest;
+ src >>= 1;
+ }
+
+ return result;
+}
+
+extern StgWord hs_pdep32(StgWord src, StgWord mask);
+StgWord
+hs_pdep32(StgWord src, StgWord mask)
+{
+ return hs_pdep64(src, mask);
+}
+
+extern StgWord hs_pdep16(StgWord src, StgWord mask);
+StgWord
+hs_pdep16(StgWord src, StgWord mask)
+{
+ return hs_pdep64(src, mask);
+}
+
+extern StgWord hs_pdep8(StgWord src, StgWord mask);
+StgWord
+hs_pdep8(StgWord src, StgWord mask)
+{
+ return hs_pdep64(src, mask);
+}
+
+#if WORD_SIZE_IN_BITS == 32
+
+extern StgWord hs_pdep(StgWord src, StgWord mask);
+StgWord
+hs_pdep(StgWord src, StgWord mask)
+{
+ return hs_pdep64(src, mask);
+}
+
+#elif WORD_SIZE_IN_BITS == 64
+
+extern StgWord hs_pdep(StgWord src, StgWord mask);
+StgWord
+hs_pdep(StgWord src, StgWord mask)
+{
+ return hs_pdep64(src, mask);
+}
+
+#else
+
+#error Unknown machine word size
+
+#endif
diff --git a/libraries/ghc-prim/cbits/pext.c b/libraries/ghc-prim/cbits/pext.c
new file mode 100644
index 0000000000..d08fb94f0e
--- /dev/null
+++ b/libraries/ghc-prim/cbits/pext.c
@@ -0,0 +1,67 @@
+#include "Rts.h"
+#include "MachDeps.h"
+
+extern StgWord hs_pext64(StgWord src, StgWord mask);
+StgWord
+hs_pext64(StgWord src, StgWord mask)
+{
+ uint64_t result = 0;
+ int offset = 0;
+
+ for (int bit = 0; bit != sizeof(uint64_t) * 8; ++bit) {
+ const uint64_t src_bit = (src >> bit) & 1;
+ const uint64_t mask_bit = (mask >> bit) & 1;
+
+ if (mask_bit) {
+ result |= (uint64_t)(src_bit) << offset;
+ ++offset;
+ }
+ }
+
+ return result;
+}
+
+extern StgWord hs_pext32(StgWord src, StgWord mask);
+StgWord
+hs_pext32(StgWord src, StgWord mask)
+{
+ return hs_pext64(src, mask);
+}
+
+extern StgWord hs_pext16(StgWord src, StgWord mask);
+StgWord
+hs_pext16(StgWord src, StgWord mask)
+{
+ return hs_pext64(src, mask);
+}
+
+extern StgWord hs_pext8(StgWord src, StgWord mask);
+StgWord
+hs_pext8(StgWord src, StgWord mask)
+{
+ return hs_pext64(src, mask);
+}
+
+#if WORD_SIZE_IN_BITS == 32
+
+extern StgWord hs_pext(StgWord src, StgWord mask);
+StgWord
+hs_pext(StgWord src, StgWord mask)
+{
+ return hs_pext64(src, mask);
+}
+
+#elif WORD_SIZE_IN_BITS == 64
+
+extern StgWord hs_pext(StgWord src, StgWord mask);
+StgWord
+hs_pext(StgWord src, StgWord mask)
+{
+ return hs_pext64(src, mask);
+}
+
+#else
+
+#error Unknown machine word size
+
+#endif
diff --git a/libraries/ghc-prim/ghc-prim.cabal b/libraries/ghc-prim/ghc-prim.cabal
index 5b6b857ffb..ca50808469 100644
--- a/libraries/ghc-prim/ghc-prim.cabal
+++ b/libraries/ghc-prim/ghc-prim.cabal
@@ -77,6 +77,8 @@ Library
cbits/ctz.c
cbits/debug.c
cbits/longlong.c
+ cbits/pdep.c
+ cbits/pext.c
cbits/popcnt.c
cbits/word2float.c
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index 214a9d5704..42d8a2f767 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -77,6 +77,8 @@ test('cgrun069', omit_ways(['ghci']), multi_compile_and_run,
test('cgrun070', normal, compile_and_run, [''])
test('cgrun071', normal, compile_and_run, [''])
test('cgrun072', normal, compile_and_run, [''])
+test('cgrun075', normal, compile_and_run, [''])
+test('cgrun076', normal, compile_and_run, [''])
test('T1852', normal, compile_and_run, [''])
test('T1861', extra_run_opts('0'), compile_and_run, [''])
diff --git a/testsuite/tests/codeGen/should_run/cgrun075.hs b/testsuite/tests/codeGen/should_run/cgrun075.hs
new file mode 100644
index 0000000000..09e35b4d8a
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun075.hs
@@ -0,0 +1,115 @@
+{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
+
+module Main ( main ) where
+
+import Data.Bits
+import GHC.Int
+import GHC.Prim
+import GHC.Word
+import Data.Int
+import Data.Word
+
+#include "MachDeps.h"
+
+main = putStr
+ ( test_pdep ++ "\n"
+ ++ test_pdep8 ++ "\n"
+ ++ test_pdep16 ++ "\n"
+ ++ test_pdep32 ++ "\n"
+ ++ test_pdep64 ++ "\n"
+ ++ "\n"
+ )
+
+class Pdep a where
+ pdep :: a -> a -> a
+
+instance Pdep Word where
+ pdep (W# src#) (W# mask#) = W# (pdep# src# mask#)
+
+instance Pdep Word8 where
+ pdep (W8# src#) (W8# mask#) = W8# (pdep8# src# mask#)
+
+instance Pdep Word16 where
+ pdep (W16# src#) (W16# mask#) = W16# (pdep16# src# mask#)
+
+instance Pdep Word32 where
+ pdep (W32# src#) (W32# mask#) = W32# (pdep32# src# mask#)
+
+instance Pdep Word64 where
+ pdep (W64# src#) (W64# mask#) = W64# (pdep64# src# mask#)
+
+class SlowPdep a where
+ slowPdep :: a -> a -> a
+
+instance SlowPdep Word where
+ slowPdep s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPdep Word8 where
+ slowPdep s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPdep Word16 where
+ slowPdep s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPdep Word32 where
+ slowPdep s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPdep Word64 where
+ slowPdep s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+slowPdep64 :: Word64 -> Word64 -> Word64
+slowPdep64 = slowPdep64' 0
+
+slowPdep32 :: Word32 -> Word32 -> Word32
+slowPdep32 s m = fromIntegral (slowPdep64 (fromIntegral s) (fromIntegral m))
+
+lsb :: Word64 -> Word64
+lsb src = fromIntegral ((fromIntegral (src `shiftL` 63) :: Int64) `shiftR` 63)
+
+slowPdep64' :: Word64 -> Word64 -> Word64 -> Word64
+slowPdep64' result src mask = if lowest /= 0
+ then slowPdep64' newResult (src `shiftR` 1) (mask .&. complement lowest)
+ else result
+ where lowest = (-mask) .&. mask
+ newResult = (result .|. ((lsb src) .&. lowest))
+
+test_pdep = test (0 :: Word ) pdep slowPdep
+test_pdep8 = test (0 :: Word8 ) pdep slowPdep
+test_pdep16 = test (0 :: Word16) pdep slowPdep
+test_pdep32 = test (0 :: Word32) pdep slowPdep
+test_pdep64 = test (0 :: Word64) pdep slowPdep
+
+mask n = (2 ^ n) - 1
+
+fst4 :: (a, b, c, d) -> a
+fst4 (a, _, _, _) = a
+
+runCase :: Eq a
+ => (a -> a -> a)
+ -> (a -> a -> a)
+ -> (a, a)
+ -> (Bool, a, a, (a, a))
+runCase fast slow (x, y) = (slow x y == fast x y, slow x y, fast x y, (x, y))
+
+test :: (Show a, Num a, Eq a) => a -> (a -> a -> a) -> (a -> a -> a) -> String
+test _ fast slow = case failing of
+ [] -> "OK"
+ ((_, e, a, i):xs) ->
+ "FAIL\n" ++ " Input: " ++ show i ++ "\nExpected: " ++ show e ++
+ "\n Actual: " ++ show a
+ where failing = dropWhile fst4 . map (runCase fast slow) $ cases
+ cases = (,) <$> numbers <*> numbers
+ -- 10 random numbers
+#if SIZEOF_HSWORD == 4
+ numbers = [ 1480294021, 1626858410, 2316287658, 1246556957, 3806579062
+ , 65945563 , 1521588071, 791321966 , 1355466914, 2284998160
+ ]
+#elif SIZEOF_HSWORD == 8
+ numbers = [ 11004539497957619752, 5625461252166958202
+ , 1799960778872209546 , 16979826074020750638
+ , 12789915432197771481, 11680809699809094550
+ , 13208678822802632247, 13794454868797172383
+ , 13364728999716654549, 17516539991479925226
+ ]
+#else
+# error Unexpected word size
+#endif
diff --git a/testsuite/tests/codeGen/should_run/cgrun075.stdout b/testsuite/tests/codeGen/should_run/cgrun075.stdout
new file mode 100644
index 0000000000..e22e2cd950
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun075.stdout
@@ -0,0 +1,6 @@
+OK
+OK
+OK
+OK
+OK
+
diff --git a/testsuite/tests/codeGen/should_run/cgrun076.hs b/testsuite/tests/codeGen/should_run/cgrun076.hs
new file mode 100644
index 0000000000..7fa42d74e0
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun076.hs
@@ -0,0 +1,115 @@
+{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
+
+module Main ( main ) where
+
+import Data.Bits
+import GHC.Int
+import GHC.Prim
+import GHC.Word
+import Data.Int
+import Data.Word
+
+#include "MachDeps.h"
+
+main = putStr
+ ( test_pext ++ "\n"
+ ++ test_pext8 ++ "\n"
+ ++ test_pext16 ++ "\n"
+ ++ test_pext32 ++ "\n"
+ ++ test_pext64 ++ "\n"
+ ++ "\n"
+ )
+
+class Pext a where
+ pext :: a -> a -> a
+
+instance Pext Word where
+ pext (W# src#) (W# mask#) = W# (pext# src# mask#)
+
+instance Pext Word8 where
+ pext (W8# src#) (W8# mask#) = W8# (pext8# src# mask#)
+
+instance Pext Word16 where
+ pext (W16# src#) (W16# mask#) = W16# (pext16# src# mask#)
+
+instance Pext Word32 where
+ pext (W32# src#) (W32# mask#) = W32# (pext32# src# mask#)
+
+instance Pext Word64 where
+ pext (W64# src#) (W64# mask#) = W64# (pext64# src# mask#)
+
+class SlowPext a where
+ slowPext :: a -> a -> a
+
+instance SlowPext Word where
+ slowPext s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPext Word8 where
+ slowPext s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPext Word16 where
+ slowPext s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPext Word32 where
+ slowPext s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+instance SlowPext Word64 where
+ slowPext s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+slowPext64 :: Word64 -> Word64 -> Word64
+slowPext64 = slowPext64' 0 0 0
+
+slowPext32 :: Word32 -> Word32 -> Word32
+slowPext32 s m = fromIntegral (slowPext64 (fromIntegral s) (fromIntegral m))
+
+slowPext64' :: Word64 -> Int -> Int -> Word64 -> Word64 -> Word64
+slowPext64' result offset index src mask = if index /= 64
+ then if maskBit /= 0
+ then slowPext64' nextResult (offset + 1) (index + 1) src mask
+ else slowPext64' result offset (index + 1) src mask
+ else result
+ where srcBit = (src `shiftR` index) .&. 1
+ maskBit = (mask `shiftR` index) .&. 1
+ nextResult = result .|. (srcBit `shiftL` offset)
+
+test_pext = test (0 :: Word ) pext slowPext
+test_pext8 = test (0 :: Word8 ) pext slowPext
+test_pext16 = test (0 :: Word16) pext slowPext
+test_pext32 = test (0 :: Word32) pext slowPext
+test_pext64 = test (0 :: Word64) pext slowPext
+
+mask n = (2 ^ n) - 1
+
+fst4 :: (a, b, c, d) -> a
+fst4 (a, _, _, _) = a
+
+runCase :: Eq a
+ => (a -> a -> a)
+ -> (a -> a -> a)
+ -> (a, a)
+ -> (Bool, a, a, (a, a))
+runCase fast slow (x, y) = (slow x y == fast x y, slow x y, fast x y, (x, y))
+
+test :: (Show a, Num a, Eq a) => a -> (a -> a -> a) -> (a -> a -> a) -> String
+test _ fast slow = case failing of
+ [] -> "OK"
+ ((_, e, a, i):xs) ->
+ "FAIL\n" ++ " Input: " ++ show i ++ "\nExpected: " ++ show e ++
+ "\n Actual: " ++ show a
+ where failing = dropWhile fst4 . map (runCase fast slow) $ cases
+ cases = (,) <$> numbers <*> numbers
+ -- 10 random numbers
+#if SIZEOF_HSWORD == 4
+ numbers = [ 1480294021, 1626858410, 2316287658, 1246556957, 3806579062
+ , 65945563 , 1521588071, 791321966 , 1355466914, 2284998160
+ ]
+#elif SIZEOF_HSWORD == 8
+ numbers = [ 11004539497957619752, 5625461252166958202
+ , 1799960778872209546 , 16979826074020750638
+ , 12789915432197771481, 11680809699809094550
+ , 13208678822802632247, 13794454868797172383
+ , 13364728999716654549, 17516539991479925226
+ ]
+#else
+# error Unexpected word size
+#endif
diff --git a/testsuite/tests/codeGen/should_run/cgrun076.stdout b/testsuite/tests/codeGen/should_run/cgrun076.stdout
new file mode 100644
index 0000000000..e22e2cd950
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/cgrun076.stdout
@@ -0,0 +1,6 @@
+OK
+OK
+OK
+OK
+OK
+