diff options
author | Austin Seipp <austin@well-typed.com> | 2013-10-01 21:13:14 -0500 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2013-10-01 21:26:47 -0500 |
commit | fd74014079f14bd3ab50e328e52c44ef97d40e05 (patch) | |
tree | da31c992a76d3816a4f1012ceb1eb4e68d0fb556 | |
parent | 627d1e008cbe4d9318b2466394420a968d1659da (diff) | |
download | haskell-fd74014079f14bd3ab50e328e52c44ef97d40e05.tar.gz |
Add support for prefetch with locality levels.
This patch adds support for several new primitive operations which
support using processor-specific instructions to help guide data and
cache locality decisions. Locality levels range from 0 to 3; higher levels ask for the data to be kept in more levels of the CPU cache.
For LLVM, we generate llvm.prefetch intrinsics at the corresponding locality
level (similar to GCC's __builtin_prefetch).
For x86 we generate prefetch{NTA, t2, t1, t0} instructions for locality levels
0, 1, 2, and 3 respectively. On SPARC and PowerPC, the prefetch operations are
no-ops, so the locality levels are ignored.
This closes #8256.
Authored-by: Carter Tazio Schonwald <carter.schonwald@gmail.com>
Signed-off-by: Austin Seipp <austin@well-typed.com>
-rw-r--r-- | compiler/cmm/CmmMachOp.hs | 18 | ||||
-rw-r--r-- | compiler/cmm/CmmParse.y | 10 | ||||
-rw-r--r-- | compiler/cmm/PprC.hs | 4 | ||||
-rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 53 | ||||
-rw-r--r-- | compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 11 | ||||
-rw-r--r-- | compiler/nativeGen/PPC/CodeGen.hs | 5 | ||||
-rw-r--r-- | compiler/nativeGen/SPARC/CodeGen.hs | 7 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 23 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs | 16 | ||||
-rw-r--r-- | compiler/nativeGen/X86/Ppr.hs | 12 | ||||
-rw-r--r-- | compiler/prelude/primops.txt.pp | 85 |
11 files changed, 194 insertions, 50 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index c009d15e25..684a4b9729 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -107,10 +107,10 @@ data MachOp -- Vector element insertion and extraction operations | MO_V_Insert Length Width -- Insert scalar into vector | MO_V_Extract Length Width -- Extract scalar from vector - + -- Integer vector operations - | MO_V_Add Length Width - | MO_V_Sub Length Width + | MO_V_Add Length Width + | MO_V_Sub Length Width | MO_V_Mul Length Width -- Signed vector multiply/divide @@ -127,8 +127,8 @@ data MachOp | MO_VF_Extract Length Width -- Extract scalar from vector -- Floating point vector operations - | MO_VF_Add Length Width - | MO_VF_Sub Length Width + | MO_VF_Add Length Width + | MO_VF_Sub Length Width | MO_VF_Neg Length Width -- unary - | MO_VF_Mul Length Width | MO_VF_Quot Length Width @@ -528,8 +528,14 @@ data CallishMachOp | MO_Touch -- Keep variables live (when using interior pointers) -- Prefetch - | MO_Prefetch_Data -- Prefetch hint. May change program performance but not + | MO_Prefetch_Data Int -- Prefetch hint. May change program performance but not -- program behavior. + -- the Int can be 0-3. Needs to be known at compile time + -- to interact with code generation correctly. + -- TODO: add support for prefetch WRITES, + -- currently only exposes prefetch reads, which + -- would the majority of use cases in ghc anyways + -- Note that these three MachOps all take 1 extra parameter than the -- standard C lib versions. 
The extra (last) parameter contains diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y index ebd9278e15..a0c9bc4eb5 100644 --- a/compiler/cmm/CmmParse.y +++ b/compiler/cmm/CmmParse.y @@ -952,8 +952,16 @@ callishMachOps = listToUFM $ ( "write_barrier", MO_WriteBarrier ), ( "memcpy", MO_Memcpy ), ( "memset", MO_Memset ), - ( "memmove", MO_Memmove ) + ( "memmove", MO_Memmove ), + + ("prefetch0",MO_Prefetch_Data 0), + ("prefetch1",MO_Prefetch_Data 1), + ("prefetch2",MO_Prefetch_Data 2), + ("prefetch3",MO_Prefetch_Data 3) + -- ToDo: the rest, maybe + -- edit: which rest? + -- also: how do we tell CMM Lint how to type check callish macops? ] parseSafety :: String -> P Safety diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index c468161c73..32fd8b4feb 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -759,7 +759,9 @@ pprCallishMachOp_for_C mop MO_Add2 {} -> unsupported MO_U_Mul2 {} -> unsupported MO_Touch -> unsupported - MO_Prefetch_Data -> unsupported + (MO_Prefetch_Data _ ) -> unsupported + --- we could support prefetch via "__builtin_prefetch" + --- Not adding it for now where unsupported = panic ("pprCallishMachOp_for_C: " ++ show mop ++ " not supported!") diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 523fcb21f9..6411e89a54 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -255,15 +255,6 @@ emitPrimOp dflags [res] SizeofMutableByteArrayOp [arg] emitPrimOp _ res@[] TouchOp args@[_arg] = do emitPrimCall res MO_Touch args -emitPrimOp _ res@[] PrefetchByteArrayOp args@[_arg] - = do emitPrimCall res MO_Prefetch_Data args - -emitPrimOp _ res@[] PrefetchMutableByteArrayOp args@[_arg] - = do emitPrimCall res MO_Prefetch_Data args - -emitPrimOp _ res@[] PrefetchAddrOp args@[_arg] - = do emitPrimCall res MO_Prefetch_Data args - -- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a) emitPrimOp dflags [res] ByteArrayContents_Char [arg] = emitAssign (CmmLocal res) 
(cmmOffsetB dflags arg (arrWordsHdrSize dflags)) @@ -656,9 +647,22 @@ emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do ty = vecCmmCat vcat w -- Prefetch -emitPrimOp _ res PrefetchByteArrayOp args = doPrefetchByteArrayOp res args -emitPrimOp _ res PrefetchMutableByteArrayOp args = doPrefetchByteArrayOp res args -emitPrimOp _ res PrefetchAddrOp args = doPrefetchAddrOp res args +emitPrimOp _ res PrefetchByteArrayOp3 args = doPrefetchByteArrayOp 3 res args +emitPrimOp _ res PrefetchMutableByteArrayOp3 args = doPrefetchByteArrayOp 3 res args +emitPrimOp _ res PrefetchAddrOp3 args = doPrefetchAddrOp 3 res args + +emitPrimOp _ res PrefetchByteArrayOp2 args = doPrefetchByteArrayOp 2 res args +emitPrimOp _ res PrefetchMutableByteArrayOp2 args = doPrefetchByteArrayOp 2 res args +emitPrimOp _ res PrefetchAddrOp2 args = doPrefetchAddrOp 2 res args + +emitPrimOp _ res PrefetchByteArrayOp1 args = doPrefetchByteArrayOp 1 res args +emitPrimOp _ res PrefetchMutableByteArrayOp1 args = doPrefetchByteArrayOp 1 res args +emitPrimOp _ res PrefetchAddrOp1 args = doPrefetchAddrOp 1 res args + +emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res args +emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args +emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args + -- The rest just translate straightforwardly emitPrimOp dflags [res] op [arg] @@ -1370,31 +1374,34 @@ doVecInsertOp maybe_pre_write_cast ty src e idx res = do ------------------------------------------------------------------------------ -- Helpers for translating prefetching. 
-doPrefetchByteArrayOp :: [LocalReg] +doPrefetchByteArrayOp :: Int + -> [LocalReg] -> [CmmExpr] -> FCode () -doPrefetchByteArrayOp res [addr,idx] +doPrefetchByteArrayOp locality res [addr,idx] = do dflags <- getDynFlags - mkBasicPrefetch (arrWordsHdrSize dflags) res addr idx -doPrefetchByteArrayOp _ _ + mkBasicPrefetch locality (arrWordsHdrSize dflags) res addr idx +doPrefetchByteArrayOp _ _ _ = panic "StgCmmPrim: doPrefetchByteArrayOp" -doPrefetchAddrOp :: [LocalReg] +doPrefetchAddrOp ::Int + -> [LocalReg] -> [CmmExpr] -> FCode () -doPrefetchAddrOp res [addr,idx] - = mkBasicPrefetch 0 res addr idx -doPrefetchAddrOp _ _ +doPrefetchAddrOp locality res [addr,idx] + = mkBasicPrefetch locality 0 res addr idx +doPrefetchAddrOp _ _ _ = panic "StgCmmPrim: doPrefetchAddrOp" -mkBasicPrefetch :: ByteOff -- Initial offset in bytes +mkBasicPrefetch :: Int -- Locality level 0-3 + -> ByteOff -- Initial offset in bytes -> [LocalReg] -- Destination -> CmmExpr -- Base address -> CmmExpr -- Index -> FCode () -mkBasicPrefetch off res base idx +mkBasicPrefetch locality off res base idx = do dflags <- getDynFlags - emitPrimCall [] MO_Prefetch_Data [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx] + emitPrimCall [] (MO_Prefetch_Data locality) [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx] case res of [] -> return () [reg] -> emitAssign (CmmLocal reg) base diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index 5002b89b72..808c591d92 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -200,7 +200,8 @@ genCall (PrimTarget (MO_UF_Conv _)) [_] args = "Can only handle 1, given" ++ show (length args) ++ "." 
-- Handle prefetching data -genCall t@(PrimTarget MO_Prefetch_Data) [] args = do +genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args + | 0 <= localityInt && localityInt <= 3 = do ver <- getLlvmVer let argTy | ver <= 29 = [i8Ptr, i32, i32] | otherwise = [i8Ptr, i32, i32, i32] @@ -214,12 +215,13 @@ genCall t@(PrimTarget MO_Prefetch_Data) [] args = do (argVars', stmts3) <- castVars $ zip argVars argTy trash <- getTrashStmts - let argSuffix | ver <= 29 = [mkIntLit i32 0, mkIntLit i32 3] - | otherwise = [mkIntLit i32 0, mkIntLit i32 3, mkIntLit i32 1] + let argSuffix | ver <= 29 = [mkIntLit i32 0, mkIntLit i32 localityInt] + | otherwise = [mkIntLit i32 0, mkIntLit i32 localityInt, mkIntLit i32 1] call = Expr $ Call StdCall fptr (argVars' ++ argSuffix) [] stmts = stmts1 `appOL` stmts2 `appOL` stmts3 `appOL` trash `snocOL` call return (stmts, top1 ++ top2) + | otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt) -- Handle PopCnt and BSwap that need to only convert arg and return types genCall t@(PrimTarget (MO_PopCnt w)) dsts args = @@ -545,7 +547,8 @@ cmmPrimOpFunctions mop = do (MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w) (MO_BSwap w) -> fsLit $ "llvm.bswap." 
++ showSDoc dflags (ppr $ widthToLlvmInt w) - MO_Prefetch_Data -> fsLit "llvm.prefetch" + (MO_Prefetch_Data _ )-> fsLit "llvm.prefetch" + MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs index 65533d8f9a..3f0e7632f8 100644 --- a/compiler/nativeGen/PPC/CodeGen.hs +++ b/compiler/nativeGen/PPC/CodeGen.hs @@ -912,6 +912,9 @@ genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ genCCall' _ _ (PrimTarget MO_Touch) _ _ = return $ nilOL +genCCall' _ _ (PrimTarget (MO_Prefetch_Data _)) _ _ + = return $ nilOL + genCCall' dflags gcp target dest_regs args0 = ASSERT(not $ any (`elem` [II16]) $ map cmmTypeSize argReps) -- we rely on argument promotion in the codeGen @@ -1165,7 +1168,7 @@ genCCall' dflags gcp target dest_regs args0 MO_U_Mul2 {} -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported - MO_Prefetch_Data -> unsupported + (MO_Prefetch_Data _ ) -> unsupported unsupported = panic ("outOfLineCmmOp: " ++ show mop ++ " not supported") diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs index 5d2b9a9d6d..5d65b427e1 100644 --- a/compiler/nativeGen/SPARC/CodeGen.hs +++ b/compiler/nativeGen/SPARC/CodeGen.hs @@ -392,7 +392,10 @@ genCCall -- In the SPARC case we don't need a barrier. 
-- genCCall (PrimTarget MO_WriteBarrier) _ _ - = do return nilOL + = return $ nilOL + +genCCall (PrimTarget (MO_Prefetch_Data _)) _ _ + = return $ nilOL genCCall target dest_regs args0 = do @@ -657,7 +660,7 @@ outOfLineMachOp_table mop MO_U_Mul2 {} -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported - MO_Prefetch_Data -> unsupported + (MO_Prefetch_Data _) -> unsupported where unsupported = panic ("outOfLineCmmOp: " ++ show mop ++ " not supported here") diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index e18da25347..2456688744 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1658,7 +1658,26 @@ genCCall _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL genCCall _ (PrimTarget MO_Touch) _ _ = return nilOL -genCCall _ (PrimTarget MO_Prefetch_Data) _ _ = return nilOL +genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] = + case n of + 0 -> genPrefetch src $ PREFETCH NTA size + 1 -> genPrefetch src $ PREFETCH Lvl2 size + 2 -> genPrefetch src $ PREFETCH Lvl1 size + 3 -> genPrefetch src $ PREFETCH Lvl0 size + l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l) + -- the c / llvm prefetch convention is 0, 1, 2, and 3 + -- the x86 corresponding names are : NTA, 2 , 1, and 0 + where + size = archWordSize is32bit + -- need to know what register width for pointers! 
+ genPrefetch inRegSrc prefetchCTor = + do + code_src <- getAnyReg inRegSrc + src_r <- getNewRegNat size + return $ code_src src_r `appOL` + (unitOL (prefetchCTor (OpAddr + ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) + -- prefetch always takes an address genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do dflags <- getDynFlags @@ -2361,7 +2380,7 @@ outOfLineCmmOp mop res args MO_U_Mul2 {} -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported - MO_Prefetch_Data -> unsupported + (MO_Prefetch_Data _ ) -> unsupported unsupported = panic ("outOfLineCmmOp: " ++ show mop ++ " not supported here") diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index e584ffe8b9..d10591e37f 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -9,7 +9,7 @@ #include "HsVersions.h" #include "nativeGen/NCG.h" -module X86.Instr (Instr(..), Operand(..), JumpDest, +module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest, getJumpDestBlockId, canShortcut, shortcutStatics, shortcutJump, i386_insert_ffrees, allocMoreStack, maxSpillSlots, archWordSize) @@ -319,7 +319,14 @@ data Instr -- 1: popl %reg -- SSE4.2 - | POPCNT Size Operand Reg -- src, dst + | POPCNT Size Operand Reg -- src, dst + + -- prefetch + | PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch + -- variant can be NTA, Lvl0, Lvl1, or Lvl2 + +data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2 + data Operand = OpReg Reg -- register @@ -417,6 +424,9 @@ x86_regUsageOfInstr platform instr POPCNT _ src dst -> mkRU (use_R src []) [dst] + -- note: might be a better way to do this + PREFETCH _ _ src -> mkRU (use_R src []) [] + _other -> panic "regUsage: unrecognised instr" where @@ -557,6 +567,8 @@ x86_patchRegsOfInstr instr env POPCNT sz src dst -> POPCNT sz (patchOp src) (env dst) + PREFETCH lvl size src -> PREFETCH lvl size (patchOp src) + _other -> panic "patchRegs: unrecognised 
instr" where diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 7f9c6901da..f38a04d069 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -577,6 +577,11 @@ pprInstr (XOR size src dst) = pprSizeOpOp (sLit "xor") size src dst pprInstr (POPCNT size src dst) = pprOpOp (sLit "popcnt") size src (OpReg dst) +pprInstr (PREFETCH NTA size src ) = pprSizeOp_ (sLit "prefetchnta") size src +pprInstr (PREFETCH Lvl0 size src) = pprSizeOp_ (sLit "prefetcht0") size src +pprInstr (PREFETCH Lvl1 size src) = pprSizeOp_ (sLit "prefetcht1") size src +pprInstr (PREFETCH Lvl2 size src) = pprSizeOp_ (sLit "prefetcht2") size src + pprInstr (NOT size op) = pprSizeOp (sLit "not") size op pprInstr (BSWAP size op) = pprSizeOp (sLit "bswap") size (OpReg op) pprInstr (NEGI size op) = pprSizeOp (sLit "neg") size op @@ -1025,6 +1030,13 @@ pprSizeImmOp name size imm op1 ] +pprSizeOp_ :: LitString -> Size -> Operand -> SDoc +pprSizeOp_ name size op1 + = hcat [ + pprMnemonic_ name , + pprOperand size op1 + ] + pprSizeOp :: LitString -> Size -> Operand -> SDoc pprSizeOp name size op1 = hcat [ diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp index dcd536eeae..5bedc31a7b 100644 --- a/compiler/prelude/primops.txt.pp +++ b/compiler/prelude/primops.txt.pp @@ -2596,22 +2596,91 @@ primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp vector = ALL_VECTOR_TYPES ------------------------------------------------------------------------ + section "Prefetch" - {Prefetch operations} + {Prefetch operations: Note how every prefetch operation has a name + with the pattern prefetch*N#, where N is either 0,1,2, or 3. + + This suffix number, N, is the "locality level" of the prefetch, following the + convention in GCC and other compilers. + Higher locality numbers correspond to the memory being loaded in more + levels of the cpu cache, and being retained after initial use. 
+ + On the LLVM backend, prefetch*N# uses the LLVM prefetch intrinsic + with locality level N. The code generated by LLVM is target architecture + dependent, but should agree with the GHC NCG on x86 systems. + + On the Sparc and PPC native backends, prefetch*N is a No-Op. + + On the x86 NCG, N=0 will generate prefetchNTA, + N=1 generates prefetcht2, N=2 generates prefetcht1, and + N=3 generates prefetcht0. + + For streaming workloads, the prefetch*0 operations are recommended. + For workloads which do many reads or writes to a memory location in a short period of time, + prefetch*3 operations are recommended. + } ------------------------------------------------------------------------ -primop PrefetchByteArrayOp "prefetchByteArray#" GenPrimOp + +--- the Int# argument for prefetch is the byte offset on the byteArray or Addr# + +--- +primop PrefetchByteArrayOp3 "prefetchByteArray3#" GenPrimOp ByteArray# -> Int# -> ByteArray# - with llvm_only = True + with can_fail = True -primop PrefetchMutableByteArrayOp "prefetchMutableByteArray#" GenPrimOp +primop PrefetchMutableByteArrayOp3 "prefetchMutableByteArray3#" GenPrimOp MutableByteArray# s -> Int# -> State# s -> State# s - with has_side_effects = True - llvm_only = True + with can_fail = True + +primop PrefetchAddrOp3 "prefetchAddr3#" GenPrimOp + Addr# -> Int# -> Addr# + with can_fail = True -primop PrefetchAddrOp "prefetchAddr#" GenPrimOp +---- + +primop PrefetchByteArrayOp2 "prefetchByteArray2#" GenPrimOp + ByteArray# -> Int# -> ByteArray# + with can_fail = True + +primop PrefetchMutableByteArrayOp2 "prefetchMutableByteArray2#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> State# s + with can_fail = True + +primop PrefetchAddrOp2 "prefetchAddr2#" GenPrimOp Addr# -> Int# -> Addr# - with llvm_only = True + with can_fail = True + +---- + +primop PrefetchByteArrayOp1 "prefetchByteArray1#" GenPrimOp + ByteArray# -> Int# -> ByteArray# + with can_fail = True + +primop PrefetchMutableByteArrayOp1 
"prefetchMutableByteArray1#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> State# s + with can_fail = True + +primop PrefetchAddrOp1 "prefetchAddr1#" GenPrimOp + Addr# -> Int# -> Addr# + with can_fail = True + +---- + +primop PrefetchByteArrayOp0 "prefetchByteArray0#" GenPrimOp + ByteArray# -> Int# -> ByteArray# + with can_fail = True + +primop PrefetchMutableByteArrayOp0 "prefetchMutableByteArray0#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> State# s + with can_fail = True + +primop PrefetchAddrOp0 "prefetchAddr0#" GenPrimOp + Addr# -> Int# -> Addr# + with can_fail = True + + ------------------------------------------------------------------------ --- --- |