Author:    Johan Tibell <johan.tibell@gmail.com>  2014-06-27 13:48:24 +0200
Committer: Johan Tibell <johan.tibell@gmail.com>  2014-06-30 22:12:45 +0200
Commit:    4ee4ab01c1d97845aecb7707ad2f9a80933e7a49
Tree:      08991e97b0109b81447968ba62e4753e8209a738
Parent:    aed1723f97e0539d5ab35222b180c1552a5f4cfc
Re-add more primops for atomic ops on byte arrays
This is the second attempt to add this functionality. The first
attempt was reverted in 950fcae46a82569e7cd1fba1637a23b419e00ecd due
to a register allocator failure on x86. Given how the register
allocator currently works, we don't have enough registers on x86 to
support cmpxchg with complicated addressing modes, so instead we fall
back to a simpler addressing mode on x86.
Adds the following primops:
* atomicReadIntArray#
* atomicWriteIntArray#
* fetchSubIntArray#
* fetchOrIntArray#
* fetchXorIntArray#
* fetchAndIntArray#
Makes these pre-existing out-of-line primops inline:
* fetchAddIntArray#
* casIntArray#
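As a quick illustration of how the new primops are called from Haskell, here is a minimal usage sketch in the style of the wrappers in the AtomicPrimops test added by this patch. The module name, the MBA wrapper type and the function names are invented for the example, not part of the patch; the primop types are the ones documented in primops.txt.pp below.

{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
module AtomicSketch where

import GHC.Exts
import GHC.IO

-- Boxed wrapper around an unlifted mutable byte array (illustrative only).
data MBA = MBA (MutableByteArray# RealWorld)

-- | Atomically add n to the Int element at index ix, returning the
-- value of the element before the addition. Implies a full memory barrier.
fetchAddInt :: MBA -> Int -> Int -> IO Int
fetchAddInt (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
  case fetchAddIntArray# mba# ix# n# s# of
    (# s2#, old# #) -> (# s2#, I# old# #)

-- | Atomic read of the Int element at index ix. Implies a full memory barrier.
atomicReadInt :: MBA -> Int -> IO Int
atomicReadInt (MBA mba#) (I# ix#) = IO $ \ s# ->
  case atomicReadIntArray# mba# ix# s# of
    (# s2#, n# #) -> (# s2#, I# n# #)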
 compiler/cmm/CmmMachOp.hs | 19
 compiler/cmm/CmmSink.hs | 4
 compiler/cmm/PprC.hs | 4
 compiler/codeGen/StgCmmPrim.hs | 94
 compiler/llvmGen/Llvm/AbsSyn.hs | 7
 compiler/llvmGen/Llvm/PpLlvm.hs | 18
 compiler/llvmGen/LlvmCodeGen/CodeGen.hs | 71
 compiler/nativeGen/CPrim.hs | 50
 compiler/nativeGen/PPC/CodeGen.hs | 4
 compiler/nativeGen/SPARC/CodeGen.hs | 4
 compiler/nativeGen/X86/CodeGen.hs | 110
 compiler/nativeGen/X86/Instr.hs | 38
 compiler/nativeGen/X86/Ppr.hs | 8
 compiler/prelude/primops.txt.pp | 76
 includes/stg/MiscClosures.h | 1
 libraries/ghc-prim/cbits/atomic.c | 306
 libraries/ghc-prim/ghc-prim.cabal | 1
 rts/Linker.c | 1
 rts/PrimOps.cmm | 12
 testsuite/tests/concurrent/should_run/AtomicPrimops.hs | 245
 testsuite/tests/concurrent/should_run/AtomicPrimops.stdout | 7
 testsuite/tests/concurrent/should_run/all.T | 1
22 files changed, 1027 insertions, 54 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index c4ec393ad6..d8ce492de1 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -19,6 +19,9 @@ module CmmMachOp -- CallishMachOp , CallishMachOp(..), callishMachOpHints , pprCallishMachOp + + -- Atomic read-modify-write + , AtomicMachOp(..) ) where @@ -547,8 +550,24 @@ data CallishMachOp | MO_PopCnt Width | MO_BSwap Width + + -- Atomic read-modify-write. + | MO_AtomicRMW Width AtomicMachOp + | MO_AtomicRead Width + | MO_AtomicWrite Width + | MO_Cmpxchg Width deriving (Eq, Show) +-- | The operation to perform atomically. +data AtomicMachOp = + AMO_Add + | AMO_Sub + | AMO_And + | AMO_Nand + | AMO_Or + | AMO_Xor + deriving (Eq, Show) + pprCallishMachOp :: CallishMachOp -> SDoc pprCallishMachOp mo = text (show mo) diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs index 4c025425ab..4dced9afd2 100644 --- a/compiler/cmm/CmmSink.hs +++ b/compiler/cmm/CmmSink.hs @@ -650,6 +650,10 @@ data AbsMem -- perhaps we ought to have a special annotation for calls that can -- modify heap/stack memory. For now we just use the conservative -- definition here. +-- +-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and +-- therefore we should never float any memory operations across one of +-- these calls. bothMems :: AbsMem -> AbsMem -> AbsMem diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index 47b247e278..455c79ba02 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -753,6 +753,10 @@ pprCallishMachOp_for_C mop MO_Memmove -> ptext (sLit "memmove") (MO_BSwap w) -> ptext (sLit $ bSwapLabel w) (MO_PopCnt w) -> ptext (sLit $ popCntLabel w) + (MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop) + (MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w) + (MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w) + (MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w) (MO_UF_Conv w) -> ptext (sLit $ word2FloatLabel w) MO_S_QuotRem {} -> unsupported diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 40a5e3649b..e4c682bf02 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -769,6 +769,25 @@ emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args +-- Atomic read-modify-write +emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Add mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Sub mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_And mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Nand mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Or mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Xor mba ix (bWord dflags) n +emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] = + doAtomicReadByteArray res mba ix (bWord dflags) +emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] = + doAtomicWriteByteArray mba ix (bWord dflags) val +emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] = + doCasByteArray res mba ix (bWord dflags) old new -- The rest just translate straightforwardly emitPrimOp dflags [res] op 
[arg] @@ -1933,6 +1952,81 @@ doWriteSmallPtrArrayOp addr idx val = do emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) ------------------------------------------------------------------------------ +-- Atomic read-modify-write + +-- | Emit an atomic modification to a byte array element. The result +-- reg contains that previous value of the element. Implies a full +-- memory barrier. +doAtomicRMW :: LocalReg -- ^ Result reg + -> AtomicMachOp -- ^ Atomic op (e.g. add) + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Op argument (e.g. amount to add) + -> FCode () +doAtomicRMW res amop mba idx idx_ty n = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRMW width amop) + [ addr, n ] + +-- | Emit an atomic read to a byte array that acts as a memory barrier. +doAtomicReadByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> FCode () +doAtomicReadByteArray res mba idx idx_ty = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_AtomicRead width) + [ addr ] + +-- | Emit an atomic write to a byte array that acts as a memory barrier. +doAtomicWriteByteArray + :: CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Value to write + -> FCode () +doAtomicWriteByteArray mba idx idx_ty val = do + dflags <- getDynFlags + let width = typeWidth idx_ty + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ {- no results -} ] + (MO_AtomicWrite width) + [ addr, val ] + +doCasByteArray + :: LocalReg -- ^ Result reg + -> CmmExpr -- ^ MutableByteArray# + -> CmmExpr -- ^ Index + -> CmmType -- ^ Type of element by which we are indexing + -> CmmExpr -- ^ Old value + -> CmmExpr -- ^ New value + -> FCode () +doCasByteArray res mba idx idx_ty old new = do + dflags <- getDynFlags + let width = (typeWidth idx_ty) + addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags) + width mba idx + emitPrimCall + [ res ] + (MO_Cmpxchg width) + [ addr, old, new ] + +------------------------------------------------------------------------------ -- Helpers for emitting function calls -- | Emit a call to @memcpy@. diff --git a/compiler/llvmGen/Llvm/AbsSyn.hs b/compiler/llvmGen/Llvm/AbsSyn.hs index f92bd89c5c..24d0856ea3 100644 --- a/compiler/llvmGen/Llvm/AbsSyn.hs +++ b/compiler/llvmGen/Llvm/AbsSyn.hs @@ -65,6 +65,8 @@ data LlvmFunction = LlvmFunction { type LlvmFunctions = [LlvmFunction] +type SingleThreaded = Bool + -- | LLVM ordering types for synchronization purposes. (Introduced in LLVM -- 3.0). Please see the LLVM documentation for a better description. data LlvmSyncOrdering @@ -224,6 +226,11 @@ data LlvmExpression | Load LlvmVar {- | + Atomic load of the value at location ptr + -} + | ALoad LlvmSyncOrdering SingleThreaded LlvmVar + + {- | Navigate in an structure, selecting elements * inbound: Is the pointer inbounds? 
(computed pointer doesn't overflow) * ptr: Location of the structure diff --git a/compiler/llvmGen/Llvm/PpLlvm.hs b/compiler/llvmGen/Llvm/PpLlvm.hs index 025078226d..73077257f8 100644 --- a/compiler/llvmGen/Llvm/PpLlvm.hs +++ b/compiler/llvmGen/Llvm/PpLlvm.hs @@ -239,6 +239,7 @@ ppLlvmExpression expr Insert vec elt idx -> ppInsert vec elt idx GetElemPtr inb ptr indexes -> ppGetElementPtr inb ptr indexes Load ptr -> ppLoad ptr + ALoad ord st ptr -> ppALoad ord st ptr Malloc tp amount -> ppMalloc tp amount Phi tp precessors -> ppPhi tp precessors Asm asm c ty v se sk -> ppAsm asm c ty v se sk @@ -327,13 +328,18 @@ ppSyncOrdering SyncSeqCst = text "seq_cst" -- of specifying alignment. ppLoad :: LlvmVar -> SDoc -ppLoad var - | isVecPtrVar var = text "load" <+> ppr var <> - comma <+> text "align 1" - | otherwise = text "load" <+> ppr var +ppLoad var = text "load" <+> ppr var <> align where - isVecPtrVar :: LlvmVar -> Bool - isVecPtrVar = isVector . pLower . getVarType + align | isVector . pLower . getVarType $ var = text ", align 1" + | otherwise = empty + +ppALoad :: LlvmSyncOrdering -> SingleThreaded -> LlvmVar -> SDoc +ppALoad ord st var = sdocWithDynFlags $ \dflags -> + let alignment = (llvmWidthInBits dflags $ getVarType var) `quot` 8 + align = text ", align" <+> ppr alignment + sThreaded | st = text " singlethread" + | otherwise = empty + in text "load atomic" <+> ppr var <> sThreaded <+> ppSyncOrdering ord <> align ppStore :: LlvmVar -> LlvmVar -> SDoc ppStore val dst diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index 517553516b..4a56600937 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -15,6 +15,7 @@ import BlockId import CodeGen.Platform ( activeStgRegs, callerSaves ) import CLabel import Cmm +import CPrim import PprCmm import CmmUtils import Hoopl @@ -32,6 +33,7 @@ import Unique import Data.List ( nub ) import Data.Maybe ( catMaybes ) +type Atomic = Bool type LlvmStatements = OrdList LlvmStatement -- ----------------------------------------------------------------------------- @@ -228,6 +230,17 @@ genCall t@(PrimTarget (MO_PopCnt w)) dsts args = genCall t@(PrimTarget (MO_BSwap w)) dsts args = genCallSimpleCast w t dsts args +genCall (PrimTarget (MO_AtomicRead _)) [dst] [addr] = do + dstV <- getCmmReg (CmmLocal dst) + (v1, stmts, top) <- genLoad True addr (localRegType dst) + let stmt1 = Store v1 dstV + return (stmts `snocOL` stmt1, top) + +-- TODO: implement these properly rather than calling to RTS functions. +-- genCall t@(PrimTarget (MO_AtomicWrite width)) [] [addr, val] = undefined +-- genCall t@(PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = undefined +-- genCall t@(PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = undefined + -- Handle memcpy function specifically since llvm's intrinsic version takes -- some extra parameters. 
genCall t@(PrimTarget op) [] args' @@ -548,7 +561,6 @@ cmmPrimOpFunctions mop = do (MO_Prefetch_Data _ )-> fsLit "llvm.prefetch" - MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported MO_U_QuotRem2 {} -> unsupported @@ -558,6 +570,12 @@ cmmPrimOpFunctions mop = do MO_Touch -> unsupported MO_UF_Conv _ -> unsupported + MO_AtomicRead _ -> unsupported + + MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop + MO_Cmpxchg w -> fsLit $ cmpxchgLabel w + MO_AtomicWrite w -> fsLit $ atomicWriteLabel w + -- | Tail function calls genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData @@ -849,7 +867,7 @@ exprToVarOpt opt e = case e of -> genLit opt lit CmmLoad e' ty - -> genLoad e' ty + -> genLoad False e' ty -- Cmmreg in expression is the value, so must load. If you want actual -- reg pointer, call getCmmReg directly. @@ -1268,41 +1286,41 @@ genMachOp_slow _ _ _ = panic "genMachOp: More then 2 expressions in MachOp!" -- | Handle CmmLoad expression. -genLoad :: CmmExpr -> CmmType -> LlvmM ExprData +genLoad :: Atomic -> CmmExpr -> CmmType -> LlvmM ExprData -- First we try to detect a few common cases and produce better code for -- these then the default case. We are mostly trying to detect Cmm code -- like I32[Sp + n] and use 'getelementptr' operations instead of the -- generic case that uses casts and pointer arithmetic -genLoad e@(CmmReg (CmmGlobal r)) ty - = genLoad_fast e r 0 ty +genLoad atomic e@(CmmReg (CmmGlobal r)) ty + = genLoad_fast atomic e r 0 ty -genLoad e@(CmmRegOff (CmmGlobal r) n) ty - = genLoad_fast e r n ty +genLoad atomic e@(CmmRegOff (CmmGlobal r) n) ty + = genLoad_fast atomic e r n ty -genLoad e@(CmmMachOp (MO_Add _) [ +genLoad atomic e@(CmmMachOp (MO_Add _) [ (CmmReg (CmmGlobal r)), (CmmLit (CmmInt n _))]) ty - = genLoad_fast e r (fromInteger n) ty + = genLoad_fast atomic e r (fromInteger n) ty -genLoad e@(CmmMachOp (MO_Sub _) [ +genLoad atomic e@(CmmMachOp (MO_Sub _) [ (CmmReg (CmmGlobal r)), (CmmLit (CmmInt n _))]) ty - = genLoad_fast e r (negate $ fromInteger n) ty + = genLoad_fast atomic e r (negate $ fromInteger n) ty -- generic case -genLoad e ty +genLoad atomic e ty = do other <- getTBAAMeta otherN - genLoad_slow e ty other + genLoad_slow atomic e ty other -- | Handle CmmLoad expression. -- This is a special case for loading from a global register pointer -- offset such as I32[Sp+8]. -genLoad_fast :: CmmExpr -> GlobalReg -> Int -> CmmType - -> LlvmM ExprData -genLoad_fast e r n ty = do +genLoad_fast :: Atomic -> CmmExpr -> GlobalReg -> Int -> CmmType + -> LlvmM ExprData +genLoad_fast atomic e r n ty = do dflags <- getDynFlags (gv, grt, s1) <- getCmmRegVal (CmmGlobal r) meta <- getTBAARegMeta r @@ -1315,7 +1333,7 @@ genLoad_fast e r n ty = do case grt == ty' of -- were fine True -> do - (var, s3) <- doExpr ty' (MExpr meta $ Load ptr) + (var, s3) <- doExpr ty' (MExpr meta $ loadInstr ptr) return (var, s1 `snocOL` s2 `snocOL` s3, []) @@ -1323,32 +1341,34 @@ genLoad_fast e r n ty = do False -> do let pty = pLift ty' (ptr', s3) <- doExpr pty $ Cast LM_Bitcast ptr pty - (var, s4) <- doExpr ty' (MExpr meta $ Load ptr') + (var, s4) <- doExpr ty' (MExpr meta $ loadInstr ptr') return (var, s1 `snocOL` s2 `snocOL` s3 `snocOL` s4, []) -- If its a bit type then we use the slow method since -- we can't avoid casting anyway. - False -> genLoad_slow e ty meta - + False -> genLoad_slow atomic e ty meta + where + loadInstr ptr | atomic = ALoad SyncSeqCst False ptr + | otherwise = Load ptr -- | Handle Cmm load expression. -- Generic case. Uses casts and pointer arithmetic if needed. 
-genLoad_slow :: CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData -genLoad_slow e ty meta = do +genLoad_slow :: Atomic -> CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData +genLoad_slow atomic e ty meta = do (iptr, stmts, tops) <- exprToVar e dflags <- getDynFlags case getVarType iptr of LMPointer _ -> do (dvar, load) <- doExpr (cmmToLlvmType ty) - (MExpr meta $ Load iptr) + (MExpr meta $ loadInstr iptr) return (dvar, stmts `snocOL` load, tops) i@(LMInt _) | i == llvmWord dflags -> do let pty = LMPointer $ cmmToLlvmType ty (ptr, cast) <- doExpr pty $ Cast LM_Inttoptr iptr pty (dvar, load) <- doExpr (cmmToLlvmType ty) - (MExpr meta $ Load ptr) + (MExpr meta $ loadInstr ptr) return (dvar, stmts `snocOL` cast `snocOL` load, tops) other -> do dflags <- getDynFlags @@ -1357,6 +1377,9 @@ genLoad_slow e ty meta = do "Size of Ptr: " ++ show (llvmPtrBits dflags) ++ ", Size of var: " ++ show (llvmWidthInBits dflags other) ++ ", Var: " ++ showSDoc dflags (ppr iptr))) + where + loadInstr ptr | atomic = ALoad SyncSeqCst False ptr + | otherwise = Load ptr -- | Handle CmmReg expression. This will return a pointer to the stack diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs index a6f4cab7bd..34782dfc1c 100644 --- a/compiler/nativeGen/CPrim.hs +++ b/compiler/nativeGen/CPrim.hs @@ -1,11 +1,16 @@ -- | Generating C symbol names emitted by the compiler. module CPrim - ( popCntLabel + ( atomicReadLabel + , atomicWriteLabel + , atomicRMWLabel + , cmpxchgLabel + , popCntLabel , bSwapLabel , word2FloatLabel ) where import CmmType +import CmmMachOp import Outputable popCntLabel :: Width -> String @@ -31,3 +36,46 @@ word2FloatLabel w = "hs_word2float" ++ pprWidth w pprWidth W32 = "32" pprWidth W64 = "64" pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w) + +atomicRMWLabel :: Width -> AtomicMachOp -> String +atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w) + + pprFunName AMO_Add = "add" + pprFunName AMO_Sub = "sub" + pprFunName AMO_And = "and" + pprFunName AMO_Nand = "nand" + pprFunName AMO_Or = "or" + pprFunName AMO_Xor = "xor" + +cmpxchgLabel :: Width -> String +cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w) + +atomicReadLabel :: Width -> String +atomicReadLabel w = "hs_atomicread" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w) + +atomicWriteLabel :: Width -> String +atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w + where + pprWidth W8 = "8" + pprWidth W16 = "16" + pprWidth W32 = "32" + pprWidth W64 = "64" + pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w) diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs index 91651e6065..22a2c7cb6a 100644 --- a/compiler/nativeGen/PPC/CodeGen.hs +++ b/compiler/nativeGen/PPC/CodeGen.hs @@ -1160,6 +1160,10 @@ genCCall' dflags gcp target dest_regs args0 MO_BSwap w -> (fsLit $ bSwapLabel w, False) MO_PopCnt w -> (fsLit $ popCntLabel w, False) + MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False) + MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False) + MO_AtomicRead w -> (fsLit $ 
atomicReadLabel w, False) + MO_AtomicWrite w -> (fsLit $ atomicWriteLabel w, False) MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs index f5e61d0a8f..51f89d629f 100644 --- a/compiler/nativeGen/SPARC/CodeGen.hs +++ b/compiler/nativeGen/SPARC/CodeGen.hs @@ -654,6 +654,10 @@ outOfLineMachOp_table mop MO_BSwap w -> fsLit $ bSwapLabel w MO_PopCnt w -> fsLit $ popCntLabel w + MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop + MO_Cmpxchg w -> fsLit $ cmpxchgLabel w + MO_AtomicRead w -> fsLit $ atomicReadLabel w + MO_AtomicWrite w -> fsLit $ atomicWriteLabel w MO_S_QuotRem {} -> unsupported MO_U_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index fa93767fa3..8e9b49d78d 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1057,6 +1057,18 @@ getAmode' _ expr = do (reg,code) <- getSomeReg expr return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code) +-- | Like 'getAmode', but on 32-bit use simple register addressing +-- (i.e. no index register). This stops us from running out of +-- registers on x86 when using instructions such as cmpxchg, which can +-- use up to three virtual registers and one fixed register. +getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode +getSimpleAmode dflags is32Bit addr + | is32Bit = do + addr_code <- getAnyReg addr + addr_r <- getNewRegNat (intSize (wordWidth dflags)) + let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0) + return $! Amode amode (addr_code addr_r) + | otherwise = getAmode addr x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode x86_complex_amode base index shift offset @@ -1761,6 +1773,99 @@ genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do where lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width)) +genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = do + Amode amode addr_code <- + if amop `elem` [AMO_Add, AMO_Sub] + then getAmode addr + else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg + arg <- getNewRegNat size + arg_code <- getAnyReg n + use_sse2 <- sse2Enabled + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + code <- op_code dst_r arg amode + return $ addr_code `appOL` arg_code arg `appOL` code + where + -- Code for the operation + op_code :: Reg -- Destination reg + -> Reg -- Register containing argument + -> AddrMode -- Address of location to mutate + -> NatM (OrdList Instr) + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. + AMO_Add -> return $ toOL [ LOCK + , XADD size (OpReg arg) (OpAddr amode) + , MOV size (OpReg arg) (OpReg dst_r) + ] + AMO_Sub -> return $ toOL [ NEGI size (OpReg arg) + , LOCK + , XADD size (OpReg arg) (OpAddr amode) + , MOV size (OpReg arg) (OpReg dst_r) + ] + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND size src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND size src dst + , NOT size dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR size src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR size src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. 
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr) + cmpxchg_code instrs = do + lbl <- getBlockIdNat + tmp <- getNewRegNat size + return $ toOL + [ MOV size (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl + , NEWBLOCK lbl + -- Keep old value so we can return it: + , MOV size (OpReg eax) (OpReg dst_r) + , MOV size (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK + , CMPXCHG size (OpReg tmp) (OpAddr amode) + , JXX NE lbl + ] + + size = intSize width + +genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] = do + load_code <- intLoadCode (MOV (intSize width)) addr + let platform = targetPlatform dflags + use_sse2 <- sse2Enabled + return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst))) + +genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do + assignMem_IntCode (intSize width) addr val + +genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = do + -- On x86 we don't have enough registers to use cmpxchg with a + -- complicated addressing mode, so on that architecture we + -- pre-compute the address first. + Amode amode addr_code <- getSimpleAmode dflags is32Bit addr + newval <- getNewRegNat size + newval_code <- getAnyReg new + oldval <- getNewRegNat size + oldval_code <- getAnyReg old + use_sse2 <- sse2Enabled + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + code = toOL + [ MOV size (OpReg oldval) (OpReg eax) + , LOCK + , CMPXCHG size (OpReg newval) (OpAddr amode) + , MOV size (OpReg eax) (OpReg dst_r) + ] + return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval + `appOL` code + where + size = intSize width + genCCall _ is32Bit target dest_regs args | is32Bit = genCCall32 target dest_regs args | otherwise = genCCall64 target dest_regs args @@ -2385,6 +2490,11 @@ outOfLineCmmOp mop res args MO_PopCnt _ -> fsLit "popcnt" MO_BSwap _ -> fsLit "bswap" + MO_AtomicRMW _ _ -> fsLit "atomicrmw" + MO_AtomicRead _ -> fsLit "atomicread" + MO_AtomicWrite _ -> fsLit "atomicwrite" + MO_Cmpxchg _ -> fsLit "cmpxchg" + MO_UF_Conv _ -> unsupported MO_S_QuotRem {} -> unsupported diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index 05fff9be96..ac91747171 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -327,6 +327,10 @@ data Instr | PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch -- variant can be NTA, Lvl0, Lvl1, or Lvl2 + | LOCK -- lock prefix + | XADD Size Operand Operand -- src (r), dst (r/m) + | CMPXCHG Size Operand Operand -- src (r), dst (r/m), eax implicit + data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2 @@ -337,6 +341,8 @@ data Operand +-- | Returns which registers are read and written as a (read, written) +-- pair. x86_regUsageOfInstr :: Platform -> Instr -> RegUsage x86_regUsageOfInstr platform instr = case instr of @@ -428,10 +434,21 @@ x86_regUsageOfInstr platform instr -- note: might be a better way to do this PREFETCH _ _ src -> mkRU (use_R src []) [] + LOCK -> noUsage + XADD _ src dst -> usageMM src dst + CMPXCHG _ src dst -> usageRMM src dst (OpReg eax) _other -> panic "regUsage: unrecognised instr" - where + -- # Definitions + -- + -- Written: If the operand is a register, it's written. If it's an + -- address, registers mentioned in the address are read. + -- + -- Modified: If the operand is a register, it's both read and + -- written. 
If it's an address, registers mentioned in the address + -- are read. + -- 2 operand form; first operand Read; second Written usageRW :: Operand -> Operand -> RegUsage usageRW op (OpReg reg) = mkRU (use_R op []) [reg] @@ -444,6 +461,18 @@ x86_regUsageOfInstr platform instr usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea []) usageRM _ _ = panic "X86.RegInfo.usageRM: no match" + -- 2 operand form; first operand Modified; second Modified + usageMM :: Operand -> Operand -> RegUsage + usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst] + usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src] + usageMM _ _ = panic "X86.RegInfo.usageMM: no match" + + -- 3 operand form; first operand Read; second Modified; third Modified + usageRMM :: Operand -> Operand -> Operand -> RegUsage + usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg] + usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg] + usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match" + -- 1 operand form; operand Modified usageM :: Operand -> RegUsage usageM (OpReg reg) = mkRU [reg] [reg] @@ -476,6 +505,7 @@ x86_regUsageOfInstr platform instr where src' = filter (interesting platform) src dst' = filter (interesting platform) dst +-- | Is this register interesting for the register allocator? interesting :: Platform -> Reg -> Bool interesting _ (RegVirtual _) = True interesting platform (RegReal (RealRegSingle i)) = isFastTrue (freeReg platform i) @@ -483,6 +513,8 @@ interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no re +-- | Applies the supplied function to all registers in instructions. +-- Typically used to change virtual registers to real registers. x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr x86_patchRegsOfInstr instr env = case instr of @@ -571,6 +603,10 @@ x86_patchRegsOfInstr instr env PREFETCH lvl size src -> PREFETCH lvl size (patchOp src) + LOCK -> instr + XADD sz src dst -> patch2 (XADD sz) src dst + CMPXCHG sz src dst -> patch2 (CMPXCHG sz) src dst + _other -> panic "patchRegs: unrecognised instr" where diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs index 459c041ba5..7771c02512 100644 --- a/compiler/nativeGen/X86/Ppr.hs +++ b/compiler/nativeGen/X86/Ppr.hs @@ -886,6 +886,14 @@ pprInstr GFREE ptext (sLit "\tffree %st(4) ;ffree %st(5)") ] +-- Atomics + +pprInstr LOCK = ptext (sLit "\tlock") + +pprInstr (XADD size src dst) = pprSizeOpOp (sLit "xadd") size src dst + +pprInstr (CMPXCHG size src dst) = pprSizeOpOp (sLit "cmpxchg") size src dst + pprInstr _ = panic "X86.Ppr.pprInstr: no match" diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp index 4851315eb4..4faa585246 100644 --- a/compiler/prelude/primops.txt.pp +++ b/compiler/prelude/primops.txt.pp @@ -1363,19 +1363,79 @@ primop SetByteArrayOp "setByteArray#" GenPrimOp code_size = { primOpCodeSizeForeignCall + 4 } can_fail = True +-- Atomic operations + +primop AtomicReadByteArrayOp_Int "atomicReadIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #) + {Given an array and an offset in Int units, read an element. The + index is assumed to be in bounds. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop AtomicWriteByteArrayOp_Int "atomicWriteIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> State# s + {Given an array and an offset in Int units, write an element. The + index is assumed to be in bounds. 
Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + primop CasByteArrayOp_Int "casIntArray#" GenPrimOp MutableByteArray# s -> Int# -> Int# -> Int# -> State# s -> (# State# s, Int# #) - {Machine-level atomic compare and swap on a word within a ByteArray.} - with - out_of_line = True - has_side_effects = True + {Given an array, an offset in Int units, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) - {Machine-level word-sized fetch-and-add within a ByteArray.} - with - out_of_line = True - has_side_effects = True + {Given an array, and offset in Int units, and a value to add, + atomically add the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to subtract, + atomically substract the value to the element. Returns the value of + the element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to AND, + atomically AND the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to NAND, + atomically NAND the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to OR, + atomically OR the value to the element. Returns the value of the + element before the operation. Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop FetchXorByteArrayOp_Int "fetchXorIntArray#" GenPrimOp + MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) + {Given an array, and offset in Int units, and a value to XOR, + atomically XOR the value to the element. Returns the value of the + element before the operation. 
Implies a full memory barrier.} + with has_side_effects = True + can_fail = True ------------------------------------------------------------------------ diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h index 0c4d2f9eaf..ee5a119aa1 100644 --- a/includes/stg/MiscClosures.h +++ b/includes/stg/MiscClosures.h @@ -348,7 +348,6 @@ RTS_FUN_DECL(stg_newByteArrayzh); RTS_FUN_DECL(stg_newPinnedByteArrayzh); RTS_FUN_DECL(stg_newAlignedPinnedByteArrayzh); RTS_FUN_DECL(stg_casIntArrayzh); -RTS_FUN_DECL(stg_fetchAddIntArrayzh); RTS_FUN_DECL(stg_newArrayzh); RTS_FUN_DECL(stg_newArrayArrayzh); RTS_FUN_DECL(stg_copyArrayzh); diff --git a/libraries/ghc-prim/cbits/atomic.c b/libraries/ghc-prim/cbits/atomic.c new file mode 100644 index 0000000000..e3d6cc1e95 --- /dev/null +++ b/libraries/ghc-prim/cbits/atomic.c @@ -0,0 +1,306 @@ +#include "Rts.h" + +// Fallbacks for atomic primops on byte arrays. The builtins used +// below are supported on both GCC and LLVM. +// +// Ideally these function would take StgWord8, StgWord16, etc but +// older GCC versions incorrectly assume that the register that the +// argument is passed in has been zero extended, which is incorrect +// according to the ABI and is not what GHC does when it generates +// calls to these functions. + +// FetchAddByteArrayOp_Int + +extern StgWord hs_atomic_add8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_add8(volatile StgWord8 *x, StgWord val) +{ + return __sync_fetch_and_add(x, (StgWord8) val); +} + +extern StgWord hs_atomic_add16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_add16(volatile StgWord16 *x, StgWord val) +{ + return __sync_fetch_and_add(x, (StgWord16) val); +} + +extern StgWord hs_atomic_add32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_add32(volatile StgWord32 *x, StgWord val) +{ + return __sync_fetch_and_add(x, (StgWord32) val); +} + +extern StgWord64 hs_atomic_add64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_add64(volatile StgWord64 *x, StgWord64 val) +{ + return __sync_fetch_and_add(x, val); +} + +// FetchSubByteArrayOp_Int + +extern StgWord hs_atomic_sub8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_sub8(volatile StgWord8 *x, StgWord val) +{ + return __sync_fetch_and_sub(x, (StgWord8) val); +} + +extern StgWord hs_atomic_sub16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_sub16(volatile StgWord16 *x, StgWord val) +{ + return __sync_fetch_and_sub(x, (StgWord16) val); +} + +extern StgWord hs_atomic_sub32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_sub32(volatile StgWord32 *x, StgWord val) +{ + return __sync_fetch_and_sub(x, (StgWord32) val); +} + +extern StgWord64 hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val) +{ + return __sync_fetch_and_sub(x, val); +} + +// FetchAndByteArrayOp_Int + +extern StgWord hs_atomic_and8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_and8(volatile StgWord8 *x, StgWord val) +{ + return __sync_fetch_and_and(x, (StgWord8) val); +} + +extern StgWord hs_atomic_and16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_and16(volatile StgWord16 *x, StgWord val) +{ + return __sync_fetch_and_and(x, (StgWord16) val); +} + +extern StgWord hs_atomic_and32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_and32(volatile StgWord32 *x, StgWord val) +{ + return __sync_fetch_and_and(x, (StgWord32) val); +} + +extern StgWord64 hs_atomic_and64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_and64(volatile 
StgWord64 *x, StgWord64 val) +{ + return __sync_fetch_and_and(x, val); +} + +// FetchNandByteArrayOp_Int + +// Workaround for http://llvm.org/bugs/show_bug.cgi?id=8842 +#define CAS_NAND(x, val) \ + { \ + __typeof__ (*(x)) tmp = *(x); \ + while (!__sync_bool_compare_and_swap(x, tmp, ~(tmp & (val)))) { \ + tmp = *(x); \ + } \ + return tmp; \ + } + +extern StgWord hs_atomic_nand8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_nand8(volatile StgWord8 *x, StgWord val) +{ +#ifdef __clang__ + CAS_NAND(x, (StgWord8) val) +#else + return __sync_fetch_and_nand(x, (StgWord8) val); +#endif +} + +extern StgWord hs_atomic_nand16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_nand16(volatile StgWord16 *x, StgWord val) +{ +#ifdef __clang__ + CAS_NAND(x, (StgWord16) val); +#else + return __sync_fetch_and_nand(x, (StgWord16) val); +#endif +} + +extern StgWord hs_atomic_nand32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_nand32(volatile StgWord32 *x, StgWord val) +{ +#ifdef __clang__ + CAS_NAND(x, (StgWord32) val); +#else + return __sync_fetch_and_nand(x, (StgWord32) val); +#endif +} + +extern StgWord64 hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val) +{ +#ifdef __clang__ + CAS_NAND(x, val); +#else + return __sync_fetch_and_nand(x, val); +#endif +} + +// FetchOrByteArrayOp_Int + +extern StgWord hs_atomic_or8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_or8(volatile StgWord8 *x, StgWord val) +{ + return __sync_fetch_and_or(x, (StgWord8) val); +} + +extern StgWord hs_atomic_or16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_or16(volatile StgWord16 *x, StgWord val) +{ + return __sync_fetch_and_or(x, (StgWord16) val); +} + +extern StgWord hs_atomic_or32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_or32(volatile StgWord32 *x, StgWord val) +{ + return __sync_fetch_and_or(x, (StgWord32) val); +} + +extern StgWord64 hs_atomic_or64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_or64(volatile StgWord64 *x, StgWord64 val) +{ + return __sync_fetch_and_or(x, val); +} + +// FetchXorByteArrayOp_Int + +extern StgWord hs_atomic_xor8(volatile StgWord8 *x, StgWord val); +StgWord +hs_atomic_xor8(volatile StgWord8 *x, StgWord val) +{ + return __sync_fetch_and_xor(x, (StgWord8) val); +} + +extern StgWord hs_atomic_xor16(volatile StgWord16 *x, StgWord val); +StgWord +hs_atomic_xor16(volatile StgWord16 *x, StgWord val) +{ + return __sync_fetch_and_xor(x, (StgWord16) val); +} + +extern StgWord hs_atomic_xor32(volatile StgWord32 *x, StgWord val); +StgWord +hs_atomic_xor32(volatile StgWord32 *x, StgWord val) +{ + return __sync_fetch_and_xor(x, (StgWord32) val); +} + +extern StgWord64 hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val); +StgWord64 +hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val) +{ + return __sync_fetch_and_xor(x, val); +} + +// CasByteArrayOp_Int + +extern StgWord hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new); +StgWord +hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new) +{ + return __sync_val_compare_and_swap(x, (StgWord8) old, (StgWord8) new); +} + +extern StgWord hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new); +StgWord +hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new) +{ + return __sync_val_compare_and_swap(x, (StgWord16) old, (StgWord16) new); +} + +extern StgWord hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new); +StgWord +hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new) +{ + 
return __sync_val_compare_and_swap(x, (StgWord32) old, (StgWord32) new); +} + +extern StgWord hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new); +StgWord +hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new) +{ + return __sync_val_compare_and_swap(x, old, new); +} + +// AtomicReadByteArrayOp_Int + +extern StgWord hs_atomicread8(volatile StgWord8 *x); +StgWord +hs_atomicread8(volatile StgWord8 *x) +{ + return *x; +} + +extern StgWord hs_atomicread16(volatile StgWord16 *x); +StgWord +hs_atomicread16(volatile StgWord16 *x) +{ + return *x; +} + +extern StgWord hs_atomicread32(volatile StgWord32 *x); +StgWord +hs_atomicread32(volatile StgWord32 *x) +{ + return *x; +} + +extern StgWord64 hs_atomicread64(volatile StgWord64 *x); +StgWord64 +hs_atomicread64(volatile StgWord64 *x) +{ + return *x; +} + +// AtomicWriteByteArrayOp_Int + +extern void hs_atomicwrite8(volatile StgWord8 *x, StgWord val); +void +hs_atomicwrite8(volatile StgWord8 *x, StgWord val) +{ + *x = (StgWord8) val; +} + +extern void hs_atomicwrite16(volatile StgWord16 *x, StgWord val); +void +hs_atomicwrite16(volatile StgWord16 *x, StgWord val) +{ + *x = (StgWord16) val; +} + +extern void hs_atomicwrite32(volatile StgWord32 *x, StgWord val); +void +hs_atomicwrite32(volatile StgWord32 *x, StgWord val) +{ + *x = (StgWord32) val; +} + +extern void hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val); +void +hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val) +{ + *x = (StgWord64) val; +} diff --git a/libraries/ghc-prim/ghc-prim.cabal b/libraries/ghc-prim/ghc-prim.cabal index c861342b56..bc9f57126a 100644 --- a/libraries/ghc-prim/ghc-prim.cabal +++ b/libraries/ghc-prim/ghc-prim.cabal @@ -52,6 +52,7 @@ Library exposed-modules: GHC.Prim c-sources: + cbits/atomic.c cbits/bswap.c cbits/debug.c cbits/longlong.c diff --git a/rts/Linker.c b/rts/Linker.c index e5e61bb49a..ad96d74b6f 100644 --- a/rts/Linker.c +++ b/rts/Linker.c @@ -1186,7 +1186,6 @@ typedef struct _RtsSymbolVal { SymI_HasProto(stg_newBCOzh) \ SymI_HasProto(stg_newByteArrayzh) \ SymI_HasProto(stg_casIntArrayzh) \ - SymI_HasProto(stg_fetchAddIntArrayzh) \ SymI_HasProto(stg_newMVarzh) \ SymI_HasProto(stg_newMutVarzh) \ SymI_HasProto(stg_newTVarzh) \ diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 4d7baca824..5f04a6d732 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -151,18 +151,6 @@ stg_casIntArrayzh( gcptr arr, W_ ind, W_ old, W_ new ) } -stg_fetchAddIntArrayzh( gcptr arr, W_ ind, W_ incr ) -/* MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) */ -{ - W_ p, h; - - p = arr + SIZEOF_StgArrWords + WDS(ind); - (h) = ccall atomic_inc(p, incr); - - return(h); -} - - stg_newArrayzh ( W_ n /* words */, gcptr init ) { W_ words, size, p; diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs new file mode 100644 index 0000000000..0c55aba93e --- /dev/null +++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs @@ -0,0 +1,245 @@ +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} + +module Main ( main ) where + +import Control.Concurrent +import Control.Concurrent.MVar +import Control.Monad (when) +import Foreign.Storable +import GHC.Exts +import GHC.IO + +-- | Iterations per worker. 
+iters :: Int +iters = 1000000 + +main :: IO () +main = do + fetchAddSubTest + fetchAndTest + fetchNandTest + fetchOrTest + fetchXorTest + casTest + readWriteTest + +-- | Test fetchAddIntArray# by having two threads concurrenctly +-- increment a counter and then checking the sum at the end. +fetchAddSubTest :: IO () +fetchAddSubTest = do + tot <- race 0 + (\ mba -> work fetchAddIntArray mba iters 2) + (\ mba -> work fetchSubIntArray mba iters 1) + assertEq 1000000 tot "fetchAddSubTest" + where + work :: (MByteArray -> Int -> Int -> IO ()) -> MByteArray -> Int -> Int + -> IO () + work op mba 0 val = return () + work op mba n val = op mba 0 val >> work op mba (n-1) val + +-- | Test fetchXorIntArray# by having two threads concurrenctly XORing +-- and then checking the result at the end. Works since XOR is +-- commutative. +-- +-- Covers the code paths for AND, NAND, and OR as well. +fetchXorTest :: IO () +fetchXorTest = do + res <- race n0 + (\ mba -> work mba iters t1pat) + (\ mba -> work mba iters t2pat) + assertEq expected res "fetchXorTest" + where + work :: MByteArray -> Int -> Int -> IO () + work mba 0 val = return () + work mba n val = fetchXorIntArray mba 0 val >> work mba (n-1) val + + -- Initial value is a large prime and the two patterns are 1010... + -- and 0101... + (n0, t1pat, t2pat) + | sizeOf (undefined :: Int) == 8 = + (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999) + | otherwise = (0x0000ffff, 0x55555555, 0x99999999) + expected + | sizeOf (undefined :: Int) == 8 = 4294967295 + | otherwise = 65535 + +-- The tests for AND, NAND, and OR are trivial for two reasons: +-- +-- * The code path is already well exercised by 'fetchXorTest'. +-- +-- * It's harder to test these operations, as a long sequence of them +-- convert to a single value but we'd like to write a test in the +-- style of 'fetchXorTest' that applies the operation repeatedly, +-- to make it likely that any race conditions are detected. +-- +-- Right now we only test that they return the correct value for a +-- single op on each thread. + +fetchOpTest :: (MByteArray -> Int -> Int -> IO ()) + -> Int -> String -> IO () +fetchOpTest op expected name = do + res <- race n0 + (\ mba -> work mba t1pat) + (\ mba -> work mba t2pat) + assertEq expected res name + where + work :: MByteArray -> Int -> IO () + work mba val = op mba 0 val + + -- Initial value is a large prime and the two patterns are 1010... + -- and 0101... + (n0, t1pat, t2pat) + | sizeOf (undefined :: Int) == 8 = + (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999) + | otherwise = (0x0000ffff, 0x55555555, 0x99999999) + +fetchAndTest :: IO () +fetchAndTest = fetchOpTest fetchAndIntArray expected "fetchAndTest" + where expected + | sizeOf (undefined :: Int) == 8 = 286331153 + | otherwise = 4369 + +fetchNandTest :: IO () +fetchNandTest = fetchOpTest fetchNandIntArray expected "fetchNandTest" + where expected + | sizeOf (undefined :: Int) == 8 = 7378697629770151799 + | otherwise = -2576976009 + +fetchOrTest :: IO () +fetchOrTest = fetchOpTest fetchOrIntArray expected "fetchOrTest" + where expected + | sizeOf (undefined :: Int) == 8 = 15987178197787607039 + | otherwise = 3722313727 + +-- | Test casIntArray# by using it to emulate fetchAddIntArray# and +-- then having two threads concurrenctly increment a counter, +-- checking the sum at the end. 
+casTest :: IO () +casTest = do + tot <- race 0 + (\ mba -> work mba iters 1) + (\ mba -> work mba iters 2) + assertEq 3000000 tot "casTest" + where + work :: MByteArray -> Int -> Int -> IO () + work mba 0 val = return () + work mba n val = add mba 0 val >> work mba (n-1) val + + -- Fetch-and-add implemented using CAS. + add :: MByteArray -> Int -> Int -> IO () + add mba ix n = do + old <- readIntArray mba ix + old' <- casIntArray mba ix old (old + n) + when (old /= old') $ add mba ix n + +-- | Tests atomic reads and writes by making sure that one thread sees +-- updates that are done on another. This test isn't very good at the +-- moment, as this might work even without atomic ops, but at least it +-- exercises the code. +readWriteTest :: IO () +readWriteTest = do + mba <- newByteArray (sizeOf (undefined :: Int)) + writeIntArray mba 0 0 + latch <- newEmptyMVar + done <- newEmptyMVar + forkIO $ do + takeMVar latch + n <- atomicReadIntArray mba 0 + assertEq 1 n "readWriteTest" + putMVar done () + atomicWriteIntArray mba 0 1 + putMVar latch () + takeMVar done + +-- | Create two threads that mutate the byte array passed to them +-- concurrently. The array is one word large. +race :: Int -- ^ Initial value of array element + -> (MByteArray -> IO ()) -- ^ Thread 1 action + -> (MByteArray -> IO ()) -- ^ Thread 2 action + -> IO Int -- ^ Final value of array element +race n0 thread1 thread2 = do + done1 <- newEmptyMVar + done2 <- newEmptyMVar + mba <- newByteArray (sizeOf (undefined :: Int)) + writeIntArray mba 0 n0 + forkIO $ thread1 mba >> putMVar done1 () + forkIO $ thread2 mba >> putMVar done2 () + mapM_ takeMVar [done1, done2] + readIntArray mba 0 + +------------------------------------------------------------------------ +-- Test helper + +assertEq :: (Eq a, Show a) => a -> a -> String -> IO () +assertEq expected actual name + | expected == actual = putStrLn $ name ++ ": OK" + | otherwise = do + putStrLn $ name ++ ": FAIL" + putStrLn $ "Expected: " ++ show expected + putStrLn $ " Actual: " ++ show actual + +------------------------------------------------------------------------ +-- Wrappers around MutableByteArray# + +data MByteArray = MBA (MutableByteArray# RealWorld) + +fetchAddIntArray :: MByteArray -> Int -> Int -> IO () +fetchAddIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchAddIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +fetchSubIntArray :: MByteArray -> Int -> Int -> IO () +fetchSubIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchSubIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +fetchAndIntArray :: MByteArray -> Int -> Int -> IO () +fetchAndIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchAndIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +fetchNandIntArray :: MByteArray -> Int -> Int -> IO () +fetchNandIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchNandIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +fetchOrIntArray :: MByteArray -> Int -> Int -> IO () +fetchOrIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchOrIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +fetchXorIntArray :: MByteArray -> Int -> Int -> IO () +fetchXorIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case fetchXorIntArray# mba# ix# n# s# of + (# s2#, _ #) -> (# s2#, () #) + +newByteArray :: Int -> IO MByteArray +newByteArray (I# n#) = IO $ \ s# -> + case newByteArray# n# s# of + (# s2#, mba# #) -> (# s2#, MBA mba# #) + +writeIntArray :: MByteArray -> Int -> 
Int -> IO () +writeIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case writeIntArray# mba# ix# n# s# of + s2# -> (# s2#, () #) + +readIntArray :: MByteArray -> Int -> IO Int +readIntArray (MBA mba#) (I# ix#) = IO $ \ s# -> + case readIntArray# mba# ix# s# of + (# s2#, n# #) -> (# s2#, I# n# #) + +atomicWriteIntArray :: MByteArray -> Int -> Int -> IO () +atomicWriteIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# -> + case atomicWriteIntArray# mba# ix# n# s# of + s2# -> (# s2#, () #) + +atomicReadIntArray :: MByteArray -> Int -> IO Int +atomicReadIntArray (MBA mba#) (I# ix#) = IO $ \ s# -> + case atomicReadIntArray# mba# ix# s# of + (# s2#, n# #) -> (# s2#, I# n# #) + +casIntArray :: MByteArray -> Int -> Int -> Int -> IO Int +casIntArray (MBA mba#) (I# ix#) (I# old#) (I# new#) = IO $ \ s# -> + case casIntArray# mba# ix# old# new# s# of + (# s2#, old2# #) -> (# s2#, I# old2# #) diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout new file mode 100644 index 0000000000..c37041a040 --- /dev/null +++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout @@ -0,0 +1,7 @@ +fetchAddSubTest: OK +fetchAndTest: OK +fetchNandTest: OK +fetchOrTest: OK +fetchXorTest: OK +casTest: OK +readWriteTest: OK diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T index 0b502c3bc7..0a66892d82 100644 --- a/testsuite/tests/concurrent/should_run/all.T +++ b/testsuite/tests/concurrent/should_run/all.T @@ -81,6 +81,7 @@ test('tryReadMVar1', normal, compile_and_run, ['']) test('tryReadMVar2', normal, compile_and_run, ['']) test('T7970', normal, compile_and_run, ['']) +test('AtomicPrimops', normal, compile_and_run, ['']) # ----------------------------------------------------------------------------- # These tests we only do for a full run |
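The compare-and-swap retry pattern that the X86 backend uses for the AND/NAND/OR/XOR cases (the cmpxchg_code loop above), and that the casTest helper uses to emulate fetch-and-add, can also be written at the Haskell level on top of casIntArray#. The sketch below is illustrative only, assuming the primop behaviour documented in this patch; MBA, readInt, casInt and atomicModifyInt are names invented for the example, not part of the patch.

{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
module CasRetrySketch where

import GHC.Exts
import GHC.IO

-- Boxed wrapper around an unlifted mutable byte array (illustrative only).
data MBA = MBA (MutableByteArray# RealWorld)

-- Plain (non-atomic) read of an Int element.
readInt :: MBA -> Int -> IO Int
readInt (MBA mba#) (I# ix#) = IO $ \ s# ->
  case readIntArray# mba# ix# s# of
    (# s2#, n# #) -> (# s2#, I# n# #)

-- Compare-and-swap; returns the value of the element before the operation.
casInt :: MBA -> Int -> Int -> Int -> IO Int
casInt (MBA mba#) (I# ix#) (I# old#) (I# new#) = IO $ \ s# ->
  case casIntArray# mba# ix# old# new# s# of
    (# s2#, prev# #) -> (# s2#, I# prev# #)

-- | Apply f atomically to the element at ix by retrying the CAS until
-- no other thread races with the update; returns the pre-update value.
atomicModifyInt :: MBA -> Int -> (Int -> Int) -> IO Int
atomicModifyInt mba ix f = do
  old  <- readInt mba ix
  prev <- casInt mba ix old (f old)
  if prev == old then return old else atomicModifyInt mba ix f

For example, atomicModifyInt mba 0 (+ 1) behaves like fetchAddIntArray# on element 0 (modulo performance), which is essentially how casTest above exercises the CAS primop.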