summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Tibell <johan.tibell@gmail.com>2014-06-09 11:43:21 +0200
committerJohan Tibell <johan.tibell@gmail.com>2014-06-24 19:06:53 +0200
commitd8abf85f8ca176854e9d5d0b12371c4bc402aac3 (patch)
tree9542f2b8a7ee9b9759396e4172ad5fc4ce1d2f3c
parenta4a79b5a04658ac542b1e07a6975b488fd589441 (diff)
downloadhaskell-d8abf85f8ca176854e9d5d0b12371c4bc402aac3.tar.gz
Add more primops for atomic ops on byte arrays
Summary: Add more primops for atomic ops on byte arrays Adds the following primops: * atomicReadIntArray# * atomicWriteIntArray# * fetchSubIntArray# * fetchOrIntArray# * fetchXorIntArray# * fetchAndIntArray# Makes these pre-existing out-of-line primops inline: * fetchAddIntArray# * casIntArray#
-rw-r--r--compiler/cmm/CmmMachOp.hs19
-rw-r--r--compiler/cmm/CmmSink.hs4
-rw-r--r--compiler/cmm/PprC.hs4
-rw-r--r--compiler/codeGen/StgCmmPrim.hs94
-rw-r--r--compiler/llvmGen/Llvm/AbsSyn.hs7
-rw-r--r--compiler/llvmGen/Llvm/PpLlvm.hs18
-rw-r--r--compiler/llvmGen/LlvmCodeGen/CodeGen.hs71
-rw-r--r--compiler/nativeGen/CPrim.hs50
-rw-r--r--compiler/nativeGen/PPC/CodeGen.hs4
-rw-r--r--compiler/nativeGen/SPARC/CodeGen.hs4
-rw-r--r--compiler/nativeGen/X86/CodeGen.hs92
-rw-r--r--compiler/nativeGen/X86/Instr.hs38
-rw-r--r--compiler/nativeGen/X86/Ppr.hs8
-rw-r--r--compiler/prelude/primops.txt.pp76
-rw-r--r--includes/stg/MiscClosures.h1
-rw-r--r--libraries/ghc-prim/cbits/atomic.c280
-rw-r--r--libraries/ghc-prim/ghc-prim.cabal1
-rw-r--r--rts/Linker.c1
-rw-r--r--rts/PrimOps.cmm12
-rw-r--r--testsuite/tests/concurrent/should_run/.gitignore1
-rw-r--r--testsuite/tests/concurrent/should_run/AtomicPrimops.hs245
-rw-r--r--testsuite/tests/concurrent/should_run/AtomicPrimops.stdout7
-rw-r--r--testsuite/tests/concurrent/should_run/all.T1
23 files changed, 984 insertions, 54 deletions
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index c4ec393ad6..d8ce492de1 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -19,6 +19,9 @@ module CmmMachOp
-- CallishMachOp
, CallishMachOp(..), callishMachOpHints
, pprCallishMachOp
+
+ -- Atomic read-modify-write
+ , AtomicMachOp(..)
)
where
@@ -547,8 +550,24 @@ data CallishMachOp
| MO_PopCnt Width
| MO_BSwap Width
+
+ -- Atomic read-modify-write.
+ | MO_AtomicRMW Width AtomicMachOp
+ | MO_AtomicRead Width
+ | MO_AtomicWrite Width
+ | MO_Cmpxchg Width
deriving (Eq, Show)
+-- | The operation to perform atomically.
+data AtomicMachOp =
+ AMO_Add
+ | AMO_Sub
+ | AMO_And
+ | AMO_Nand
+ | AMO_Or
+ | AMO_Xor
+ deriving (Eq, Show)
+
pprCallishMachOp :: CallishMachOp -> SDoc
pprCallishMachOp mo = text (show mo)
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 4c025425ab..4dced9afd2 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -650,6 +650,10 @@ data AbsMem
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory. For now we just use the conservative
-- definition here.
+--
+-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
+-- therefore we should never float any memory operations across one of
+-- these calls.
bothMems :: AbsMem -> AbsMem -> AbsMem
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 47b247e278..455c79ba02 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -753,6 +753,10 @@ pprCallishMachOp_for_C mop
MO_Memmove -> ptext (sLit "memmove")
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
+ (MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
+ (MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w)
+ (MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w)
+ (MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w)
(MO_UF_Conv w) -> ptext (sLit $ word2FloatLabel w)
MO_S_QuotRem {} -> unsupported
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index 40a5e3649b..e4c682bf02 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -769,6 +769,25 @@ emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res
emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args
emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args
+-- Atomic read-modify-write
+emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_Add mba ix (bWord dflags) n
+emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_Sub mba ix (bWord dflags) n
+emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_And mba ix (bWord dflags) n
+emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_Nand mba ix (bWord dflags) n
+emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_Or mba ix (bWord dflags) n
+emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] =
+ doAtomicRMW res AMO_Xor mba ix (bWord dflags) n
+emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] =
+ doAtomicReadByteArray res mba ix (bWord dflags)
+emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] =
+ doAtomicWriteByteArray mba ix (bWord dflags) val
+emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] =
+ doCasByteArray res mba ix (bWord dflags) old new
-- The rest just translate straightforwardly
emitPrimOp dflags [res] op [arg]
@@ -1933,6 +1952,81 @@ doWriteSmallPtrArrayOp addr idx val = do
emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
------------------------------------------------------------------------------
+-- Atomic read-modify-write
+
+-- | Emit an atomic modification to a byte array element. The result
+-- reg contains that previous value of the element. Implies a full
+-- memory barrier.
+doAtomicRMW :: LocalReg -- ^ Result reg
+ -> AtomicMachOp -- ^ Atomic op (e.g. add)
+ -> CmmExpr -- ^ MutableByteArray#
+ -> CmmExpr -- ^ Index
+ -> CmmType -- ^ Type of element by which we are indexing
+ -> CmmExpr -- ^ Op argument (e.g. amount to add)
+ -> FCode ()
+doAtomicRMW res amop mba idx idx_ty n = do
+ dflags <- getDynFlags
+ let width = typeWidth idx_ty
+ addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
+ width mba idx
+ emitPrimCall
+ [ res ]
+ (MO_AtomicRMW width amop)
+ [ addr, n ]
+
+-- | Emit an atomic read to a byte array that acts as a memory barrier.
+doAtomicReadByteArray
+ :: LocalReg -- ^ Result reg
+ -> CmmExpr -- ^ MutableByteArray#
+ -> CmmExpr -- ^ Index
+ -> CmmType -- ^ Type of element by which we are indexing
+ -> FCode ()
+doAtomicReadByteArray res mba idx idx_ty = do
+ dflags <- getDynFlags
+ let width = typeWidth idx_ty
+ addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
+ width mba idx
+ emitPrimCall
+ [ res ]
+ (MO_AtomicRead width)
+ [ addr ]
+
+-- | Emit an atomic write to a byte array that acts as a memory barrier.
+doAtomicWriteByteArray
+ :: CmmExpr -- ^ MutableByteArray#
+ -> CmmExpr -- ^ Index
+ -> CmmType -- ^ Type of element by which we are indexing
+ -> CmmExpr -- ^ Value to write
+ -> FCode ()
+doAtomicWriteByteArray mba idx idx_ty val = do
+ dflags <- getDynFlags
+ let width = typeWidth idx_ty
+ addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
+ width mba idx
+ emitPrimCall
+ [ {- no results -} ]
+ (MO_AtomicWrite width)
+ [ addr, val ]
+
+doCasByteArray
+ :: LocalReg -- ^ Result reg
+ -> CmmExpr -- ^ MutableByteArray#
+ -> CmmExpr -- ^ Index
+ -> CmmType -- ^ Type of element by which we are indexing
+ -> CmmExpr -- ^ Old value
+ -> CmmExpr -- ^ New value
+ -> FCode ()
+doCasByteArray res mba idx idx_ty old new = do
+ dflags <- getDynFlags
+ let width = (typeWidth idx_ty)
+ addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
+ width mba idx
+ emitPrimCall
+ [ res ]
+ (MO_Cmpxchg width)
+ [ addr, old, new ]
+
+------------------------------------------------------------------------------
-- Helpers for emitting function calls
-- | Emit a call to @memcpy@.
diff --git a/compiler/llvmGen/Llvm/AbsSyn.hs b/compiler/llvmGen/Llvm/AbsSyn.hs
index f92bd89c5c..24d0856ea3 100644
--- a/compiler/llvmGen/Llvm/AbsSyn.hs
+++ b/compiler/llvmGen/Llvm/AbsSyn.hs
@@ -65,6 +65,8 @@ data LlvmFunction = LlvmFunction {
type LlvmFunctions = [LlvmFunction]
+type SingleThreaded = Bool
+
-- | LLVM ordering types for synchronization purposes. (Introduced in LLVM
-- 3.0). Please see the LLVM documentation for a better description.
data LlvmSyncOrdering
@@ -224,6 +226,11 @@ data LlvmExpression
| Load LlvmVar
{- |
+ Atomic load of the value at location ptr
+ -}
+ | ALoad LlvmSyncOrdering SingleThreaded LlvmVar
+
+ {- |
Navigate in an structure, selecting elements
* inbound: Is the pointer inbounds? (computed pointer doesn't overflow)
* ptr: Location of the structure
diff --git a/compiler/llvmGen/Llvm/PpLlvm.hs b/compiler/llvmGen/Llvm/PpLlvm.hs
index 025078226d..73077257f8 100644
--- a/compiler/llvmGen/Llvm/PpLlvm.hs
+++ b/compiler/llvmGen/Llvm/PpLlvm.hs
@@ -239,6 +239,7 @@ ppLlvmExpression expr
Insert vec elt idx -> ppInsert vec elt idx
GetElemPtr inb ptr indexes -> ppGetElementPtr inb ptr indexes
Load ptr -> ppLoad ptr
+ ALoad ord st ptr -> ppALoad ord st ptr
Malloc tp amount -> ppMalloc tp amount
Phi tp precessors -> ppPhi tp precessors
Asm asm c ty v se sk -> ppAsm asm c ty v se sk
@@ -327,13 +328,18 @@ ppSyncOrdering SyncSeqCst = text "seq_cst"
-- of specifying alignment.
ppLoad :: LlvmVar -> SDoc
-ppLoad var
- | isVecPtrVar var = text "load" <+> ppr var <>
- comma <+> text "align 1"
- | otherwise = text "load" <+> ppr var
+ppLoad var = text "load" <+> ppr var <> align
where
- isVecPtrVar :: LlvmVar -> Bool
- isVecPtrVar = isVector . pLower . getVarType
+ align | isVector . pLower . getVarType $ var = text ", align 1"
+ | otherwise = empty
+
+ppALoad :: LlvmSyncOrdering -> SingleThreaded -> LlvmVar -> SDoc
+ppALoad ord st var = sdocWithDynFlags $ \dflags ->
+ let alignment = (llvmWidthInBits dflags $ getVarType var) `quot` 8
+ align = text ", align" <+> ppr alignment
+ sThreaded | st = text " singlethread"
+ | otherwise = empty
+ in text "load atomic" <+> ppr var <> sThreaded <+> ppSyncOrdering ord <> align
ppStore :: LlvmVar -> LlvmVar -> SDoc
ppStore val dst
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 517553516b..4a56600937 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -15,6 +15,7 @@ import BlockId
import CodeGen.Platform ( activeStgRegs, callerSaves )
import CLabel
import Cmm
+import CPrim
import PprCmm
import CmmUtils
import Hoopl
@@ -32,6 +33,7 @@ import Unique
import Data.List ( nub )
import Data.Maybe ( catMaybes )
+type Atomic = Bool
type LlvmStatements = OrdList LlvmStatement
-- -----------------------------------------------------------------------------
@@ -228,6 +230,17 @@ genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCall t@(PrimTarget (MO_BSwap w)) dsts args =
genCallSimpleCast w t dsts args
+genCall (PrimTarget (MO_AtomicRead _)) [dst] [addr] = do
+ dstV <- getCmmReg (CmmLocal dst)
+ (v1, stmts, top) <- genLoad True addr (localRegType dst)
+ let stmt1 = Store v1 dstV
+ return (stmts `snocOL` stmt1, top)
+
+-- TODO: implement these properly rather than calling to RTS functions.
+-- genCall t@(PrimTarget (MO_AtomicWrite width)) [] [addr, val] = undefined
+-- genCall t@(PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = undefined
+-- genCall t@(PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = undefined
+
-- Handle memcpy function specifically since llvm's intrinsic version takes
-- some extra parameters.
genCall t@(PrimTarget op) [] args'
@@ -548,7 +561,6 @@ cmmPrimOpFunctions mop = do
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
-
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
MO_U_QuotRem2 {} -> unsupported
@@ -558,6 +570,12 @@ cmmPrimOpFunctions mop = do
MO_Touch -> unsupported
MO_UF_Conv _ -> unsupported
+ MO_AtomicRead _ -> unsupported
+
+ MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
+ MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
+ MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
+
-- | Tail function calls
genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData
@@ -849,7 +867,7 @@ exprToVarOpt opt e = case e of
-> genLit opt lit
CmmLoad e' ty
- -> genLoad e' ty
+ -> genLoad False e' ty
-- Cmmreg in expression is the value, so must load. If you want actual
-- reg pointer, call getCmmReg directly.
@@ -1268,41 +1286,41 @@ genMachOp_slow _ _ _ = panic "genMachOp: More then 2 expressions in MachOp!"
-- | Handle CmmLoad expression.
-genLoad :: CmmExpr -> CmmType -> LlvmM ExprData
+genLoad :: Atomic -> CmmExpr -> CmmType -> LlvmM ExprData
-- First we try to detect a few common cases and produce better code for
-- these then the default case. We are mostly trying to detect Cmm code
-- like I32[Sp + n] and use 'getelementptr' operations instead of the
-- generic case that uses casts and pointer arithmetic
-genLoad e@(CmmReg (CmmGlobal r)) ty
- = genLoad_fast e r 0 ty
+genLoad atomic e@(CmmReg (CmmGlobal r)) ty
+ = genLoad_fast atomic e r 0 ty
-genLoad e@(CmmRegOff (CmmGlobal r) n) ty
- = genLoad_fast e r n ty
+genLoad atomic e@(CmmRegOff (CmmGlobal r) n) ty
+ = genLoad_fast atomic e r n ty
-genLoad e@(CmmMachOp (MO_Add _) [
+genLoad atomic e@(CmmMachOp (MO_Add _) [
(CmmReg (CmmGlobal r)),
(CmmLit (CmmInt n _))])
ty
- = genLoad_fast e r (fromInteger n) ty
+ = genLoad_fast atomic e r (fromInteger n) ty
-genLoad e@(CmmMachOp (MO_Sub _) [
+genLoad atomic e@(CmmMachOp (MO_Sub _) [
(CmmReg (CmmGlobal r)),
(CmmLit (CmmInt n _))])
ty
- = genLoad_fast e r (negate $ fromInteger n) ty
+ = genLoad_fast atomic e r (negate $ fromInteger n) ty
-- generic case
-genLoad e ty
+genLoad atomic e ty
= do other <- getTBAAMeta otherN
- genLoad_slow e ty other
+ genLoad_slow atomic e ty other
-- | Handle CmmLoad expression.
-- This is a special case for loading from a global register pointer
-- offset such as I32[Sp+8].
-genLoad_fast :: CmmExpr -> GlobalReg -> Int -> CmmType
- -> LlvmM ExprData
-genLoad_fast e r n ty = do
+genLoad_fast :: Atomic -> CmmExpr -> GlobalReg -> Int -> CmmType
+ -> LlvmM ExprData
+genLoad_fast atomic e r n ty = do
dflags <- getDynFlags
(gv, grt, s1) <- getCmmRegVal (CmmGlobal r)
meta <- getTBAARegMeta r
@@ -1315,7 +1333,7 @@ genLoad_fast e r n ty = do
case grt == ty' of
-- were fine
True -> do
- (var, s3) <- doExpr ty' (MExpr meta $ Load ptr)
+ (var, s3) <- doExpr ty' (MExpr meta $ loadInstr ptr)
return (var, s1 `snocOL` s2 `snocOL` s3,
[])
@@ -1323,32 +1341,34 @@ genLoad_fast e r n ty = do
False -> do
let pty = pLift ty'
(ptr', s3) <- doExpr pty $ Cast LM_Bitcast ptr pty
- (var, s4) <- doExpr ty' (MExpr meta $ Load ptr')
+ (var, s4) <- doExpr ty' (MExpr meta $ loadInstr ptr')
return (var, s1 `snocOL` s2 `snocOL` s3
`snocOL` s4, [])
-- If its a bit type then we use the slow method since
-- we can't avoid casting anyway.
- False -> genLoad_slow e ty meta
-
+ False -> genLoad_slow atomic e ty meta
+ where
+ loadInstr ptr | atomic = ALoad SyncSeqCst False ptr
+ | otherwise = Load ptr
-- | Handle Cmm load expression.
-- Generic case. Uses casts and pointer arithmetic if needed.
-genLoad_slow :: CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
-genLoad_slow e ty meta = do
+genLoad_slow :: Atomic -> CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
+genLoad_slow atomic e ty meta = do
(iptr, stmts, tops) <- exprToVar e
dflags <- getDynFlags
case getVarType iptr of
LMPointer _ -> do
(dvar, load) <- doExpr (cmmToLlvmType ty)
- (MExpr meta $ Load iptr)
+ (MExpr meta $ loadInstr iptr)
return (dvar, stmts `snocOL` load, tops)
i@(LMInt _) | i == llvmWord dflags -> do
let pty = LMPointer $ cmmToLlvmType ty
(ptr, cast) <- doExpr pty $ Cast LM_Inttoptr iptr pty
(dvar, load) <- doExpr (cmmToLlvmType ty)
- (MExpr meta $ Load ptr)
+ (MExpr meta $ loadInstr ptr)
return (dvar, stmts `snocOL` cast `snocOL` load, tops)
other -> do dflags <- getDynFlags
@@ -1357,6 +1377,9 @@ genLoad_slow e ty meta = do
"Size of Ptr: " ++ show (llvmPtrBits dflags) ++
", Size of var: " ++ show (llvmWidthInBits dflags other) ++
", Var: " ++ showSDoc dflags (ppr iptr)))
+ where
+ loadInstr ptr | atomic = ALoad SyncSeqCst False ptr
+ | otherwise = Load ptr
-- | Handle CmmReg expression. This will return a pointer to the stack
diff --git a/compiler/nativeGen/CPrim.hs b/compiler/nativeGen/CPrim.hs
index a6f4cab7bd..34782dfc1c 100644
--- a/compiler/nativeGen/CPrim.hs
+++ b/compiler/nativeGen/CPrim.hs
@@ -1,11 +1,16 @@
-- | Generating C symbol names emitted by the compiler.
module CPrim
- ( popCntLabel
+ ( atomicReadLabel
+ , atomicWriteLabel
+ , atomicRMWLabel
+ , cmpxchgLabel
+ , popCntLabel
, bSwapLabel
, word2FloatLabel
) where
import CmmType
+import CmmMachOp
import Outputable
popCntLabel :: Width -> String
@@ -31,3 +36,46 @@ word2FloatLabel w = "hs_word2float" ++ pprWidth w
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w)
+
+atomicRMWLabel :: Width -> AtomicMachOp -> String
+atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w)
+
+ pprFunName AMO_Add = "add"
+ pprFunName AMO_Sub = "sub"
+ pprFunName AMO_And = "and"
+ pprFunName AMO_Nand = "nand"
+ pprFunName AMO_Or = "or"
+ pprFunName AMO_Xor = "xor"
+
+cmpxchgLabel :: Width -> String
+cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w)
+
+atomicReadLabel :: Width -> String
+atomicReadLabel w = "hs_atomicread" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w)
+
+atomicWriteLabel :: Width -> String
+atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w)
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 91651e6065..22a2c7cb6a 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1160,6 +1160,10 @@ genCCall' dflags gcp target dest_regs args0
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
+ MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False)
+ MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
+ MO_AtomicRead w -> (fsLit $ atomicReadLabel w, False)
+ MO_AtomicWrite w -> (fsLit $ atomicWriteLabel w, False)
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index f5e61d0a8f..51f89d629f 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -654,6 +654,10 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
+ MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
+ MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
+ MO_AtomicRead w -> fsLit $ atomicReadLabel w
+ MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index fa93767fa3..140e8b2992 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1761,6 +1761,93 @@ genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
where
lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width))
+genCCall dflags _ (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = do
+ Amode amode addr_code <- getAmode addr
+ arg <- getNewRegNat size
+ arg_code <- getAnyReg n
+ use_sse2 <- sse2Enabled
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
+ code <- op_code dst_r arg amode
+ return $ addr_code `appOL` arg_code arg `appOL` code
+ where
+ -- Code for the operation
+ op_code :: Reg -- ^ The destination reg
+ -> Reg -- ^ Register containing argument
+ -> AddrMode -- ^ Address of location to mutate
+ -> NatM (OrdList Instr)
+ op_code dst_r arg amode = case amop of
+ -- In the common case where dst_r is a virtual register the
+ -- final move should go away, because it's the last use of arg
+ -- and the first use of dst_r.
+ AMO_Add -> return $ toOL [ LOCK
+ , XADD size (OpReg arg) (OpAddr amode)
+ , MOV size (OpReg arg) (OpReg dst_r)
+ ]
+ AMO_Sub -> return $ toOL [ NEGI size (OpReg arg)
+ , LOCK
+ , XADD size (OpReg arg) (OpAddr amode)
+ , MOV size (OpReg arg) (OpReg dst_r)
+ ]
+ AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND size src dst)
+ AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND size src dst
+ , NOT size dst
+ ])
+ AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR size src dst)
+ AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR size src dst)
+ where
+ -- Simulate operation that lacks a dedicated instruction using
+ -- cmpxchg.
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
+ -> NatM (OrdList Instr)
+ cmpxchg_code instrs = do
+ lbl <- getBlockIdNat
+ tmp <- getNewRegNat size
+ return $ toOL
+ [ MOV size (OpAddr amode) (OpReg eax)
+ , JXX ALWAYS lbl
+ , NEWBLOCK lbl
+ -- Keep old value so we can return it:
+ , MOV size (OpReg eax) (OpReg dst_r)
+ , MOV size (OpReg eax) (OpReg tmp)
+ ]
+ `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
+ [ LOCK
+ , CMPXCHG size (OpReg tmp) (OpAddr amode)
+ , JXX NE lbl
+ ]
+
+ size = intSize width
+
+genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] = do
+ load_code <- intLoadCode (MOV (intSize width)) addr
+ let platform = targetPlatform dflags
+ use_sse2 <- sse2Enabled
+ return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
+
+genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do
+ assignMem_IntCode (intSize width) addr val
+
+genCCall dflags _ (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = do
+ Amode amode addr_code <- getAmode addr
+ newval <- getNewRegNat size
+ newval_code <- getAnyReg new
+ oldval <- getNewRegNat size
+ oldval_code <- getAnyReg old
+ use_sse2 <- sse2Enabled
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
+ code = toOL
+ [ MOV size (OpReg oldval) (OpReg eax)
+ , LOCK
+ , CMPXCHG size (OpReg newval) (OpAddr amode)
+ , MOV size (OpReg eax) (OpReg dst_r)
+ ]
+ return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
+ `appOL` code
+ where
+ size = intSize width
+
genCCall _ is32Bit target dest_regs args
| is32Bit = genCCall32 target dest_regs args
| otherwise = genCCall64 target dest_regs args
@@ -2385,6 +2472,11 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
+ MO_AtomicRMW _ _ -> fsLit "atomicrmw"
+ MO_AtomicRead _ -> fsLit "atomicread"
+ MO_AtomicWrite _ -> fsLit "atomicwrite"
+ MO_Cmpxchg _ -> fsLit "cmpxchg"
+
MO_UF_Conv _ -> unsupported
MO_S_QuotRem {} -> unsupported
diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs
index 05fff9be96..ac91747171 100644
--- a/compiler/nativeGen/X86/Instr.hs
+++ b/compiler/nativeGen/X86/Instr.hs
@@ -327,6 +327,10 @@ data Instr
| PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
+ | LOCK -- lock prefix
+ | XADD Size Operand Operand -- src (r), dst (r/m)
+ | CMPXCHG Size Operand Operand -- src (r), dst (r/m), eax implicit
+
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
@@ -337,6 +341,8 @@ data Operand
+-- | Returns which registers are read and written as a (read, written)
+-- pair.
x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
x86_regUsageOfInstr platform instr
= case instr of
@@ -428,10 +434,21 @@ x86_regUsageOfInstr platform instr
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
+ LOCK -> noUsage
+ XADD _ src dst -> usageMM src dst
+ CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
_other -> panic "regUsage: unrecognised instr"
-
where
+ -- # Definitions
+ --
+ -- Written: If the operand is a register, it's written. If it's an
+ -- address, registers mentioned in the address are read.
+ --
+ -- Modified: If the operand is a register, it's both read and
+ -- written. If it's an address, registers mentioned in the address
+ -- are read.
+
-- 2 operand form; first operand Read; second Written
usageRW :: Operand -> Operand -> RegUsage
usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
@@ -444,6 +461,18 @@ x86_regUsageOfInstr platform instr
usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
+ -- 2 operand form; first operand Modified; second Modified
+ usageMM :: Operand -> Operand -> RegUsage
+ usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
+ usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
+ usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
+
+ -- 3 operand form; first operand Read; second Modified; third Modified
+ usageRMM :: Operand -> Operand -> Operand -> RegUsage
+ usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
+ usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
+ usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
+
-- 1 operand form; operand Modified
usageM :: Operand -> RegUsage
usageM (OpReg reg) = mkRU [reg] [reg]
@@ -476,6 +505,7 @@ x86_regUsageOfInstr platform instr
where src' = filter (interesting platform) src
dst' = filter (interesting platform) dst
+-- | Is this register interesting for the register allocator?
interesting :: Platform -> Reg -> Bool
interesting _ (RegVirtual _) = True
interesting platform (RegReal (RealRegSingle i)) = isFastTrue (freeReg platform i)
@@ -483,6 +513,8 @@ interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no re
+-- | Applies the supplied function to all registers in instructions.
+-- Typically used to change virtual registers to real registers.
x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
x86_patchRegsOfInstr instr env
= case instr of
@@ -571,6 +603,10 @@ x86_patchRegsOfInstr instr env
PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
+ LOCK -> instr
+ XADD sz src dst -> patch2 (XADD sz) src dst
+ CMPXCHG sz src dst -> patch2 (CMPXCHG sz) src dst
+
_other -> panic "patchRegs: unrecognised instr"
where
diff --git a/compiler/nativeGen/X86/Ppr.hs b/compiler/nativeGen/X86/Ppr.hs
index 459c041ba5..7771c02512 100644
--- a/compiler/nativeGen/X86/Ppr.hs
+++ b/compiler/nativeGen/X86/Ppr.hs
@@ -886,6 +886,14 @@ pprInstr GFREE
ptext (sLit "\tffree %st(4) ;ffree %st(5)")
]
+-- Atomics
+
+pprInstr LOCK = ptext (sLit "\tlock")
+
+pprInstr (XADD size src dst) = pprSizeOpOp (sLit "xadd") size src dst
+
+pprInstr (CMPXCHG size src dst) = pprSizeOpOp (sLit "cmpxchg") size src dst
+
pprInstr _
= panic "X86.Ppr.pprInstr: no match"
diff --git a/compiler/prelude/primops.txt.pp b/compiler/prelude/primops.txt.pp
index 4851315eb4..4faa585246 100644
--- a/compiler/prelude/primops.txt.pp
+++ b/compiler/prelude/primops.txt.pp
@@ -1363,19 +1363,79 @@ primop SetByteArrayOp "setByteArray#" GenPrimOp
code_size = { primOpCodeSizeForeignCall + 4 }
can_fail = True
+-- Atomic operations
+
+primop AtomicReadByteArrayOp_Int "atomicReadIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array and an offset in Int units, read an element. The
+ index is assumed to be in bounds. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop AtomicWriteByteArrayOp_Int "atomicWriteIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> State# s
+ {Given an array and an offset in Int units, write an element. The
+ index is assumed to be in bounds. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
primop CasByteArrayOp_Int "casIntArray#" GenPrimOp
MutableByteArray# s -> Int# -> Int# -> Int# -> State# s -> (# State# s, Int# #)
- {Machine-level atomic compare and swap on a word within a ByteArray.}
- with
- out_of_line = True
- has_side_effects = True
+ {Given an array, an offset in Int units, the expected old value, and
+ the new value, perform an atomic compare and swap i.e. write the new
+ value if the current value matches the provided old value. Returns
+ the value of the element before the operation. Implies a full memory
+ barrier.}
+ with has_side_effects = True
+ can_fail = True
primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp
MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
- {Machine-level word-sized fetch-and-add within a ByteArray.}
- with
- out_of_line = True
- has_side_effects = True
+ {Given an array, and offset in Int units, and a value to add,
+ atomically add the value to the element. Returns the value of the
+ element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array, and offset in Int units, and a value to subtract,
+ atomically substract the value to the element. Returns the value of
+ the element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array, and offset in Int units, and a value to AND,
+ atomically AND the value to the element. Returns the value of the
+ element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array, and offset in Int units, and a value to NAND,
+ atomically NAND the value to the element. Returns the value of the
+ element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array, and offset in Int units, and a value to OR,
+ atomically OR the value to the element. Returns the value of the
+ element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
+
+primop FetchXorByteArrayOp_Int "fetchXorIntArray#" GenPrimOp
+ MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
+ {Given an array, and offset in Int units, and a value to XOR,
+ atomically XOR the value to the element. Returns the value of the
+ element before the operation. Implies a full memory barrier.}
+ with has_side_effects = True
+ can_fail = True
------------------------------------------------------------------------
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 0c4d2f9eaf..ee5a119aa1 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -348,7 +348,6 @@ RTS_FUN_DECL(stg_newByteArrayzh);
RTS_FUN_DECL(stg_newPinnedByteArrayzh);
RTS_FUN_DECL(stg_newAlignedPinnedByteArrayzh);
RTS_FUN_DECL(stg_casIntArrayzh);
-RTS_FUN_DECL(stg_fetchAddIntArrayzh);
RTS_FUN_DECL(stg_newArrayzh);
RTS_FUN_DECL(stg_newArrayArrayzh);
RTS_FUN_DECL(stg_copyArrayzh);
diff --git a/libraries/ghc-prim/cbits/atomic.c b/libraries/ghc-prim/cbits/atomic.c
new file mode 100644
index 0000000000..a2e64afd32
--- /dev/null
+++ b/libraries/ghc-prim/cbits/atomic.c
@@ -0,0 +1,280 @@
+#include "Rts.h"
+
+// Fallbacks for atomic primops on byte arrays. The builtins used
+// below are supported on both GCC and LLVM.
+//
+// Ideally these function would take StgWord8, StgWord16, etc but
+// older GCC versions incorrectly assume that the register that the
+// argument is passed in has been zero extended, which is incorrect
+// according to the ABI and is not what GHC does when it generates
+// calls to these functions.
+
+// FetchAddByteArrayOp_Int
+
+extern StgWord hs_atomic_add8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_add8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_add(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_add16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_add16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_add(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_add32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_add32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_add(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_add64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_add64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_add(x, val);
+}
+
+// FetchSubByteArrayOp_Int
+
+extern StgWord hs_atomic_sub8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_sub8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_sub(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_sub16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_sub16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_sub(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_sub32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_sub32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_sub(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_sub64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_sub(x, val);
+}
+
+// FetchAndByteArrayOp_Int
+
+extern StgWord hs_atomic_and8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_and8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_and(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_and16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_and16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_and(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_and32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_and32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_and(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_and64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_and64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_and(x, val);
+}
+
+// FetchNandByteArrayOp_Int
+
+extern StgWord hs_atomic_nand8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_nand8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_nand(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_nand16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_nand16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_nand(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_nand32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_nand32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_nand(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_nand64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_nand(x, val);
+}
+
+// FetchOrByteArrayOp_Int
+
+extern StgWord hs_atomic_or8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_or8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_or(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_or16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_or16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_or(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_or32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_or32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_or(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_or64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_or64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_or(x, val);
+}
+
+// FetchXorByteArrayOp_Int
+
+extern StgWord hs_atomic_xor8(volatile StgWord8 *x, StgWord val);
+StgWord
+hs_atomic_xor8(volatile StgWord8 *x, StgWord val)
+{
+ return __sync_fetch_and_xor(x, (StgWord8) val);
+}
+
+extern StgWord hs_atomic_xor16(volatile StgWord16 *x, StgWord val);
+StgWord
+hs_atomic_xor16(volatile StgWord16 *x, StgWord val)
+{
+ return __sync_fetch_and_xor(x, (StgWord16) val);
+}
+
+extern StgWord hs_atomic_xor32(volatile StgWord32 *x, StgWord val);
+StgWord
+hs_atomic_xor32(volatile StgWord32 *x, StgWord val)
+{
+ return __sync_fetch_and_xor(x, (StgWord32) val);
+}
+
+extern StgWord64 hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val);
+StgWord64
+hs_atomic_xor64(volatile StgWord64 *x, StgWord64 val)
+{
+ return __sync_fetch_and_xor(x, val);
+}
+
+// CasByteArrayOp_Int
+
+extern StgWord hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new);
+StgWord
+hs_cmpxchg8(volatile StgWord8 *x, StgWord old, StgWord new)
+{
+ return __sync_val_compare_and_swap(x, (StgWord8) old, (StgWord8) new);
+}
+
+extern StgWord hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new);
+StgWord
+hs_cmpxchg16(volatile StgWord16 *x, StgWord old, StgWord new)
+{
+ return __sync_val_compare_and_swap(x, (StgWord16) old, (StgWord16) new);
+}
+
+extern StgWord hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new);
+StgWord
+hs_cmpxchg32(volatile StgWord32 *x, StgWord old, StgWord new)
+{
+ return __sync_val_compare_and_swap(x, (StgWord32) old, (StgWord32) new);
+}
+
+extern StgWord hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new);
+StgWord
+hs_cmpxchg64(volatile StgWord64 *x, StgWord64 old, StgWord64 new)
+{
+ return __sync_val_compare_and_swap(x, old, new);
+}
+
+// AtomicReadByteArrayOp_Int
+
+extern StgWord hs_atomicread8(volatile StgWord8 *x);
+StgWord
+hs_atomicread8(volatile StgWord8 *x)
+{
+ return *x;
+}
+
+extern StgWord hs_atomicread16(volatile StgWord16 *x);
+StgWord
+hs_atomicread16(volatile StgWord16 *x)
+{
+ return *x;
+}
+
+extern StgWord hs_atomicread32(volatile StgWord32 *x);
+StgWord
+hs_atomicread32(volatile StgWord32 *x)
+{
+ return *x;
+}
+
+extern StgWord64 hs_atomicread64(volatile StgWord64 *x);
+StgWord64
+hs_atomicread64(volatile StgWord64 *x)
+{
+ return *x;
+}
+
+// AtomicWriteByteArrayOp_Int
+
+extern void hs_atomicwrite8(volatile StgWord8 *x, StgWord val);
+void
+hs_atomicwrite8(volatile StgWord8 *x, StgWord val)
+{
+ *x = (StgWord8) val;
+}
+
+extern void hs_atomicwrite16(volatile StgWord16 *x, StgWord val);
+void
+hs_atomicwrite16(volatile StgWord16 *x, StgWord val)
+{
+ *x = (StgWord16) val;
+}
+
+extern void hs_atomicwrite32(volatile StgWord32 *x, StgWord val);
+void
+hs_atomicwrite32(volatile StgWord32 *x, StgWord val)
+{
+ *x = (StgWord32) val;
+}
+
+extern void hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val);
+void
+hs_atomicwrite64(volatile StgWord64 *x, StgWord64 val)
+{
+ *x = (StgWord64) val;
+}
diff --git a/libraries/ghc-prim/ghc-prim.cabal b/libraries/ghc-prim/ghc-prim.cabal
index c861342b56..bc9f57126a 100644
--- a/libraries/ghc-prim/ghc-prim.cabal
+++ b/libraries/ghc-prim/ghc-prim.cabal
@@ -52,6 +52,7 @@ Library
exposed-modules: GHC.Prim
c-sources:
+ cbits/atomic.c
cbits/bswap.c
cbits/debug.c
cbits/longlong.c
diff --git a/rts/Linker.c b/rts/Linker.c
index f7f554ce6c..fb07d58452 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -1186,7 +1186,6 @@ typedef struct _RtsSymbolVal {
SymI_HasProto(stg_newBCOzh) \
SymI_HasProto(stg_newByteArrayzh) \
SymI_HasProto(stg_casIntArrayzh) \
- SymI_HasProto(stg_fetchAddIntArrayzh) \
SymI_HasProto(stg_newMVarzh) \
SymI_HasProto(stg_newMutVarzh) \
SymI_HasProto(stg_newTVarzh) \
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 4d7baca824..5f04a6d732 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -151,18 +151,6 @@ stg_casIntArrayzh( gcptr arr, W_ ind, W_ old, W_ new )
}
-stg_fetchAddIntArrayzh( gcptr arr, W_ ind, W_ incr )
-/* MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) */
-{
- W_ p, h;
-
- p = arr + SIZEOF_StgArrWords + WDS(ind);
- (h) = ccall atomic_inc(p, incr);
-
- return(h);
-}
-
-
stg_newArrayzh ( W_ n /* words */, gcptr init )
{
W_ words, size, p;
diff --git a/testsuite/tests/concurrent/should_run/.gitignore b/testsuite/tests/concurrent/should_run/.gitignore
index 4f0a3175da..d64f644233 100644
--- a/testsuite/tests/concurrent/should_run/.gitignore
+++ b/testsuite/tests/concurrent/should_run/.gitignore
@@ -1,3 +1,4 @@
+AtomicPrimops
T7970
compareAndSwap
readMVar1
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
new file mode 100644
index 0000000000..0c55aba93e
--- /dev/null
+++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
@@ -0,0 +1,245 @@
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnboxedTuples #-}
+
+module Main ( main ) where
+
+import Control.Concurrent
+import Control.Concurrent.MVar
+import Control.Monad (when)
+import Foreign.Storable
+import GHC.Exts
+import GHC.IO
+
+-- | Iterations per worker.
+iters :: Int
+iters = 1000000
+
+main :: IO ()
+main = do
+ fetchAddSubTest
+ fetchAndTest
+ fetchNandTest
+ fetchOrTest
+ fetchXorTest
+ casTest
+ readWriteTest
+
+-- | Test fetchAddIntArray# by having two threads concurrenctly
+-- increment a counter and then checking the sum at the end.
+fetchAddSubTest :: IO ()
+fetchAddSubTest = do
+ tot <- race 0
+ (\ mba -> work fetchAddIntArray mba iters 2)
+ (\ mba -> work fetchSubIntArray mba iters 1)
+ assertEq 1000000 tot "fetchAddSubTest"
+ where
+ work :: (MByteArray -> Int -> Int -> IO ()) -> MByteArray -> Int -> Int
+ -> IO ()
+ work op mba 0 val = return ()
+ work op mba n val = op mba 0 val >> work op mba (n-1) val
+
+-- | Test fetchXorIntArray# by having two threads concurrenctly XORing
+-- and then checking the result at the end. Works since XOR is
+-- commutative.
+--
+-- Covers the code paths for AND, NAND, and OR as well.
+fetchXorTest :: IO ()
+fetchXorTest = do
+ res <- race n0
+ (\ mba -> work mba iters t1pat)
+ (\ mba -> work mba iters t2pat)
+ assertEq expected res "fetchXorTest"
+ where
+ work :: MByteArray -> Int -> Int -> IO ()
+ work mba 0 val = return ()
+ work mba n val = fetchXorIntArray mba 0 val >> work mba (n-1) val
+
+ -- Initial value is a large prime and the two patterns are 1010...
+ -- and 0101...
+ (n0, t1pat, t2pat)
+ | sizeOf (undefined :: Int) == 8 =
+ (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999)
+ | otherwise = (0x0000ffff, 0x55555555, 0x99999999)
+ expected
+ | sizeOf (undefined :: Int) == 8 = 4294967295
+ | otherwise = 65535
+
+-- The tests for AND, NAND, and OR are trivial for two reasons:
+--
+-- * The code path is already well exercised by 'fetchXorTest'.
+--
+-- * It's harder to test these operations, as a long sequence of them
+-- convert to a single value but we'd like to write a test in the
+-- style of 'fetchXorTest' that applies the operation repeatedly,
+-- to make it likely that any race conditions are detected.
+--
+-- Right now we only test that they return the correct value for a
+-- single op on each thread.
+
+fetchOpTest :: (MByteArray -> Int -> Int -> IO ())
+ -> Int -> String -> IO ()
+fetchOpTest op expected name = do
+ res <- race n0
+ (\ mba -> work mba t1pat)
+ (\ mba -> work mba t2pat)
+ assertEq expected res name
+ where
+ work :: MByteArray -> Int -> IO ()
+ work mba val = op mba 0 val
+
+ -- Initial value is a large prime and the two patterns are 1010...
+ -- and 0101...
+ (n0, t1pat, t2pat)
+ | sizeOf (undefined :: Int) == 8 =
+ (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999)
+ | otherwise = (0x0000ffff, 0x55555555, 0x99999999)
+
+fetchAndTest :: IO ()
+fetchAndTest = fetchOpTest fetchAndIntArray expected "fetchAndTest"
+ where expected
+ | sizeOf (undefined :: Int) == 8 = 286331153
+ | otherwise = 4369
+
+fetchNandTest :: IO ()
+fetchNandTest = fetchOpTest fetchNandIntArray expected "fetchNandTest"
+ where expected
+ | sizeOf (undefined :: Int) == 8 = 7378697629770151799
+ | otherwise = -2576976009
+
+fetchOrTest :: IO ()
+fetchOrTest = fetchOpTest fetchOrIntArray expected "fetchOrTest"
+ where expected
+ | sizeOf (undefined :: Int) == 8 = 15987178197787607039
+ | otherwise = 3722313727
+
+-- | Test casIntArray# by using it to emulate fetchAddIntArray# and
+-- then having two threads concurrenctly increment a counter,
+-- checking the sum at the end.
+casTest :: IO ()
+casTest = do
+ tot <- race 0
+ (\ mba -> work mba iters 1)
+ (\ mba -> work mba iters 2)
+ assertEq 3000000 tot "casTest"
+ where
+ work :: MByteArray -> Int -> Int -> IO ()
+ work mba 0 val = return ()
+ work mba n val = add mba 0 val >> work mba (n-1) val
+
+ -- Fetch-and-add implemented using CAS.
+ add :: MByteArray -> Int -> Int -> IO ()
+ add mba ix n = do
+ old <- readIntArray mba ix
+ old' <- casIntArray mba ix old (old + n)
+ when (old /= old') $ add mba ix n
+
+-- | Tests atomic reads and writes by making sure that one thread sees
+-- updates that are done on another. This test isn't very good at the
+-- moment, as this might work even without atomic ops, but at least it
+-- exercises the code.
+readWriteTest :: IO ()
+readWriteTest = do
+ mba <- newByteArray (sizeOf (undefined :: Int))
+ writeIntArray mba 0 0
+ latch <- newEmptyMVar
+ done <- newEmptyMVar
+ forkIO $ do
+ takeMVar latch
+ n <- atomicReadIntArray mba 0
+ assertEq 1 n "readWriteTest"
+ putMVar done ()
+ atomicWriteIntArray mba 0 1
+ putMVar latch ()
+ takeMVar done
+
+-- | Create two threads that mutate the byte array passed to them
+-- concurrently. The array is one word large.
+race :: Int -- ^ Initial value of array element
+ -> (MByteArray -> IO ()) -- ^ Thread 1 action
+ -> (MByteArray -> IO ()) -- ^ Thread 2 action
+ -> IO Int -- ^ Final value of array element
+race n0 thread1 thread2 = do
+ done1 <- newEmptyMVar
+ done2 <- newEmptyMVar
+ mba <- newByteArray (sizeOf (undefined :: Int))
+ writeIntArray mba 0 n0
+ forkIO $ thread1 mba >> putMVar done1 ()
+ forkIO $ thread2 mba >> putMVar done2 ()
+ mapM_ takeMVar [done1, done2]
+ readIntArray mba 0
+
+------------------------------------------------------------------------
+-- Test helper
+
+assertEq :: (Eq a, Show a) => a -> a -> String -> IO ()
+assertEq expected actual name
+ | expected == actual = putStrLn $ name ++ ": OK"
+ | otherwise = do
+ putStrLn $ name ++ ": FAIL"
+ putStrLn $ "Expected: " ++ show expected
+ putStrLn $ " Actual: " ++ show actual
+
+------------------------------------------------------------------------
+-- Wrappers around MutableByteArray#
+
+data MByteArray = MBA (MutableByteArray# RealWorld)
+
+fetchAddIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchAddIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchAddIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+fetchSubIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchSubIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchSubIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+fetchAndIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchAndIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchAndIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+fetchNandIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchNandIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchNandIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+fetchOrIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchOrIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchOrIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+fetchXorIntArray :: MByteArray -> Int -> Int -> IO ()
+fetchXorIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case fetchXorIntArray# mba# ix# n# s# of
+ (# s2#, _ #) -> (# s2#, () #)
+
+newByteArray :: Int -> IO MByteArray
+newByteArray (I# n#) = IO $ \ s# ->
+ case newByteArray# n# s# of
+ (# s2#, mba# #) -> (# s2#, MBA mba# #)
+
+writeIntArray :: MByteArray -> Int -> Int -> IO ()
+writeIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case writeIntArray# mba# ix# n# s# of
+ s2# -> (# s2#, () #)
+
+readIntArray :: MByteArray -> Int -> IO Int
+readIntArray (MBA mba#) (I# ix#) = IO $ \ s# ->
+ case readIntArray# mba# ix# s# of
+ (# s2#, n# #) -> (# s2#, I# n# #)
+
+atomicWriteIntArray :: MByteArray -> Int -> Int -> IO ()
+atomicWriteIntArray (MBA mba#) (I# ix#) (I# n#) = IO $ \ s# ->
+ case atomicWriteIntArray# mba# ix# n# s# of
+ s2# -> (# s2#, () #)
+
+atomicReadIntArray :: MByteArray -> Int -> IO Int
+atomicReadIntArray (MBA mba#) (I# ix#) = IO $ \ s# ->
+ case atomicReadIntArray# mba# ix# s# of
+ (# s2#, n# #) -> (# s2#, I# n# #)
+
+casIntArray :: MByteArray -> Int -> Int -> Int -> IO Int
+casIntArray (MBA mba#) (I# ix#) (I# old#) (I# new#) = IO $ \ s# ->
+ case casIntArray# mba# ix# old# new# s# of
+ (# s2#, old2# #) -> (# s2#, I# old2# #)
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
new file mode 100644
index 0000000000..c37041a040
--- /dev/null
+++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
@@ -0,0 +1,7 @@
+fetchAddSubTest: OK
+fetchAndTest: OK
+fetchNandTest: OK
+fetchOrTest: OK
+fetchXorTest: OK
+casTest: OK
+readWriteTest: OK
diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T
index 0b502c3bc7..0a66892d82 100644
--- a/testsuite/tests/concurrent/should_run/all.T
+++ b/testsuite/tests/concurrent/should_run/all.T
@@ -81,6 +81,7 @@ test('tryReadMVar1', normal, compile_and_run, [''])
test('tryReadMVar2', normal, compile_and_run, [''])
test('T7970', normal, compile_and_run, [''])
+test('AtomicPrimops', normal, compile_and_run, [''])
# -----------------------------------------------------------------------------
# These tests we only do for a full run