From b4d39adbb5884c764c6c11b2614a340c78cc078e Mon Sep 17 00:00:00 2001 From: Peter Trommler Date: Sat, 17 Apr 2021 17:59:44 +0200 Subject: PrimOps: Add CAS op for all int sizes PPC NCG: Implement CAS inline for 32 and 64 bit testsuite: Add tests for smaller atomic CAS X86 NCG: Catch calls to CAS C fallback Primops: Add atomicCasWord[8|16|32|64]Addr# Add tests for atomicCasWord[8|16|32|64]Addr# Add changelog entry for new primops X86 NCG: Fix MO-Cmpxchg W64 on 32-bit arch ghc-prim: 64-bit CAS C fallback on all archs --- compiler/GHC/Builtin/primops.txt.pp | 96 ++++++++++++++++++++++++++++++++++++ compiler/GHC/CmmToAsm/PPC/CodeGen.hs | 33 ++++++++++++- compiler/GHC/CmmToAsm/X86/CodeGen.hs | 7 ++- compiler/GHC/StgToCmm/Prim.hs | 18 ++++++- 4 files changed, 150 insertions(+), 4 deletions(-) (limited to 'compiler/GHC') diff --git a/compiler/GHC/Builtin/primops.txt.pp b/compiler/GHC/Builtin/primops.txt.pp index 5f5cd64cfa..b07c344e18 100644 --- a/compiler/GHC/Builtin/primops.txt.pp +++ b/compiler/GHC/Builtin/primops.txt.pp @@ -1927,6 +1927,46 @@ primop CasByteArrayOp_Int "casIntArray#" GenPrimOp with has_side_effects = True can_fail = True +primop CasByteArrayOp_Int8 "casInt8Array#" GenPrimOp + MutableByteArray# s -> Int# -> Int8# -> Int8# -> State# s -> (# State# s, Int8# #) + {Given an array, an offset in bytes, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True + +primop CasByteArrayOp_Int16 "casInt16Array#" GenPrimOp + MutableByteArray# s -> Int# -> Int16# -> Int16# -> State# s -> (# State# s, Int16# #) + {Given an array, an offset in 16 bit units, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True + +primop CasByteArrayOp_Int32 "casInt32Array#" GenPrimOp + MutableByteArray# s -> Int# -> Int32# -> Int32# -> State# s -> (# State# s, Int32# #) + {Given an array, an offset in 32 bit units, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True + +primop CasByteArrayOp_Int64 "casInt64Array#" GenPrimOp + MutableByteArray# s -> Int# -> INT64 -> INT64 -> State# s -> (# State# s, INT64 #) + {Given an array, an offset in 64 bit units, the expected old value, and + the new value, perform an atomic compare and swap i.e. write the new + value if the current value matches the provided old value. Returns + the value of the element before the operation. Implies a full memory + barrier.} + with has_side_effects = True + can_fail = True + primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #) {Given an array, and offset in machine words, and a value to add, @@ -2387,6 +2427,62 @@ primop CasAddrOp_Word "atomicCasWordAddr#" GenPrimOp with has_side_effects = True can_fail = True +primop CasAddrOp_Word8 "atomicCasWord8Addr#" GenPrimOp + Addr# -> Word8# -> Word8# -> State# s -> (# State# s, Word8# #) + { Compare and swap on a 8 bit-sized and aligned memory location. + + Use as: \s -> atomicCasWordAddr8# location expected desired s + + This version always returns the old value read. This follows the normal + protocol for CAS operations (and matches the underlying instruction on + most architectures). + + Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop CasAddrOp_Word16 "atomicCasWord16Addr#" GenPrimOp + Addr# -> Word16# -> Word16# -> State# s -> (# State# s, Word16# #) + { Compare and swap on a 16 bit-sized and aligned memory location. + + Use as: \s -> atomicCasWordAddr16# location expected desired s + + This version always returns the old value read. This follows the normal + protocol for CAS operations (and matches the underlying instruction on + most architectures). + + Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop CasAddrOp_Word32 "atomicCasWord32Addr#" GenPrimOp + Addr# -> Word32# -> Word32# -> State# s -> (# State# s, Word32# #) + { Compare and swap on a 32 bit-sized and aligned memory location. + + Use as: \s -> atomicCasWordAddr32# location expected desired s + + This version always returns the old value read. This follows the normal + protocol for CAS operations (and matches the underlying instruction on + most architectures). + + Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + +primop CasAddrOp_Word64 "atomicCasWord64Addr#" GenPrimOp + Addr# -> WORD64 -> WORD64 -> State# s -> (# State# s, WORD64 #) + { Compare and swap on a 64 bit-sized and aligned memory location. + + Use as: \s -> atomicCasWordAddr64# location expected desired s + + This version always returns the old value read. This follows the normal + protocol for CAS operations (and matches the underlying instruction on + most architectures). + + Implies a full memory barrier.} + with has_side_effects = True + can_fail = True + primop FetchAddAddrOp_Word "fetchAddWordAddr#" GenPrimOp Addr# -> Word# -> State# s -> (# State# s, Word# #) {Given an address, and a value to add, diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs index 67bc3d9bdb..1c3b244980 100644 --- a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs @@ -1221,7 +1221,38 @@ genCCall (PrimTarget (MO_AtomicRead width)) [dst] [addr] genCCall (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do code <- assignMem_IntCode (intFormat width) addr val - return $ unitOL(HWSYNC) `appOL` code + return $ unitOL HWSYNC `appOL` code + +genCCall (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] + | width == W32 || width == W64 + = do + platform <- getPlatform + (old_reg, old_code) <- getSomeReg old + (new_reg, new_code) <- getSomeReg new + (addr_reg, addr_code) <- getSomeReg addr + lbl_retry <- getBlockIdNat + lbl_eq <- getBlockIdNat + lbl_end <- getBlockIdNat + let reg_dst = getRegisterReg platform (CmmLocal dst) + code = toOL + [ HWSYNC + , BCC ALWAYS lbl_retry Nothing + , NEWBLOCK lbl_retry + , LDR format reg_dst (AddrRegReg r0 addr_reg) + , CMP format reg_dst (RIReg old_reg) + , BCC NE lbl_end Nothing + , BCC ALWAYS lbl_eq Nothing + , NEWBLOCK lbl_eq + , STC format new_reg (AddrRegReg r0 addr_reg) + , BCC NE lbl_retry Nothing + , BCC ALWAYS lbl_end Nothing + , NEWBLOCK lbl_end + , ISYNC + ] + return $ addr_code `appOL` new_code `appOL` old_code `appOL` code + where + format = intFormat width + genCCall (PrimTarget (MO_Clz width)) [dst] [src] = do platform <- getPlatform diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index 5e7c261cbb..1ab24c4a25 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -2595,10 +2595,11 @@ genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do code <- assignMem_IntCode (intFormat width) addr val return $ code `snocOL` MFENCE -genCCall' _ is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do +genCCall' _ is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ -- On x86 we don't have enough registers to use cmpxchg with a -- complicated addressing mode, so on that architecture we -- pre-compute the address first. + | not (is32Bit && width == W64) = do Amode amode addr_code <- getSimpleAmode is32Bit addr newval <- getNewRegNat format newval_code <- getAnyReg new @@ -3441,7 +3442,9 @@ outOfLineCmmOp bid mop res args MO_AtomicRMW _ _ -> fsLit "atomicrmw" MO_AtomicRead _ -> fsLit "atomicread" MO_AtomicWrite _ -> fsLit "atomicwrite" - MO_Cmpxchg _ -> fsLit "cmpxchg" + MO_Cmpxchg w -> cmpxchgLabel w -- for W64 on 32-bit + -- TODO: implement + -- cmpxchg8b instr MO_Xchg _ -> should_be_inline MO_UF_Conv _ -> unsupported diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs index d61880a0e2..290ace9f01 100644 --- a/compiler/GHC/StgToCmm/Prim.hs +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -872,6 +872,14 @@ emitPrimOp dflags primop = case primop of emitPrimCall [res] (MO_Cmpxchg (wordWidth platform)) [dst, expected, new] CasAddrOp_Word -> \[dst, expected, new] -> opIntoRegs $ \[res] -> emitPrimCall [res] (MO_Cmpxchg (wordWidth platform)) [dst, expected, new] + CasAddrOp_Word8 -> \[dst, expected, new] -> opIntoRegs $ \[res] -> + emitPrimCall [res] (MO_Cmpxchg W8) [dst, expected, new] + CasAddrOp_Word16 -> \[dst, expected, new] -> opIntoRegs $ \[res] -> + emitPrimCall [res] (MO_Cmpxchg W16) [dst, expected, new] + CasAddrOp_Word32 -> \[dst, expected, new] -> opIntoRegs $ \[res] -> + emitPrimCall [res] (MO_Cmpxchg W32) [dst, expected, new] + CasAddrOp_Word64 -> \[dst, expected, new] -> opIntoRegs $ \[res] -> + emitPrimCall [res] (MO_Cmpxchg W64) [dst, expected, new] -- SIMD primops (VecBroadcastOp vcat n w) -> \[e] -> opIntoRegs $ \[res] -> do @@ -1075,6 +1083,14 @@ emitPrimOp dflags primop = case primop of doAtomicWriteByteArray mba ix (bWord platform) val CasByteArrayOp_Int -> \[mba, ix, old, new] -> opIntoRegs $ \[res] -> doCasByteArray res mba ix (bWord platform) old new + CasByteArrayOp_Int8 -> \[mba, ix, old, new] -> opIntoRegs $ \[res] -> + doCasByteArray res mba ix b8 old new + CasByteArrayOp_Int16 -> \[mba, ix, old, new] -> opIntoRegs $ \[res] -> + doCasByteArray res mba ix b16 old new + CasByteArrayOp_Int32 -> \[mba, ix, old, new] -> opIntoRegs $ \[res] -> + doCasByteArray res mba ix b32 old new + CasByteArrayOp_Int64 -> \[mba, ix, old, new] -> opIntoRegs $ \[res] -> + doCasByteArray res mba ix b64 old new -- The rest just translate straightforwardly @@ -3092,7 +3108,7 @@ doCasByteArray doCasByteArray res mba idx idx_ty old new = do profile <- getProfile platform <- getPlatform - let width = (typeWidth idx_ty) + let width = typeWidth idx_ty addr = cmmIndexOffExpr platform (arrWordsHdrSize profile) width mba idx emitPrimCall -- cgit v1.2.1