5 files changed, 349 insertions, 135 deletions
diff --git a/compiler/GHC/Builtin/primops.txt.pp b/compiler/GHC/Builtin/primops.txt.pp
index c292b9ecdc..acee822b0c 100644
--- a/compiler/GHC/Builtin/primops.txt.pp
+++ b/compiler/GHC/Builtin/primops.txt.pp
@@ -1669,7 +1669,7 @@ primop FetchAddByteArrayOp_Int "fetchAddIntArray#" GenPrimOp
 primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
    {Given an array, and offset in machine words, and a value to subtract,
-    atomically subtract the value to the element. Returns the value of
+    atomically subtract the value from the element. Returns the value of
     the element before the operation. Implies a full memory barrier.}
    with has_side_effects = True
         can_fail = True
@@ -1677,7 +1677,7 @@ primop FetchSubByteArrayOp_Int "fetchSubIntArray#" GenPrimOp
 primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
    {Given an array, and offset in machine words, and a value to AND,
-    atomically AND the value to the element. Returns the value of the
+    atomically AND the value into the element. Returns the value of the
     element before the operation. Implies a full memory barrier.}
    with has_side_effects = True
         can_fail = True
@@ -1685,7 +1685,7 @@ primop FetchAndByteArrayOp_Int "fetchAndIntArray#" GenPrimOp
 primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
    {Given an array, and offset in machine words, and a value to NAND,
-    atomically NAND the value to the element. Returns the value of the
+    atomically NAND the value into the element. Returns the value of the
     element before the operation. Implies a full memory barrier.}
    with has_side_effects = True
         can_fail = True
@@ -1693,7 +1693,7 @@ primop FetchNandByteArrayOp_Int "fetchNandIntArray#" GenPrimOp
 primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
    {Given an array, and offset in machine words, and a value to OR,
-    atomically OR the value to the element. Returns the value of the
+    atomically OR the value into the element. Returns the value of the
     element before the operation. Implies a full memory barrier.}
    with has_side_effects = True
         can_fail = True
@@ -1701,7 +1701,7 @@ primop FetchOrByteArrayOp_Int "fetchOrIntArray#" GenPrimOp
 primop FetchXorByteArrayOp_Int "fetchXorIntArray#" GenPrimOp
    MutableByteArray# s -> Int# -> Int# -> State# s -> (# State# s, Int# #)
    {Given an array, and offset in machine words, and a value to XOR,
-    atomically XOR the value to the element. Returns the value of the
+    atomically XOR the value into the element. Returns the value of the
     element before the operation. Implies a full memory barrier.}
    with has_side_effects = True
         can_fail = True
@@ -2121,6 +2121,67 @@ primop  CasAddrOp_Word "atomicCasWordAddr#" GenPrimOp
    with has_side_effects = True
         can_fail         = True
 
+primop FetchAddAddrOp_Word "fetchAddWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to add,
+    atomically add the value to the element. Returns the value of the
+    element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop FetchSubAddrOp_Word "fetchSubWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to subtract,
+    atomically subtract the value from the element. Returns the value of
+    the element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop FetchAndAddrOp_Word "fetchAndWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to AND,
+    atomically AND the value into the element. Returns the value of the
+    element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop FetchNandAddrOp_Word "fetchNandWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to NAND,
+    atomically NAND the value into the element. Returns the value of the
+    element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop FetchOrAddrOp_Word "fetchOrWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to OR,
+    atomically OR the value into the element. Returns the value of the
+    element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop FetchXorAddrOp_Word "fetchXorWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> (# State# s, Word# #)
+   {Given an address, and a value to XOR,
+    atomically XOR the value into the element. Returns the value of the
+    element before the operation. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop  AtomicReadAddrOp_Word "atomicReadWordAddr#" GenPrimOp
+   Addr# -> State# s -> (# State# s, Word# #)
+   {Given an address, read a machine word.  Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+primop  AtomicWriteAddrOp_Word "atomicWriteWordAddr#" GenPrimOp
+   Addr# -> Word# -> State# s -> State# s
+   {Given an address, write a machine word. Implies a full memory barrier.}
+   with has_side_effects = True
+        can_fail = True
+
+
 ------------------------------------------------------------------------
 section "Mutable variables"
         {Operations on MutVar\#s.}
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index 4480848b90..3883d44717 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -2121,12 +2121,12 @@ genCCall is32Bit (PrimTarget (MO_AtomicRMW width amop))
         -- final move should go away, because it's the last use of arg
         -- and the first use of dst_r.
         AMO_Add  -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
-                                  , MOV format (OpReg arg) (OpReg dst_r)
-                                  ], bid)
+                                   , MOV format (OpReg arg) (OpReg dst_r)
+                                   ], bid)
         AMO_Sub  -> return $ (toOL [ NEGI format (OpReg arg)
-                                  , LOCK (XADD format (OpReg arg) (OpAddr amode))
-                                  , MOV format (OpReg arg) (OpReg dst_r)
-                                  ], bid)
+                                   , LOCK (XADD format (OpReg arg) (OpAddr amode))
+                                   , MOV format (OpReg arg) (OpReg dst_r)
+                                   ], bid)
         -- In these cases we need a new block id, and have to return it so
         -- that later instruction selection can reference it.
         AMO_And  -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index 099a3850dc..625e76f085 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -850,6 +850,25 @@ emitPrimOp dflags primop = case primop of
     emitPrimCall [res] (MO_Xchg (wordWidth platform)) [src, value]
   InterlockedExchange_Word -> \[src, value] -> opIntoRegs $ \[res] ->
     emitPrimCall [res] (MO_Xchg (wordWidth platform)) [src, value]
+
+  FetchAddAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_Add addr (bWord platform) n
+  FetchSubAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_Sub addr (bWord platform) n
+  FetchAndAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_And addr (bWord platform) n
+  FetchNandAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_Nand addr (bWord platform) n
+  FetchOrAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_Or addr (bWord platform) n
+  FetchXorAddrOp_Word -> \[addr, n] -> opIntoRegs $ \[res] ->
+    doAtomicAddrRMW res AMO_Xor addr (bWord platform) n
+
+  AtomicReadAddrOp_Word -> \[addr] -> opIntoRegs $ \[res] ->
+    doAtomicReadAddr res addr (bWord platform)
+  AtomicWriteAddrOp_Word -> \[addr, val] -> opIntoRegs $ \[] ->
+    doAtomicWriteAddr addr (bWord platform) val
+
   CasAddrOp_Addr -> \[dst, expected, new] -> opIntoRegs $ \[res] ->
     emitPrimCall [res] (MO_Cmpxchg (wordWidth platform)) [dst, expected, new]
   CasAddrOp_Word -> \[dst, expected, new] -> opIntoRegs $ \[res] ->
@@ -1040,17 +1059,17 @@ emitPrimOp dflags primop = case primop of
 
 -- Atomic read-modify-write
   FetchAddByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_Add mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_Add mba ix (bWord platform) n
   FetchSubByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_Sub mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_Sub mba ix (bWord platform) n
   FetchAndByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_And mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_And mba ix (bWord platform) n
   FetchNandByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_Nand mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_Nand mba ix (bWord platform) n
   FetchOrByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_Or mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_Or mba ix (bWord platform) n
   FetchXorByteArrayOp_Int -> \[mba, ix, n] -> opIntoRegs $ \[res] ->
-    doAtomicRMW res AMO_Xor mba ix (bWord platform) n
+    doAtomicByteArrayRMW res AMO_Xor mba ix (bWord platform) n
   AtomicReadByteArrayOp_Int -> \[mba, ix] -> opIntoRegs $ \[res] ->
     doAtomicReadByteArray res mba ix (bWord platform)
   AtomicWriteByteArrayOp_Int -> \[mba, ix, val] -> opIntoRegs $ \[] ->
@@ -2855,22 +2874,33 @@ doWriteSmallPtrArrayOp addr idx val = do
 -- | Emit an atomic modification to a byte array element. The result
 -- reg contains that previous value of the element. Implies a full
 -- memory barrier.
-doAtomicRMW :: LocalReg      -- ^ Result reg
+doAtomicByteArrayRMW
+            :: LocalReg      -- ^ Result reg
             -> AtomicMachOp  -- ^ Atomic op (e.g. add)
             -> CmmExpr       -- ^ MutableByteArray#
             -> CmmExpr       -- ^ Index
             -> CmmType       -- ^ Type of element by which we are indexing
             -> CmmExpr       -- ^ Op argument (e.g. amount to add)
             -> FCode ()
-doAtomicRMW res amop mba idx idx_ty n = do
+doAtomicByteArrayRMW res amop mba idx idx_ty n = do
     profile <- getProfile
     platform <- getPlatform
     let width = typeWidth idx_ty
         addr  = cmmIndexOffExpr platform (arrWordsHdrSize profile)
                 width mba idx
+    doAtomicAddrRMW res amop addr idx_ty n
+
+doAtomicAddrRMW
+            :: LocalReg      -- ^ Result reg
+            -> AtomicMachOp  -- ^ Atomic op (e.g. add)
+            -> CmmExpr       -- ^ Addr#
+            -> CmmType       -- ^ Pointed value type
+            -> CmmExpr       -- ^ Op argument (e.g. amount to add)
+            -> FCode ()
+doAtomicAddrRMW res amop addr ty n = do
     emitPrimCall
         [ res ]
-        (MO_AtomicRMW width amop)
+        (MO_AtomicRMW (typeWidth ty) amop)
         [ addr, n ]
 
 -- | Emit an atomic read to a byte array that acts as a memory barrier.
@@ -2886,9 +2916,18 @@ doAtomicReadByteArray res mba idx idx_ty = do
     let width = typeWidth idx_ty
         addr  = cmmIndexOffExpr platform (arrWordsHdrSize profile)
                 width mba idx
+    doAtomicReadAddr res addr idx_ty
+
+-- | Emit an atomic read to an address that acts as a memory barrier.
+doAtomicReadAddr
+    :: LocalReg  -- ^ Result reg
+    -> CmmExpr   -- ^ Addr#
+    -> CmmType   -- ^ Type of element by which we are indexing
+    -> FCode ()
+doAtomicReadAddr res addr ty = do
     emitPrimCall
         [ res ]
-        (MO_AtomicRead width)
+        (MO_AtomicRead (typeWidth ty))
         [ addr ]
 
 -- | Emit an atomic write to a byte array that acts as a memory barrier.
@@ -2904,9 +2943,18 @@ doAtomicWriteByteArray mba idx idx_ty val = do
     let width = typeWidth idx_ty
         addr  = cmmIndexOffExpr platform (arrWordsHdrSize profile)
                 width mba idx
+    doAtomicWriteAddr addr idx_ty val
+
+-- | Emit an atomic write to an address that acts as a memory barrier.
+doAtomicWriteAddr
+    :: CmmExpr   -- ^ Addr#
+    -> CmmType   -- ^ Type of element by which we are indexing
+    -> CmmExpr   -- ^ Value to write
+    -> FCode ()
+doAtomicWriteAddr addr ty val = do
     emitPrimCall
         [ {- no results -} ]
-        (MO_AtomicWrite width)
+        (MO_AtomicWrite (typeWidth ty))
         [ addr, val ]
 
 doCasByteArray
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
index aeed9eaab6..83e5b514f0 100644
--- a/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
+++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.hs
@@ -13,61 +13,49 @@ import GHC.Exts
 import GHC.IO
 
 -- | Iterations per worker.
-iters :: Int
+iters :: Word
 iters = 1000000
 
 main :: IO ()
 main = do
+    -- ByteArray#
     fetchAddSubTest
     fetchAndTest
     fetchNandTest
     fetchOrTest
     fetchXorTest
     casTest
-    casTestAddr
     readWriteTest
-
--- | Test fetchAddIntArray# by having two threads concurrenctly
+    -- Addr#
+    fetchAddSubAddrTest
+    fetchAndAddrTest
+    fetchNandAddrTest
+    fetchOrAddrTest
+    fetchXorAddrTest
+    casAddrTest
+    readWriteAddrTest
+
+loop :: Word -> IO () -> IO ()
+loop 0 act = return ()
+loop n act = act >> loop (n-1) act
+
+-- | Test fetchAddIntArray# by having two threads concurrently
 -- increment a counter and then checking the sum at the end.
 fetchAddSubTest :: IO ()
 fetchAddSubTest = do
     tot <- race 0
-        (\ mba -> work fetchAddIntArray mba iters 2)
-        (\ mba -> work fetchSubIntArray mba iters 1)
+        (\ mba -> loop iters $ fetchAddIntArray mba 0 2)
+        (\ mba -> loop iters $ fetchSubIntArray mba 0 1)
     assertEq 1000000 tot "fetchAddSubTest"
-  where
-    work :: (MByteArray -> Int -> Int -> IO ()) -> MByteArray -> Int -> Int
-         -> IO ()
-    work op mba 0 val = return ()
-    work op mba n val = op mba 0 val >> work op mba (n-1) val
 
--- | Test fetchXorIntArray# by having two threads concurrenctly XORing
--- and then checking the result at the end. Works since XOR is
--- commutative.
---
--- Covers the code paths for AND, NAND, and OR as well.
-fetchXorTest :: IO ()
-fetchXorTest = do
-    res <- race n0
-        (\ mba -> work mba iters t1pat)
-        (\ mba -> work mba iters t2pat)
-    assertEq expected res "fetchXorTest"
-  where
-    work :: MByteArray -> Int -> Int -> IO ()
-    work mba 0 val = return ()
-    work mba n val = fetchXorIntArray mba 0 val >> work mba (n-1) val
-
-    -- The two patterns are 1010...  and 0101...  The second pattern is larger
-    -- than maxBound, avoid warnings by initialising as a Word.
-    (n0, t1pat, t2pat)
-        | sizeOf (undefined :: Int) == 8 =
-            ( 0x00000000ffffffff, 0x5555555555555555
-            , fromIntegral (0x9999999999999999 :: Word))
-        | otherwise = ( 0x0000ffff, 0x55555555
-                      , fromIntegral (0x99999999 :: Word))
-    expected
-        | sizeOf (undefined :: Int) == 8 = 4294967295
-        | otherwise = 65535
+-- | Test fetchAddWordAddr# by having two threads concurrently
+-- increment a counter and then checking the sum at the end.
+fetchAddSubAddrTest :: IO ()
+fetchAddSubAddrTest = do
+    tot <- raceAddr 0
+        (\ addr -> loop iters $ fetchAddWordPtr addr 2)
+        (\ addr -> loop iters $ fetchSubWordPtr addr 1)
+    assertEq 1000000 tot "fetchAddSubAddrTest"
 
 -- The tests for AND, NAND, and OR are trivial for two reasons:
 --
@@ -81,71 +69,132 @@ fetchXorTest = do
 -- Right now we only test that they return the correct value for a
 -- single op on each thread.
 
--- | Test an associative operation.
-fetchOpTest :: (MByteArray -> Int -> Int -> IO ())
-            -> Int -> String -> IO ()
-fetchOpTest op expected name = do
-    res <- race n0
-        (\ mba -> work mba t1pat)
-        (\ mba -> work mba t2pat)
-    assertEq expected res name
-  where
-    work :: MByteArray -> Int -> IO ()
-    work mba val = op mba 0 val
-
 -- | Initial value and operation arguments for race test.
 --
 -- The two patterns are 1010...  and 0101...  The second pattern is larger than
 -- maxBound, avoid warnings by initialising as a Word.
-n0, t1pat, t2pat :: Int
+n0, t1pat, t2pat :: Word
 (n0, t1pat, t2pat)
-    | sizeOf (undefined :: Int) == 8 =
-        ( 0x00000000ffffffff, 0x5555555555555555
-        , fromIntegral (0x9999999999999999 :: Word))
-    | otherwise = ( 0x0000ffff, 0x55555555
-                  , fromIntegral (0x99999999 :: Word))
+    | sizeOf (undefined :: Word) == 8
+    = (0x00000000ffffffff, 0x5555555555555555, 0x9999999999999999)
+    | otherwise
+    = (0x0000ffff, 0x55555555, 0x99999999)
+
+-- | Test an associative operation.
+fetchOpTest :: (MByteArray -> Int -> Int -> IO ())
+            -> Int -> String -> IO ()
+fetchOpTest op expected name = do
+    res <- race (fromIntegral n0)
+        (\ mba -> op mba 0 (fromIntegral t1pat))
+        (\ mba -> op mba 0 (fromIntegral t2pat))
+    assertEq expected res name
 
 fetchAndTest :: IO ()
 fetchAndTest = fetchOpTest fetchAndIntArray expected "fetchAndTest"
   where expected
-            | sizeOf (undefined :: Int) == 8 = 286331153
+            | sizeOf (undefined :: Word) == 8 = 286331153
             | otherwise = 4369
 
+fetchOrTest :: IO ()
+fetchOrTest = fetchOpTest fetchOrIntArray expected "fetchOrTest"
+  where expected
+            | sizeOf (undefined :: Word) == 8
+            = fromIntegral (15987178197787607039 :: Word)
+            | otherwise
+            = fromIntegral (3722313727 :: Word)
+
 -- | Test NAND without any race, as NAND isn't associative.
 fetchNandTest :: IO ()
 fetchNandTest = do
-    mba <- newByteArray (sizeOf (undefined :: Int))
-    writeIntArray mba 0 n0
-    fetchNandIntArray mba 0 t1pat
-    fetchNandIntArray mba 0 t2pat
+    mba <- newByteArray (sizeOf (undefined :: Word))
+    writeIntArray mba 0 (fromIntegral n0)
+    fetchNandIntArray mba 0 (fromIntegral t1pat)
+    fetchNandIntArray mba 0 (fromIntegral t2pat)
     res <- readIntArray mba 0
     assertEq expected res "fetchNandTest"
   where expected
-            | sizeOf (undefined :: Int) == 8 = 7378697629770151799
+            | sizeOf (undefined :: Word) == 8 = 7378697629770151799
             | otherwise = -2576976009
 
-fetchOrTest :: IO ()
-fetchOrTest = fetchOpTest fetchOrIntArray expected "fetchOrTest"
+-- | Test fetchXorIntArray# by having two threads concurrently XORing
+-- and then checking the result at the end. Works since XOR is
+-- commutative.
+--
+-- Covers the code paths for AND, NAND, and OR as well.
+fetchXorTest :: IO ()
+fetchXorTest = do
+    res <- race (fromIntegral n0)
+        (\mba -> loop iters $ fetchXorIntArray mba 0 (fromIntegral t1pat))
+        (\mba -> loop iters $ fetchXorIntArray mba 0 (fromIntegral t2pat))
+    assertEq expected res "fetchXorTest"
+  where
+    expected
+        | sizeOf (undefined :: Word) == 8 = 4294967295
+        | otherwise = 65535
+
+
+-- | Test an associative operation.
+fetchOpAddrTest :: (Ptr Word -> Word -> IO ()) -> Word -> String -> IO ()
+fetchOpAddrTest op expected name = do
+    res <- raceAddr n0
+        (\ptr -> op ptr t1pat)
+        (\ptr -> op ptr t2pat)
+    assertEq expected res name
+
+fetchAndAddrTest :: IO ()
+fetchAndAddrTest = fetchOpAddrTest fetchAndWordPtr expected "fetchAndAddrTest"
   where expected
-            | sizeOf (undefined :: Int) == 8
-            = fromIntegral (15987178197787607039 :: Word)
+            | sizeOf (undefined :: Word) == 8 = 286331153
+            | otherwise = 4369
+
+fetchOrAddrTest :: IO ()
+fetchOrAddrTest = fetchOpAddrTest fetchOrWordPtr expected "fetchOrAddrTest"
+  where expected
+            | sizeOf (undefined :: Word) == 8
+            = 15987178197787607039
             | otherwise
-            = fromIntegral (3722313727 :: Word)
+            = 3722313727
+
+
+-- | Test NAND without any race, as NAND isn't associative.
+fetchNandAddrTest :: IO ()
+fetchNandAddrTest = do
+    ptr <- castPtr <$> callocBytes (sizeOf (undefined :: Word))
+    poke ptr n0
+    fetchNandWordPtr ptr t1pat
+    fetchNandWordPtr ptr t2pat
+    res <- peek ptr
+    assertEq expected res "fetchNandAddrTest"
+  where expected
+            | sizeOf (undefined :: Word) == 8 = 7378697629770151799
+            | otherwise = -2576976009
+
+-- | Test fetchXorIntArray# by having two threads concurrently XORing
+-- and then checking the result at the end. Works since XOR is
+-- commutative.
+--
+-- Covers the code paths for AND, NAND, and OR as well.
+fetchXorAddrTest :: IO ()
+fetchXorAddrTest = do
+    res <- raceAddr n0
+        (\ptr -> loop iters $ fetchXorWordPtr ptr t1pat)
+        (\ptr -> loop iters $ fetchXorWordPtr ptr t2pat)
+    assertEq expected res "fetchXorAddrTest"
+  where
+    expected
+        | sizeOf (undefined :: Int) == 8 = 4294967295
+        | otherwise = 65535
 
 -- | Test casIntArray# by using it to emulate fetchAddIntArray# and
--- then having two threads concurrenctly increment a counter,
+-- then having two threads concurrently increment a counter,
 -- checking the sum at the end.
 casTest :: IO ()
 casTest = do
     tot <- race 0
-        (\ mba -> work mba iters 1)
-        (\ mba -> work mba iters 2)
-    assertEq (3 * iters) tot "casTest"
+        (\ mba -> loop iters $ add mba 0 1)
+        (\ mba -> loop iters $ add mba 0 2)
+    assertEq (3 * fromIntegral iters) tot "casTest"
   where
-    work :: MByteArray -> Int -> Int -> IO ()
-    work mba 0 val = return ()
-    work mba n val = add mba 0 val >> work mba (n-1) val
-
     -- Fetch-and-add implemented using CAS.
     add :: MByteArray -> Int -> Int -> IO ()
     add mba ix n = do
@@ -153,6 +202,24 @@ casTest = do
         old' <- casIntArray mba ix old (old + n)
         when (old /= old') $ add mba ix n
 
+-- | Test atomicCasWordAddr# by having two threads concurrently increment a
+-- counter, checking the sum at the end.
+casAddrTest :: IO ()
+casAddrTest = do
+    tot <- raceAddr 0
+        (\ addr -> loop iters $ add addr 1)
+        (\ addr -> loop iters $ add addr 2)
+    assertEq (3 * iters) tot "casAddrTest"
+  where
+    -- Fetch-and-add implemented using CAS.
+    add :: Ptr Word -> Word -> IO ()
+    add ptr n = peek ptr >>= go
+      where
+        go old = do
+            old' <- atomicCasWordPtr ptr old (old + n)
+            when (old /= old') $ go old'
+
+
 -- | Tests atomic reads and writes by making sure that one thread sees
 -- updates that are done on another. This test isn't very good at the
 -- moment, as this might work even without atomic ops, but at least it
@@ -172,6 +239,21 @@ readWriteTest = do
     putMVar latch ()
     takeMVar done
 
+readWriteAddrTest :: IO ()
+readWriteAddrTest = do
+    ptr <- castPtr <$> callocBytes (sizeOf (undefined :: Word))
+    poke ptr 0
+    latch <- newEmptyMVar
+    done <- newEmptyMVar
+    forkIO $ do
+        takeMVar latch
+        n <- atomicReadWordPtr ptr
+        assertEq 1 n "readWriteAddrTest"
+        putMVar done ()
+    atomicWriteWordPtr ptr 1
+    putMVar latch ()
+    takeMVar done
+
 -- | Create two threads that mutate the byte array passed to them
 -- concurrently. The array is one word large.
 race :: Int                    -- ^ Initial value of array element
@@ -188,44 +270,21 @@ race n0 thread1 thread2 = do
     mapM_ takeMVar [done1, done2]
     readIntArray mba 0
 
--- | Test atomicCasWordAddr# by having two threads concurrenctly increment a
--- counter, checking the sum at the end.
-casTestAddr :: IO ()
-casTestAddr = do
-    tot <- raceAddr 0
-        (\ addr -> work addr (fromIntegral iters) 1)
-        (\ addr -> work addr (fromIntegral iters) 2)
-    assertEq (3 * fromIntegral iters) tot "casTestAddr"
-  where
-    work :: Ptr Word -> Word -> Word -> IO ()
-    work ptr 0 val = return ()
-    work ptr n val = add ptr val >> work ptr (n-1) val
-
-    -- Fetch-and-add implemented using CAS.
-    add :: Ptr Word -> Word -> IO ()
-    add ptr n = peek ptr >>= go
-      where
-        go old = do
-            old' <- atomicCasWordPtr ptr old (old + n)
-            when (old /= old') $ go old'
-
-    -- | Create two threads that mutate the byte array passed to them
-    -- concurrently. The array is one word large.
-    raceAddr :: Word                -- ^ Initial value of array element
-            -> (Ptr Word -> IO ())  -- ^ Thread 1 action
-            -> (Ptr Word -> IO ())  -- ^ Thread 2 action
-            -> IO Word              -- ^ Final value of array element
-    raceAddr n0 thread1 thread2 = do
-        done1 <- newEmptyMVar
-        done2 <- newEmptyMVar
-        ptr <- asWordPtr <$> callocBytes (sizeOf (undefined :: Word))
-        forkIO $ thread1 ptr >> putMVar done1 ()
-        forkIO $ thread2 ptr >> putMVar done2 ()
-        mapM_ takeMVar [done1, done2]
-        peek ptr
-      where
-        asWordPtr :: Ptr a -> Ptr Word
-        asWordPtr = castPtr
+-- | Create two threads that mutate the byte array passed to them
+-- concurrently. The array is one word large.
+raceAddr :: Word                -- ^ Initial value of array element
+        -> (Ptr Word -> IO ())  -- ^ Thread 1 action
+        -> (Ptr Word -> IO ())  -- ^ Thread 2 action
+        -> IO Word              -- ^ Final value of array element
+raceAddr n0 thread1 thread2 = do
+    done1 <- newEmptyMVar
+    done2 <- newEmptyMVar
+    ptr <- castPtr <$> callocBytes (sizeOf (undefined :: Word))
+    poke ptr n0
+    forkIO $ thread1 ptr >> putMVar done1 ()
+    forkIO $ thread2 ptr >> putMVar done2 ()
+    mapM_ takeMVar [done1, done2]
+    peek ptr
 
 ------------------------------------------------------------------------
 -- Test helper
@@ -306,6 +365,46 @@ casIntArray (MBA mba#) (I# ix#) (I# old#) (I# new#) = IO $ \ s# ->
 ------------------------------------------------------------------------
 -- Wrappers around Addr#
 
+fetchAddWordPtr :: Ptr Word -> Word -> IO ()
+fetchAddWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchAddWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+fetchSubWordPtr :: Ptr Word -> Word -> IO ()
+fetchSubWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchSubWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+fetchAndWordPtr :: Ptr Word -> Word -> IO ()
+fetchAndWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchAndWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+fetchOrWordPtr :: Ptr Word -> Word -> IO ()
+fetchOrWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchOrWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+fetchNandWordPtr :: Ptr Word -> Word -> IO ()
+fetchNandWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchNandWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+fetchXorWordPtr :: Ptr Word -> Word -> IO ()
+fetchXorWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case fetchXorWordAddr# addr# n# s# of
+        (# s2#, _ #) -> (# s2#, () #)
+
+atomicWriteWordPtr :: Ptr Word -> Word -> IO ()
+atomicWriteWordPtr (Ptr addr#) (W# n#) = IO $ \ s# ->
+    case atomicWriteWordAddr# addr# n# s# of
+        s2# -> (# s2#, () #)
+
+atomicReadWordPtr :: Ptr Word -> IO Word
+atomicReadWordPtr (Ptr addr#) = IO $ \ s# ->
+    case atomicReadWordAddr# addr# s# of
+        (# s2#, n# #) -> (# s2#, W# n# #)
+
 -- Should this be added to Foreign.Storable?  Similar to poke, but does the
 -- update atomically.
 atomicCasWordPtr :: Ptr Word -> Word -> Word -> IO Word
diff --git a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
index c9ea7ee500..b09c2a8eaa 100644
--- a/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
+++ b/testsuite/tests/concurrent/should_run/AtomicPrimops.stdout
@@ -4,5 +4,11 @@ fetchNandTest: OK
 fetchOrTest: OK
 fetchXorTest: OK
 casTest: OK
-casTestAddr: OK
 readWriteTest: OK
+fetchAddSubAddrTest: OK
+fetchAndAddrTest: OK
+fetchNandAddrTest: OK
+fetchOrAddrTest: OK
+fetchXorAddrTest: OK
+casAddrTest: OK
+readWriteAddrTest: OK