author    | Andreas Klebinger <klebinger.andreas@gmx.at> | 2019-10-15 00:58:12 +0200
committer | Andreas Klebinger <klebinger.andreas@gmx.at> | 2019-11-12 15:31:27 +0100
commit    | ec962cb95fb6f288f9a449f3c5da8998afb636ba
tree      | 1ecd5b3a612f5650d6a3d7d56fdaaaeebc8a6853
parent    | dde5c06a9ac205b4edbff679c682a4ed0fedbc2e
Backport fixes from !1953 fixing #17334. (branch: wip/andreask/1953_backport)
-rw-r--r-- | compiler/nativeGen/AsmCodeGen.hs                 |  47
-rw-r--r-- | compiler/nativeGen/BlockLayout.hs                |  28
-rw-r--r-- | compiler/nativeGen/CFG.hs                        |  78
-rw-r--r-- | compiler/nativeGen/NCGMonad.hs                   |  15
-rw-r--r-- | compiler/nativeGen/RegAlloc/Liveness.hs          |   2
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs                | 682
-rw-r--r-- | compiler/nativeGen/X86/Instr.hs                  |   4
-rw-r--r-- | testsuite/tests/codeGen/should_compile/T17334.hs | 240
8 files changed, 830 insertions, 266 deletions
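The heart of the patch (see Note [Keeping track of the current block] in the X86/CodeGen.hs hunks below) is that statement translation now reports the basic block in which subsequent statements will be placed, so the separately maintained CFG stays in lockstep with the emitted instructions. A condensed sketch of that shape, simplified from the new stmtsToInstrs in the diff below (not a verbatim copy of the patch):

    -- Each statement may open new basic blocks (e.g. the cmpxchg retry loop
    -- generated for MO_AtomicRMW). If it does, all following statements are
    -- translated relative to the block it ends in.
    stmtsToInstrs :: BlockId                    -- block the statements start in
                  -> [CmmNode O O]
                  -> NatM (InstrBlock, BlockId) -- instructions, final block
    stmtsToInstrs bid stmts = go bid stmts nilOL
      where
        go bid []       acc = return (acc, bid)
        go bid (s:rest) acc = do
          (is, mbid) <- stmtToInstrs bid s
          -- Continue in the new block if the statement introduced one.
          go (fromMaybe bid mbid) rest (acc `appOL` is)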
diff --git a/compiler/nativeGen/AsmCodeGen.hs b/compiler/nativeGen/AsmCodeGen.hs index 0c21bc0641..cb2fb5476a 100644 --- a/compiler/nativeGen/AsmCodeGen.hs +++ b/compiler/nativeGen/AsmCodeGen.hs @@ -550,6 +550,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count = do let platform = targetPlatform dflags + let proc_name = case cmm of + (CmmProc _ entry_label _ _) -> ppr entry_label + _ -> text "DataChunk" + -- rewrite assignments to global regs let fixed_cmm = {-# SCC "fixStgRegisters" #-} @@ -579,12 +583,11 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count Opt_D_dump_asm_native "Native code" (vcat $ map (pprNatCmmDecl ncgImpl) native) - dumpIfSet_dyn dflags - Opt_D_dump_cfg_weights "CFG Weights" - (pprEdgeWeights nativeCfgWeights) + maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name -- tag instructions with register liveness information - -- also drops dead code + -- also drops dead code. We don't keep the cfg in sync on + -- some backends, so don't use it there. let livenessCfg = if (backendMaintainsCfg dflags) then Just nativeCfgWeights else Nothing @@ -697,12 +700,13 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats) let cfgWithFixupBlks = - addNodesBetween nativeCfgWeights cfgRegAllocUpdates + pure addNodesBetween <*> livenessCfg <*> pure cfgRegAllocUpdates -- Insert stack update blocks - let postRegCFG = - foldl' (\m (from,to) -> addImmediateSuccessor from to m ) - cfgWithFixupBlks stack_updt_blks + let postRegCFG :: Maybe CFG + postRegCFG = + pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m )) <*> + cfgWithFixupBlks <*> pure stack_updt_blks ---- x86fp_kludge. This pass inserts ffree instructions to clear ---- the FPU stack on x86. The x86 ABI requires that the FPU stack @@ -729,11 +733,9 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count shortcutBranches dflags ncgImpl tabled postRegCFG let optimizedCFG = - optimizeCFG (cfgWeightInfo dflags) cmm postShortCFG + optimizeCFG (cfgWeightInfo dflags) cmm <$> postShortCFG - dumpIfSet_dyn dflags - Opt_D_dump_cfg_weights "CFG Final Weights" - ( pprEdgeWeights optimizedCFG ) + maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name --TODO: Partially check validity of the cfg. let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks @@ -743,8 +745,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count (gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do let blocks = concatMap getBlks shorted let labels = setFromList $ fmap blockId blocks :: LabelSet - return $! seq (sanityCheckCfg optimizedCFG labels $ - text "cfg not in lockstep") () + return $! 
seq (pure sanityCheckCfg <*> optimizedCFG <*> pure labels <*> + pure (text "cfg not in lockstep")) () ---- sequence blocks let sequenced :: [NatCmmDecl statics instr] @@ -761,6 +763,8 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count {-# SCC "invertCondBranches" #-} map invert sequenced where + invertConds :: LabelMap CmmStatics -> [NatBasicBlock instr] + -> [NatBasicBlock instr] invertConds = (invertCondBranches ncgImpl) optimizedCFG invert top@CmmData {} = top invert (CmmProc info lbl live (ListGraph blocks)) = @@ -793,6 +797,15 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count , ppr_raStatsLinear , unwinds ) +maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO () +maybeDumpCfg _dflags Nothing _ _ = return () +maybeDumpCfg dflags (Just cfg) msg proc_name + | null cfg = return () + | otherwise + = dumpIfSet_dyn + dflags Opt_D_dump_cfg_weights msg + (proc_name <> char ':' $$ pprEdgeWeights cfg) + -- | Make sure all blocks we want the layout algorithm to place have been placed. checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr] @@ -917,13 +930,13 @@ shortcutBranches :: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags -> NcgImpl statics instr jumpDest -> [NatCmmDecl statics instr] - -> CFG - -> ([NatCmmDecl statics instr],CFG) + -> Maybe CFG + -> ([NatCmmDecl statics instr],Maybe CFG) shortcutBranches dflags ncgImpl tops weights | gopt Opt_AsmShortcutting dflags = ( map (apply_mapping ncgImpl mapping) tops' - , shortcutWeightMap weights mappingBid ) + , shortcutWeightMap mappingBid <$!> weights ) | otherwise = (tops, weights) where diff --git a/compiler/nativeGen/BlockLayout.hs b/compiler/nativeGen/BlockLayout.hs index 72aea5bf10..8dff0952fc 100644 --- a/compiler/nativeGen/BlockLayout.hs +++ b/compiler/nativeGen/BlockLayout.hs @@ -639,29 +639,31 @@ dropJumps info ((BasicBlock lbl ins):todo) sequenceTop :: (Instruction instr, Outputable instr) - => DynFlags --Use new layout code - -> NcgImpl statics instr jumpDest -> CFG - -> NatCmmDecl statics instr -> NatCmmDecl statics instr + => DynFlags -- Determine which layout algo to use + -> NcgImpl statics instr jumpDest + -> Maybe CFG -- ^ CFG if we have one. 
+ -> NatCmmDecl statics instr -- ^ Function to serialize + -> NatCmmDecl statics instr sequenceTop _ _ _ top@(CmmData _ _) = top sequenceTop dflags ncgImpl edgeWeights (CmmProc info lbl live (ListGraph blocks)) | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags --Use chain based algorithm + , Just cfg <- edgeWeights = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ - sequenceChain info edgeWeights blocks ) + {-# SCC layoutBlocks #-} + sequenceChain info cfg blocks ) | otherwise --Use old algorithm - = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ - sequenceBlocks cfg info blocks) + = let cfg = if dontUseCfg then Nothing else edgeWeights + in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $ + {-# SCC layoutBlocks #-} + sequenceBlocks cfg info blocks) where - cfg - | (gopt Opt_WeightlessBlocklayout dflags) || - (not $ backendMaintainsCfg dflags) - -- Don't make use of cfg in the old algorithm - = Nothing - -- Use cfg in the old algorithm - | otherwise = Just edgeWeights + dontUseCfg = gopt Opt_WeightlessBlocklayout dflags || + (not $ backendMaintainsCfg dflags) + -- The old algorithm: -- It is very simple (and stupid): We make a graph out of diff --git a/compiler/nativeGen/CFG.hs b/compiler/nativeGen/CFG.hs index 155e5bcac4..cf155364e7 100644 --- a/compiler/nativeGen/CFG.hs +++ b/compiler/nativeGen/CFG.hs @@ -61,8 +61,6 @@ import qualified DynFlags as D import Data.List --- import qualified Data.IntMap.Strict as M --TODO: LabelMap - type Edge = (BlockId, BlockId) type Edges = [Edge] @@ -76,6 +74,13 @@ instance Outputable EdgeWeight where type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo) -- | A control flow graph where edges have been annotated with a weight. +-- Implemented as IntMap (IntMap <edgeData>) +-- We must uphold the invariant that for each edge A -> B we must have: +-- A entry B in the outer map. +-- A entry B in the map we get when looking up A. +-- Maintaining this invariant is useful as any failed lookup now indicates +-- an actual error in code which might go unnoticed for a while +-- otherwise. type CFG = EdgeInfoMap EdgeInfo data CfgEdge @@ -144,11 +149,20 @@ adjustEdgeWeight cfg f from to = addEdge from to (info { edgeWeight = f weight}) cfg | otherwise = cfg + getCfgNodes :: CFG -> LabelSet getCfgNodes m = mapFoldMapWithKey (\k v -> setFromList (k:mapKeys v)) m +-- | Is this block part of this graph? hasNode :: CFG -> BlockId -> Bool -hasNode m node = mapMember node m || any (mapMember node) m +hasNode m node = + -- Check the invariant that each node must exist in the first map or not at all. + ASSERT( found || not (any (mapMember node) m)) + found + where + found = mapMember node m + + -- | Check if the nodes in the cfg and the set of blocks are the same. -- In a case of a missmatch we panic and show the difference. @@ -160,7 +174,7 @@ sanityCheckCfg m blockSet msg pprPanic "Block list and cfg nodes don't match" ( text "difference:" <+> ppr diff $$ text "blocks:" <+> ppr blockSet $$ - text "cfg:" <+> ppr m $$ + text "cfg:" <+> pprEdgeWeights m $$ msg ) False where @@ -224,8 +238,8 @@ This function (shortcutWeightMap) takes the same mapping and applies the mapping to the CFG in the way layed out above. 
-} -shortcutWeightMap :: CFG -> LabelMap (Maybe BlockId) -> CFG -shortcutWeightMap cfg cuts = +shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG +shortcutWeightMap cuts cfg = foldl' applyMapping cfg $ mapToList cuts where -- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting] @@ -259,7 +273,7 @@ shortcutWeightMap cfg cuts = -- \ \ -- -> C => -> C -- -addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG +addImmediateSuccessor :: HasDebugCallStack => BlockId -> BlockId -> CFG -> CFG addImmediateSuccessor node follower cfg = updateEdges . addWeightEdge node follower uncondWeight $ cfg where @@ -275,10 +289,16 @@ addImmediateSuccessor node follower cfg -- | Adds a new edge, overwrites existing edges if present addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG addEdge from to info cfg = - mapAlter addDest from cfg + mapAlter addFromToEdge from $ + mapAlter addDestNode to cfg where - addDest Nothing = Just $ mapSingleton to info - addDest (Just wm) = Just $ mapInsert to info wm + -- Simply insert the edge into the edge list. + addFromToEdge Nothing = Just $ mapSingleton to info + addFromToEdge (Just wm) = Just $ mapInsert to info wm + -- We must add the destination node explicitly as well + addDestNode Nothing = Just $ mapEmpty + addDestNode n@(Just _) = n + -- | Adds a edge with the given weight to the cfg -- If there already existed an edge it is overwritten. @@ -304,8 +324,11 @@ getSuccEdgesSorted m bid = sortedEdges -- | Get successors of a given node with edge weights. -getSuccessorEdges :: CFG -> BlockId -> [(BlockId,EdgeInfo)] -getSuccessorEdges m bid = maybe [] mapToList $ mapLookup bid m +getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)] +getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m) + where + lookupError = pprPanic "getSuccessorEdges: Block does not exist" $ + ppr bid $$ text "CFG:" <+> pprEdgeWeights m getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo getEdgeInfo from to m @@ -316,12 +339,13 @@ getEdgeInfo from to m = Nothing reverseEdges :: CFG -> CFG -reverseEdges cfg = foldr add mapEmpty flatElems +reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg where - elems = mapToList $ fmap mapToList cfg :: [(BlockId,[(BlockId,EdgeInfo)])] - flatElems = - concatMap (\(from,ws) -> map (\(to,info) -> (to,from,info)) ws ) elems - add (to,from,info) m = addEdge to from info m + -- We preserve nodes without outgoing edges! + addNode :: CFG -> BlockId -> CFG + addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg + go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG + go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG -- | Returns a unordered list of all edges with info infoEdgeList :: CFG -> [CfgEdge] @@ -347,11 +371,14 @@ edgeList m = mapFoldMapWithKey (\from toMap -> fmap (from,) (mapKeys toMap)) m -- | Get successors of a given node without edge weights. 
-getSuccessors :: CFG -> BlockId -> [BlockId] +getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId] getSuccessors m bid | Just wm <- mapLookup bid m = mapKeys wm - | otherwise = [] + | otherwise = lookupError + where + lookupError = pprPanic "getSuccessors: Block does not exist" $ + ppr bid <+> pprEdgeWeights m pprEdgeWeights :: CFG -> SDoc pprEdgeWeights m = @@ -375,6 +402,7 @@ pprEdgeWeights m = text "}\n" {-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible +-- | Invariant: The edge **must** exist already in the graph. updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG updateEdgeWeight f (from, to) cfg | Just oldInfo <- getEdgeInfo from to cfg @@ -422,7 +450,8 @@ addNodesBetween m updates = | otherwise = pprPanic "Can't find weight for edge that should have one" ( text "triple" <+> ppr (from,between,old) $$ - text "updates" <+> ppr updates ) + text "updates" <+> ppr updates $$ + text "cfg:" <+> pprEdgeWeights m ) updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG updateWeight m (from,between,old,edgeInfo) = addEdge from between edgeInfo . @@ -550,7 +579,7 @@ getCfg weights graph = blocks = revPostorder graph :: [CmmBlock] --Find back edges by BFS -findBackEdges :: BlockId -> CFG -> Edges +findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges findBackEdges root cfg = --pprTraceIt "Backedges:" $ map fst . @@ -562,7 +591,7 @@ findBackEdges root cfg = classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)] -optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG +optimizeCFG :: HasDebugCallStack => D.CfgWeights -> RawCmmDecl -> CFG -> CFG optimizeCFG _ (CmmData {}) cfg = cfg optimizeCFG weights (CmmProc info _lab _live graph) cfg = favourFewerPreds . @@ -641,16 +670,17 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg = -- | Determine loop membership of blocks based on SCC analysis -- Ideally we would replace this with a variant giving us loop -- levels instead but the SCC code will do for now. 
-loopMembers :: CFG -> LabelMap Bool +loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool loopMembers cfg = foldl' (flip setLevel) mapEmpty sccs where mkNode :: BlockId -> Node BlockId BlockId mkNode bid = DigraphNode bid bid (getSuccessors cfg bid) - nodes = map mkNode (setElems $ getCfgNodes cfg) + nodes = map mkNode $ setElems (getCfgNodes cfg) sccs = stronglyConnCompFromEdgedVerticesOrd nodes setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool setLevel (AcyclicSCC bid) m = mapInsert bid False m setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids + diff --git a/compiler/nativeGen/NCGMonad.hs b/compiler/nativeGen/NCGMonad.hs index c22a656d2a..998361a2e3 100644 --- a/compiler/nativeGen/NCGMonad.hs +++ b/compiler/nativeGen/NCGMonad.hs @@ -1,4 +1,6 @@ {-# LANGUAGE CPP #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE BangPatterns #-} -- ----------------------------------------------------------------------------- -- @@ -18,7 +20,7 @@ module NCGMonad ( addNodeBetweenNat, addImmediateSuccessorNat, updateCfgNat, - getUniqueNat, + getUniqueNat, getCfgNat, mapAccumLNat, setDeltaNat, getDeltaNat, @@ -65,6 +67,7 @@ import Instruction import Outputable (SDoc, pprPanic, ppr) import Cmm (RawCmmDecl, CmmStatics) import CFG +import Util data NcgImpl statics instr jumpDest = NcgImpl { cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr], @@ -88,7 +91,7 @@ data NcgImpl statics instr jumpDest = NcgImpl { -- the block's 'UnwindPoint's -- See Note [What is this unwinding business?] in Debug -- and Note [Unwinding information in the NCG] in this module. - invertCondBranches :: CFG -> LabelMap CmmStatics -> [NatBasicBlock instr] + invertCondBranches :: Maybe CFG -> LabelMap CmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr] -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>` -- when possible. @@ -206,7 +209,11 @@ addImportNat imp updateCfgNat :: (CFG -> CFG) -> NatM () updateCfgNat f - = NatM $ \ st -> ((), st { natm_cfg = f (natm_cfg st) }) + = NatM $ \ st -> let !cfg' = f (natm_cfg st) + in ((), st { natm_cfg = cfg'}) + +getCfgNat :: NatM CFG +getCfgNat = NatM $ \ st -> (natm_cfg st, st) -- | Record that we added a block between `from` and `old`. addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM () @@ -231,7 +238,7 @@ addNodeBetweenNat from between to -- | Place `succ` after `block` and change any edges -- block -> X to `succ` -> X -addImmediateSuccessorNat :: BlockId -> BlockId -> NatM () +addImmediateSuccessorNat :: HasDebugCallStack => BlockId -> BlockId -> NatM () addImmediateSuccessorNat block succ = updateCfgNat (addImmediateSuccessor block succ) diff --git a/compiler/nativeGen/RegAlloc/Liveness.hs b/compiler/nativeGen/RegAlloc/Liveness.hs index b7f8d1c871..9d68ca12c1 100644 --- a/compiler/nativeGen/RegAlloc/Liveness.hs +++ b/compiler/nativeGen/RegAlloc/Liveness.hs @@ -705,7 +705,7 @@ sccBlocks blocks entries mcfg = map (fmap node_payload) sccs reachable :: LabelSet reachable | Just cfg <- mcfg - -- Our CFG only contains reachable nodes by construction. + -- Our CFG only contains reachable nodes by construction at this point. 
= getCfgNodes cfg | otherwise = setFromList $ [ node_key node | node <- reachablesG g1 roots ] diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 69ab7b202d..11e79acebc 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1,4 +1,5 @@ {-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-} +{-# LANGUAGE TupleSections #-} -- The default iteration limit is a bit too low for the definitions -- in this module. @@ -36,6 +37,7 @@ import GhcPrelude import X86.Instr import X86.Cond import X86.Regs +import X86.Ppr ( ) import X86.RegInfo --TODO: Remove - Just for development/debugging @@ -130,6 +132,56 @@ cmmTopCodeGen (CmmProc info lab live graph) = do cmmTopCodeGen (CmmData sec dat) = do return [CmmData sec (1, dat)] -- no translation, we just use CmmStatic +{- Note [Verifying basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + We want to guarantee a few things about the results + of instruction selection. + + Namely that each basic blocks consists of: + * A (potentially empty) sequence of straight line instructions + followed by + * A (potentially empty) sequence of jump like instructions. + + We can verify this by going through the instructions and + making sure that any non-jumpish instruction can't appear + after a jumpish instruction. + + There are gotchas however: + * CALLs are strictly speaking control flow but here we care + not about them. Hence we treat them as regular instructions. + + It's safe for them to appear inside a basic block + as (ignoring side effects inside the call) they will result in + straight line code. + + * NEWBLOCK marks the start of a new basic block so can + be followed by any instructions. +-} + +-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally. +verifyBasicBlock :: [Instr] -> () +verifyBasicBlock instrs + | debugIsOn = go False instrs + | otherwise = () + where + go _ [] = () + go atEnd (i:instr) + = case i of + -- Start a new basic block + NEWBLOCK {} -> go False instr + -- Calls are not viable block terminators + CALL {} | atEnd -> faultyBlockWith i + | not atEnd -> go atEnd instr + -- All instructions ok, check if we reached the end and continue. + _ | not atEnd -> go (isJumpishInstr i) instr + -- Only jumps allowed at the end of basic blocks. + | otherwise -> if isJumpishInstr i + then go True instr + else faultyBlockWith i + faultyBlockWith i + = pprPanic "Non control flow instructions after end of basic block." + (ppr i <+> text "in:" $$ vcat (map ppr instrs)) basicBlockCodeGen :: CmmBlock @@ -148,9 +200,10 @@ basicBlockCodeGen block = do let line = srcSpanStartLine span; col = srcSpanStartCol span return $ unitOL $ LOCATION fileId line col name _ -> return nilOL - mid_instrs <- stmtsToInstrs id stmts - tail_instrs <- stmtToInstrs id tail + (mid_instrs,mid_bid) <- stmtsToInstrs id stmts + (tail_instrs,_) <- stmtToInstrs mid_bid tail let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs + return $! verifyBasicBlock (fromOL instrs) instrs' <- fold <$> traverse addSpUnwindings instrs -- code generation may introduce new basic block boundaries, which -- are indicated by the NEWBLOCK instruction. 
We must split up the @@ -180,60 +233,137 @@ addSpUnwindings instr@(DELTA d) = do else return (unitOL instr) addSpUnwindings instr = return $ unitOL instr -stmtsToInstrs :: BlockId -> [CmmNode e x] -> NatM InstrBlock -stmtsToInstrs bid stmts - = do instrss <- mapM (stmtToInstrs bid) stmts - return (concatOL instrss) +{- Note [Keeping track of the current block] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When generating instructions for Cmm we sometimes require +the current block for things like retry loops. + +We also sometimes change the current block, if a MachOP +results in branching control flow. + +Issues arise if we have two statements in the same block, +which both depend on the current block id *and* change the +basic block after them. This happens for atomic primops +in the X86 backend where we want to update the CFG data structure +when introducing new basic blocks. + +For example in #17334 we got this Cmm code: + + c3Bf: // global + (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18); + (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0); + _s3sT::I64 = _s3sV::I64; + goto c3B1; + +This resulted in two new basic blocks being inserted: + + c3Bf: + movl $18,%vI_n3Bo + movq 88(%vI_s3sQ),%rax + jmp _n3Bp + n3Bp: + ... + cmpxchgq %vI_n3Bq,88(%vI_s3sQ) + jne _n3Bp + ... + jmp _n3Bs + n3Bs: + ... + cmpxchgq %vI_n3Bt,88(%vI_s3sQ) + jne _n3Bs + ... + jmp _c3B1 + ... + +Based on the Cmm we called stmtToInstrs we translated both atomic operations under +the assumption they would be placed into their Cmm basic block `c3Bf`. +However for the retry loop we introduce new labels, so this is not the case +for the second statement. +This resulted in a desync between the explicit control flow graph +we construct as a separate data type and the actual control flow graph in the code. + +Instead we now return the new basic block if a statement causes a change +in the current block and use the block for all following statements. + +For this reason genCCall is also split into two parts. +One for calls which *won't* change the basic blocks in +which successive instructions will be placed. +A different one for calls which *are* known to change the +basic block. + +-} + +-- See Note [Keeping track of the current block] for why +-- we pass the BlockId. +stmtsToInstrs :: BlockId -- ^ Basic block these statement will start to be placed in. + -> [CmmNode O O] -- ^ Cmm Statement + -> NatM (InstrBlock, BlockId) -- ^ Resulting instruction +stmtsToInstrs bid stmts = + go bid stmts nilOL + where + go bid [] instrs = return (instrs,bid) + go bid (s:stmts) instrs = do + (instrs',bid') <- stmtToInstrs bid s + -- If the statement introduced a new block, we use that one + let newBid = fromMaybe bid bid' + go newBid stmts (instrs `appOL` instrs') -- | `bid` refers to the current block and is used to update the CFG -- if new blocks are inserted in the control flow. -stmtToInstrs :: BlockId -> CmmNode e x -> NatM InstrBlock +-- See Note [Keeping track of the current block] for more details. +stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in. + -> CmmNode e x + -> NatM (InstrBlock, Maybe BlockId) + -- ^ Instructions, and bid of new block if successive + -- statements are placed in a different basic block. 
stmtToInstrs bid stmt = do dflags <- getDynFlags is32Bit <- is32BitPlatform case stmt of - CmmComment s -> return (unitOL (COMMENT s)) - CmmTick {} -> return nilOL - - CmmUnwind regs -> do - let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable - to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr) - case foldMap to_unwind_entry regs of - tbl | M.null tbl -> return nilOL - | otherwise -> do - lbl <- mkAsmTempLabel <$> getUniqueM - return $ unitOL $ UNWIND lbl tbl - - CmmAssign reg src - | isFloatType ty -> assignReg_FltCode format reg src - | is32Bit && isWord64 ty -> assignReg_I64Code reg src - | otherwise -> assignReg_IntCode format reg src - where ty = cmmRegType dflags reg - format = cmmTypeFormat ty - - CmmStore addr src - | isFloatType ty -> assignMem_FltCode format addr src - | is32Bit && isWord64 ty -> assignMem_I64Code addr src - | otherwise -> assignMem_IntCode format addr src - where ty = cmmExprType dflags src - format = cmmTypeFormat ty - CmmUnsafeForeignCall target result_regs args -> genCCall dflags is32Bit target result_regs args bid - CmmBranch id -> return $ genBranch id - - --We try to arrange blocks such that the likely branch is the fallthrough - --in CmmContFlowOpt. So we can assume the condition is likely false here. - CmmCondBranch arg true false _ -> genCondBranch bid true false arg - CmmSwitch arg ids -> do dflags <- getDynFlags - genSwitch dflags arg ids - CmmCall { cml_target = arg - , cml_args_regs = gregs } -> do - dflags <- getDynFlags - genJump arg (jumpRegs dflags gregs) - _ -> - panic "stmtToInstrs: statement should have been cps'd away" + _ -> (,Nothing) <$> case stmt of + CmmComment s -> return (unitOL (COMMENT s)) + CmmTick {} -> return nilOL + + CmmUnwind regs -> do + let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable + to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr) + case foldMap to_unwind_entry regs of + tbl | M.null tbl -> return nilOL + | otherwise -> do + lbl <- mkAsmTempLabel <$> getUniqueM + return $ unitOL $ UNWIND lbl tbl + + CmmAssign reg src + | isFloatType ty -> assignReg_FltCode format reg src + | is32Bit && isWord64 ty -> assignReg_I64Code reg src + | otherwise -> assignReg_IntCode format reg src + where ty = cmmRegType dflags reg + format = cmmTypeFormat ty + + CmmStore addr src + | isFloatType ty -> assignMem_FltCode format addr src + | is32Bit && isWord64 ty -> assignMem_I64Code addr src + | otherwise -> assignMem_IntCode format addr src + where ty = cmmExprType dflags src + format = cmmTypeFormat ty + + CmmBranch id -> return $ genBranch id + + --We try to arrange blocks such that the likely branch is the fallthrough + --in CmmContFlowOpt. So we can assume the condition is likely false here. 
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg + CmmSwitch arg ids -> do dflags <- getDynFlags + genSwitch dflags arg ids + CmmCall { cml_target = arg + , cml_args_regs = gregs } -> do + dflags <- getDynFlags + genJump arg (jumpRegs dflags gregs) + _ -> + panic "stmtToInstrs: statement should have been cps'd away" jumpRegs :: DynFlags -> [GlobalReg] -> [Reg] @@ -1772,6 +1902,109 @@ genCondBranch' _ bid id false bool = do updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false) return (cond_code `appOL` code) +{- Note [Introducing cfg edges inside basic blocks] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + During instruction selection a statement `s` + in a block B with control of the sort: B -> C + will sometimes result in control + flow of the sort: + + ┌ < ┐ + v ^ + B -> B1 ┴ -> C + + as is the case for some atomic operations. + + Now to keep the CFG in sync when introducing B1 we clearly + want to insert it between B and C. However there is + a catch when we have to deal with self loops. + + We might start with code and a CFG of these forms: + + loop: + stmt1 ┌ < ┐ + .... v ^ + stmtX loop ┘ + stmtY + .... + goto loop: + + Now we introduce B1: + ┌ ─ ─ ─ ─ ─┐ + loop: │ ┌ < ┐ │ + instrs v │ │ ^ + .... loop ┴ B1 ┴ ┘ + instrsFromX + stmtY + goto loop: + + This is simple, all outgoing edges from loop now simply + start from B1 instead and the code generator knows which + new edges it introduced for the self loop of B1. + + Disaster strikes if the statement Y follows the same pattern. + If we apply the same rule that all outgoing edges change then + we end up with: + + loop ─> B1 ─> B2 ┬─┐ + │ │ └─<┤ │ + │ └───<───┘ │ + └───────<────────┘ + + This is problematic. The edge B1->B1 is modified as expected. + However the modification is wrong! + + The assembly in this case looked like this: + + _loop: + <instrs> + _B1: + ... + cmpxchgq ... + jne _B1 + <instrs> + <end _B1> + _B2: + ... + cmpxchgq ... + jne _B2 + <instrs> + jmp loop + + There is no edge _B2 -> _B1 here. It's still a self loop onto _B1. + + The problem here is that really B1 should be two basic blocks. + Otherwise we have control flow in the *middle* of a basic block. + A contradiction! + + So to account for this we add yet another basic block marker: + + _B: + <instrs> + _B1: + ... + cmpxchgq ... + jne _B1 + jmp _B1' + _B1': + <instrs> + <end _B1> + _B2: + ... + + Now when inserting B2 we will only look at the outgoing edges of B1' and + everything will work out nicely. + + You might also wonder why we don't insert jumps at the end of _B1'. There is + no way another block ends up jumping to the labels _B1 or _B2 since they are + essentially invisible to other blocks. View them as control flow labels local + to the basic block if you'd like. + + Not doing this ultimately caused (part 2 of) #17334. +-} + + -- ----------------------------------------------------------------------------- -- Generating C calls @@ -1789,14 +2022,168 @@ genCCall -> [CmmFormal] -- where to put the result -> [CmmActual] -- arguments (of mixed type) -> BlockId -- The block we are in - -> NatM InstrBlock + -> NatM (InstrBlock, Maybe BlockId) + +-- First we deal with cases which might introduce new blocks in the stream. 
+ +genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) + [dst] [addr, n] bid = do + use_sse2 <- sse2Enabled + Amode amode addr_code <- + if amop `elem` [AMO_Add, AMO_Sub] + then getAmode addr + else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg + arg <- getNewRegNat format + arg_code <- getAnyReg n + let platform = targetPlatform dflags + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + (code, lbl) <- op_code dst_r arg amode + return (addr_code `appOL` arg_code arg `appOL` code, Just lbl) + where + -- Code for the operation + op_code :: Reg -- Destination reg + -> Reg -- Register containing argument + -> AddrMode -- Address of location to mutate + -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId + op_code dst_r arg amode = case amop of + -- In the common case where dst_r is a virtual register the + -- final move should go away, because it's the last use of arg + -- and the first use of dst_r. + AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg) + , LOCK (XADD format (OpReg arg) (OpAddr amode)) + , MOV format (OpReg arg) (OpReg dst_r) + ], bid) + -- In these cases we need a new block id, and have to return it so + -- that later instruction selection can reference it. + AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) + AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst + , NOT format dst + ]) + AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) + AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) + where + -- Simulate operation that lacks a dedicated instruction using + -- cmpxchg. + cmpxchg_code :: (Operand -> Operand -> OrdList Instr) + -> NatM (OrdList Instr, BlockId) + cmpxchg_code instrs = do + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + tmp <- getNewRegNat format + + --Record inserted blocks + -- We turn A -> B into A -> A' -> A'' -> B + -- with a self loop on A'. + addImmediateSuccessorNat bid lbl1 + addImmediateSuccessorNat lbl1 lbl2 + updateCfgNat (addWeightEdge lbl1 lbl1 0) + + return $ (toOL + [ MOV format (OpAddr amode) (OpReg eax) + , JXX ALWAYS lbl1 + , NEWBLOCK lbl1 + -- Keep old value so we can return it: + , MOV format (OpReg eax) (OpReg dst_r) + , MOV format (OpReg eax) (OpReg tmp) + ] + `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL + [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) + , JXX NE lbl1 + -- See Note [Introducing cfg edges inside basic blocks] + -- why this basic block is required. + , JXX ALWAYS lbl2 + , NEWBLOCK lbl2 + ], + lbl2) + format = intFormat width + +genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid + | is32Bit, width == W64 = do + ChildCode64 vcode rlo <- iselExpr64 src + use_sse2 <- sse2Enabled + let rhi = getHiVRegFromLo rlo + dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + lbl1 <- getBlockIdNat + lbl2 <- getBlockIdNat + let format = if width == W8 then II16 else intFormat width + tmp_r <- getNewRegNat format + + -- New CFG Edges: + -- bid -> lbl2 + -- bid -> lbl1 -> lbl2 + -- We also changes edges originating at bid to start at lbl2 instead. + updateCfgNat (addWeightEdge bid lbl1 110 . + addWeightEdge lbl1 lbl2 110 . + addImmediateSuccessor bid lbl2) + + -- The following instruction sequence corresponds to the pseudo-code + -- + -- if (src) { + -- dst = src.lo32 ? 
BSF(src.lo32) : (BSF(src.hi32) + 32); + -- } else { + -- dst = 64; + -- } + let instrs = vcode `appOL` toOL + ([ MOV II32 (OpReg rhi) (OpReg tmp_r) + , OR II32 (OpReg rlo) (OpReg tmp_r) + , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) + , JXX EQQ lbl2 + , JXX ALWAYS lbl1 + + , NEWBLOCK lbl1 + , BSF II32 (OpReg rhi) dst_r + , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) + , BSF II32 (OpReg rlo) tmp_r + , CMOV NE II32 (OpReg tmp_r) dst_r + , JXX ALWAYS lbl2 + + , NEWBLOCK lbl2 + ]) + return (instrs, Just lbl2) + + | otherwise = do + code_src <- getAnyReg src + use_sse2 <- sse2Enabled + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) + + -- The following insn sequence makes sure 'ctz 0' has a defined value. + -- starting with Haswell, one could use the TZCNT insn instead. + let format = if width == W8 then II16 else intFormat width + src_r <- getNewRegNat format + tmp_r <- getNewRegNat format + let instrs = code_src src_r `appOL` toOL + ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ + [ BSF format (OpReg src_r) tmp_r + , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) + , CMOV NE format (OpReg tmp_r) dst_r + ]) -- NB: We don't need to zero-extend the result for the + -- W8/W16 cases because the 'MOV' insn already + -- took care of implicitly clearing the upper bits + return (instrs, Nothing) + where + bw = widthInBits width + platform = targetPlatform dflags + +genCCall dflags bits mop dst args bid = do + instr <- genCCall' dflags bits mop dst args bid + return (instr, Nothing) --- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +-- genCCall' handles cases not introducing new code blocks. +genCCall' + :: DynFlags + -> Bool -- 32 bit platform? + -> ForeignTarget -- function to call + -> [CmmFormal] -- where to put the result + -> [CmmActual] -- arguments (of mixed type) + -> BlockId -- The block we are in + -> NatM InstrBlock --- Unroll memcpy calls if the source and destination pointers are at --- least DWORD aligned and the number of bytes to copy isn't too +-- Unroll memcpy calls if the number of bytes to copy isn't too -- large. Otherwise, call C's memcpy. -genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _ +genCCall' dflags is32Bit (PrimTarget (MO_Memcpy align)) _ [dst, src, CmmLit (CmmInt n _)] _ | fromInteger insns <= maxInlineMemcpyInsns dflags && align .&. 3 == 0 = do code_dst <- getAnyReg dst @@ -1843,7 +2230,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall dflags _ (PrimTarget (MO_Memset align)) _ +genCCall' dflags _ (PrimTarget (MO_Memset align)) _ [dst, CmmLit (CmmInt c _), CmmLit (CmmInt n _)] @@ -1888,14 +2275,14 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL -genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL -- barriers compile to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. 
-genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL +genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL -genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = +genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = case n of 0 -> genPrefetch src $ PREFETCH NTA format 1 -> genPrefetch src $ PREFETCH Lvl2 format @@ -1916,9 +2303,10 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) )) -- prefetch always takes an address -genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do +genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do let platform = targetPlatform dflags - let dst_r = getRegisterReg platform False (CmmLocal dst) + use_sse2 <- sse2Enabled + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) case width of W64 | is32Bit -> do ChildCode64 vcode rlo <- iselExpr64 src @@ -1938,7 +2326,7 @@ genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do where format = intFormat width -genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] args@[src] bid = do sse4_2 <- sse4_2Enabled let platform = targetPlatform dflags @@ -1964,20 +2352,21 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] args@[src, mask] bid = do let platform = targetPlatform dflags + use_sse2 <- sse2Enabled if isBmi2Enabled dflags then do code_src <- getAnyReg src code_mask <- getAnyReg mask src_r <- getNewRegNat format mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` (if width == W8 then -- The PDEP instruction doesn't take a r/m8 @@ -1997,20 +2386,21 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] +genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] args@[src, mask] bid = do let platform = targetPlatform dflags + use_sse2 <- sse2Enabled if isBmi2Enabled dflags then do code_src <- getAnyReg src code_mask <- getAnyReg mask src_r <- getNewRegNat format mask_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) + let dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) return $ code_src src_r `appOL` code_mask mask_r `appOL` (if width == W8 then -- The PEXT instruction doesn't take a r/m8 @@ -2030,19 +2420,19 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + 
genCCall' dflags is32Bit target dest_regs args bid where format = intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid +genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid | is32Bit && width == W64 = do -- Fallback to `hs_clz64` on i386 targetExpr <- cmmMakeDynamicReference dflags CallReference lbl let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid | otherwise = do code_src <- getAnyReg src @@ -2067,162 +2457,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b format = if width == W8 then II16 else intFormat width lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid - | is32Bit, width == W64 = do - ChildCode64 vcode rlo <- iselExpr64 src - let rhi = getHiVRegFromLo rlo - dst_r = getRegisterReg platform False (CmmLocal dst) - lbl1 <- getBlockIdNat - lbl2 <- getBlockIdNat - tmp_r <- getNewRegNat format - - -- New CFG Edges: - -- bid -> lbl2 - -- bid -> lbl1 -> lbl2 - -- We also changes edges originating at bid to start at lbl2 instead. - updateCfgNat (addWeightEdge bid lbl1 110 . - addWeightEdge lbl1 lbl2 110 . - addImmediateSuccessor bid lbl2) - - -- The following instruction sequence corresponds to the pseudo-code - -- - -- if (src) { - -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32); - -- } else { - -- dst = 64; - -- } - return $ vcode `appOL` toOL - ([ MOV II32 (OpReg rhi) (OpReg tmp_r) - , OR II32 (OpReg rlo) (OpReg tmp_r) - , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r) - , JXX EQQ lbl2 - , JXX ALWAYS lbl1 - - , NEWBLOCK lbl1 - , BSF II32 (OpReg rhi) dst_r - , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r) - , BSF II32 (OpReg rlo) tmp_r - , CMOV NE II32 (OpReg tmp_r) dst_r - , JXX ALWAYS lbl2 - - , NEWBLOCK lbl2 - ]) - - | otherwise = do - code_src <- getAnyReg src - src_r <- getNewRegNat format - tmp_r <- getNewRegNat format - let dst_r = getRegisterReg platform False (CmmLocal dst) - - -- The following insn sequence makes sure 'ctz 0' has a defined value. - -- starting with Haswell, one could use the TZCNT insn instead. 
- return $ code_src src_r `appOL` toOL - ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++ - [ BSF format (OpReg src_r) tmp_r - , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r) - , CMOV NE format (OpReg tmp_r) dst_r - ]) -- NB: We don't need to zero-extend the result for the - -- W8/W16 cases because the 'MOV' insn already - -- took care of implicitly clearing the upper bits - where - bw = widthInBits width - platform = targetPlatform dflags - format = if width == W8 then II16 else intFormat width - -genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do +genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do targetExpr <- cmmMakeDynamicReference dflags CallReference lbl let target = ForeignTarget targetExpr (ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn) - genCCall dflags is32Bit target dest_regs args bid + genCCall' dflags is32Bit target dest_regs args bid where lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width)) -genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) - [dst] [addr, n] bid = do - Amode amode addr_code <- - if amop `elem` [AMO_Add, AMO_Sub] - then getAmode addr - else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg - arg <- getNewRegNat format - arg_code <- getAnyReg n - use_sse2 <- sse2Enabled - let platform = targetPlatform dflags - dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) - code <- op_code dst_r arg amode - return $ addr_code `appOL` arg_code arg `appOL` code - where - -- Code for the operation - op_code :: Reg -- Destination reg - -> Reg -- Register containing argument - -> AddrMode -- Address of location to mutate - -> NatM (OrdList Instr) - op_code dst_r arg amode = case amop of - -- In the common case where dst_r is a virtual register the - -- final move should go away, because it's the last use of arg - -- and the first use of dst_r. - AMO_Add -> return $ toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ] - AMO_Sub -> return $ toOL [ NEGI format (OpReg arg) - , LOCK (XADD format (OpReg arg) (OpAddr amode)) - , MOV format (OpReg arg) (OpReg dst_r) - ] - AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst) - AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst - , NOT format dst - ]) - AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst) - AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst) - where - -- Simulate operation that lacks a dedicated instruction using - -- cmpxchg. 
- cmpxchg_code :: (Operand -> Operand -> OrdList Instr) - -> NatM (OrdList Instr) - cmpxchg_code instrs = do - lbl <- getBlockIdNat - tmp <- getNewRegNat format - - --Record inserted blocks - addImmediateSuccessorNat bid lbl - updateCfgNat (addWeightEdge lbl lbl 0) - - return $ toOL - [ MOV format (OpAddr amode) (OpReg eax) - , JXX ALWAYS lbl - , NEWBLOCK lbl - -- Keep old value so we can return it: - , MOV format (OpReg eax) (OpReg dst_r) - , MOV format (OpReg eax) (OpReg tmp) - ] - `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL - [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode)) - , JXX NE lbl - ] - - format = intFormat width - -genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do +genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do load_code <- intLoadCode (MOV (intFormat width)) addr let platform = targetPlatform dflags use_sse2 <- sse2Enabled + return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst))) -genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do +genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do code <- assignMem_IntCode (intFormat width) addr val return $ code `snocOL` MFENCE -genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do +genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do -- On x86 we don't have enough registers to use cmpxchg with a -- complicated addressing mode, so on that architecture we -- pre-compute the address first. + use_sse2 <- sse2Enabled Amode amode addr_code <- getSimpleAmode dflags is32Bit addr newval <- getNewRegNat format newval_code <- getAnyReg new oldval <- getNewRegNat format oldval_code <- getAnyReg old - use_sse2 <- sse2Enabled let platform = targetPlatform dflags dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) code = toOL @@ -2235,7 +2500,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ where format = intFormat width -genCCall _ is32Bit target dest_regs args bid = do +genCCall' _ is32Bit target dest_regs args bid = do dflags <- getDynFlags let platform = targetPlatform dflags sse2 = isSse2Enabled dflags @@ -2853,7 +3118,8 @@ outOfLineCmmOp bid mop res args let target = ForeignTarget targetExpr (ForeignConvention CCallConv [] [] CmmMayReturn) - stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) + (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args) + return instrs where -- Assume we can call these functions directly, and that they're not in a dynamic library. -- TODO: Why is this ok? Under linux this code will be in libm.so @@ -3426,10 +3692,14 @@ needLlvm = -- | This works on the invariant that all jumps in the given blocks are required. -- Starting from there we try to make a few more jumps redundant by reordering -- them. -invertCondBranches :: CFG -> LabelMap a -> [NatBasicBlock Instr] +-- We depend on the information in the CFG to do so. Without a given CFG +-- we do nothing. 
+invertCondBranches :: Maybe CFG -- ^ CFG if present + -> LabelMap a -- ^ Blocks with info tables + -> [NatBasicBlock Instr] -- ^ List of basic blocks -> [NatBasicBlock Instr] -invertCondBranches cfg keep bs = - --trace "Foo" $ +invertCondBranches Nothing _ bs = bs +invertCondBranches (Just cfg) keep bs = invert bs where invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr] @@ -3448,7 +3718,7 @@ invertCondBranches cfg keep bs = , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg -- Both jumps come from the same cmm statement , transitionSource edgeInfo1 == transitionSource edgeInfo2 - , (CmmSource cmmCondBranch) <- transitionSource edgeInfo1 + , CmmSource cmmCondBranch <- transitionSource edgeInfo1 --Int comparisons are invertable , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch diff --git a/compiler/nativeGen/X86/Instr.hs b/compiler/nativeGen/X86/Instr.hs index 9f4a25b23e..bddd4fdc74 100644 --- a/compiler/nativeGen/X86/Instr.hs +++ b/compiler/nativeGen/X86/Instr.hs @@ -325,7 +325,9 @@ data Instr [Maybe JumpDest] -- Targets of the jump table Section -- Data section jump table should be put in CLabel -- Label of jump table - | CALL (Either Imm Reg) [Reg] + -- | X86 call instruction + | CALL (Either Imm Reg) -- ^ Jump target + [Reg] -- ^ Arguments (required for register allocation) -- Other things. | CLTD Format -- sign extend %eax into %edx:%eax diff --git a/testsuite/tests/codeGen/should_compile/T17334.hs b/testsuite/tests/codeGen/should_compile/T17334.hs new file mode 100644 index 0000000000..6ad6d347e8 --- /dev/null +++ b/testsuite/tests/codeGen/should_compile/T17334.hs @@ -0,0 +1,240 @@ +-- Reproducer for T17334 +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +--Reproducer uses 64bit literals in reverseWord. 
+--It's ok to truncate those in x86 +{-# OPTIONS_GHC -Wno-overflowed-literals #-} + +module Bug (reverseInPlace) where + +import Control.Monad.ST +import Data.Bits +import GHC.Exts +import GHC.ST (ST(..)) +import Data.Kind + +reverseInPlace :: PrimMonad m => UMVector (PrimState m) Bit -> m () +reverseInPlace xs | len == 0 = pure () + | otherwise = loop 0 + where + len = ulength xs + + loop !i + | i' <= j' = do + x <- readWord xs i + y <- readWord xs j' + + writeWord xs i (reverseWord y) + writeWord xs j' (reverseWord x) + + loop i' + | i' < j = do + let w = (j - i) `shiftR` 1 + k = j - w + x <- readWord xs i + y <- readWord xs k + + writeWord xs i (meld w (reversePartialWord w y) x) + writeWord xs k (meld w (reversePartialWord w x) y) + + loop i' + | otherwise = do + let w = j - i + x <- readWord xs i + writeWord xs i (meld w (reversePartialWord w x) x) + where + !j = len - i + !i' = i + wordSize + !j' = j - wordSize +{-# SPECIALIZE reverseInPlace :: UMVector s Bit -> ST s () #-} + +newtype Bit = Bit { unBit :: Bool } + +instance Unbox Bit + +data instance UMVector s Bit = BitMVec !Int !Int !(MutableByteArray s) +data instance UVector Bit = BitVec !Int !Int !ByteArray + +readWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> m Word +readWord !(BitMVec _ 0 _) _ = pure 0 +readWord !(BitMVec off len' arr) !i' = do + let len = off + len' + i = off + i' + nMod = modWordSize i + loIx = divWordSize i + loWord <- readByteArray arr loIx + + if nMod == 0 + then pure loWord + else if loIx == divWordSize (len - 1) + then pure (loWord `unsafeShiftR` nMod) + else do + hiWord <- readByteArray arr (loIx + 1) + pure + $ (loWord `unsafeShiftR` nMod) + .|. (hiWord `unsafeShiftL` (wordSize - nMod)) +{-# SPECIALIZE readWord :: UMVector s Bit -> Int -> ST s Word #-} +{-# INLINE readWord #-} + +writeWord :: PrimMonad m => UMVector (PrimState m) Bit -> Int -> Word -> m () +writeWord !(BitMVec _ 0 _) _ _ = pure () +writeWord !(BitMVec off len' arr@(MutableByteArray mba)) !i' !x@(W# x#) = do + let len = off + len' + lenMod = modWordSize len + i = off + i' + nMod = modWordSize i + loIx@(I# loIx#) = divWordSize i + + if nMod == 0 + then if len >= i + wordSize + then primitive $ \state -> + (# atomicWriteIntArray# mba loIx# (word2Int# x#) state, () #) + else do + let W# andMask# = hiMask lenMod + W# orMask# = x .&. loMask lenMod + primitive $ \state -> + let !(# state', _ #) = fetchAndIntArray# mba loIx# (word2Int# andMask#) state in + let !(# state'', _ #) = fetchOrIntArray# mba loIx# (word2Int# orMask#) state' in + (# state'', () #) + else if loIx == divWordSize (len - 1) + then do + loWord <- readByteArray arr loIx + if lenMod == 0 + then + writeByteArray arr loIx + $ (loWord .&. loMask nMod) + .|. (x `unsafeShiftL` nMod) + else + writeByteArray arr loIx + $ (loWord .&. (loMask nMod .|. hiMask lenMod)) + .|. ((x `unsafeShiftL` nMod) .&. loMask lenMod) + else do + loWord <- readByteArray arr loIx + writeByteArray arr loIx + $ (loWord .&. loMask nMod) + .|. (x `unsafeShiftL` nMod) + hiWord <- readByteArray arr (loIx + 1) + writeByteArray arr (loIx + 1) + $ (hiWord .&. hiMask nMod) + .|. 
(x `unsafeShiftR` (wordSize - nMod)) +{-# SPECIALIZE writeWord :: UMVector s Bit -> Int -> Word -> ST s () #-} +{-# INLINE writeWord #-} + +instance GMVector UMVector Bit where + {-# INLINE basicLength #-} + basicLength (BitMVec _ n _) = n + +instance GVector UVector Bit where + +wordSize :: Int +wordSize = finiteBitSize (0 :: Word) + +lgWordSize :: Int +lgWordSize = case wordSize of + 32 -> 5 + 64 -> 6 + _ -> error "wordsToBytes: unknown architecture" + +divWordSize :: Bits a => a -> a +divWordSize x = unsafeShiftR x lgWordSize +{-# INLINE divWordSize #-} + +modWordSize :: Int -> Int +modWordSize x = x .&. (wordSize - 1) +{-# INLINE modWordSize #-} + +mask :: Int -> Word +mask b = m + where + m | b >= finiteBitSize m = complement 0 + | b < 0 = 0 + | otherwise = bit b - 1 + +meld :: Int -> Word -> Word -> Word +meld b lo hi = (lo .&. m) .|. (hi .&. complement m) where m = mask b +{-# INLINE meld #-} + +reverseWord :: Word -> Word +reverseWord x0 = x6 + where + x1 = ((x0 .&. 0x5555555555555555) `shiftL` 1) .|. ((x0 .&. 0xAAAAAAAAAAAAAAAA) `shiftR` 1) + x2 = ((x1 .&. 0x3333333333333333) `shiftL` 2) .|. ((x1 .&. 0xCCCCCCCCCCCCCCCC) `shiftR` 2) + x3 = ((x2 .&. 0x0F0F0F0F0F0F0F0F) `shiftL` 4) .|. ((x2 .&. 0xF0F0F0F0F0F0F0F0) `shiftR` 4) + x4 = ((x3 .&. 0x00FF00FF00FF00FF) `shiftL` 8) .|. ((x3 .&. 0xFF00FF00FF00FF00) `shiftR` 8) + x5 = ((x4 .&. 0x0000FFFF0000FFFF) `shiftL` 16) .|. ((x4 .&. 0xFFFF0000FFFF0000) `shiftR` 16) + x6 = ((x5 .&. 0x00000000FFFFFFFF) `shiftL` 32) .|. ((x5 .&. 0xFFFFFFFF00000000) `shiftR` 32) + +reversePartialWord :: Int -> Word -> Word +reversePartialWord n w | n >= wordSize = reverseWord w + | otherwise = reverseWord w `shiftR` (wordSize - n) + +loMask :: Int -> Word +loMask n = 1 `unsafeShiftL` n - 1 +{-# INLINE loMask #-} + +hiMask :: Int -> Word +hiMask n = complement (1 `unsafeShiftL` n - 1) +{-# INLINE hiMask #-} + +class GMVector v a where + basicLength :: v s a -> Int + +glength :: GMVector v a => v s a -> Int +{-# INLINE glength #-} +glength = basicLength + +type family GMutable (v :: Type -> Type) :: Type -> Type -> Type +class GMVector (GMutable v) a => GVector v a +data family UMVector s a +data family UVector a +class (GVector UVector a, GMVector UMVector a) => Unbox a +type instance GMutable UVector = UMVector + +ulength :: Unbox a => UMVector s a -> Int +{-# INLINE ulength #-} +ulength = glength + +data ByteArray = ByteArray ByteArray# +data MutableByteArray s = MutableByteArray (MutableByteArray# s) + +readByteArray + :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> m a +{-# INLINE readByteArray #-} +readByteArray (MutableByteArray arr#) (I# i#) + = primitive (readByteArray# arr# i#) + +writeByteArray + :: (Prim a, PrimMonad m) => MutableByteArray (PrimState m) -> Int -> a -> m () +{-# INLINE writeByteArray #-} +writeByteArray (MutableByteArray arr#) (I# i#) x + = primitive_ (writeByteArray# arr# i# x) + +class Prim a where + readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, a #) + writeByteArray# :: MutableByteArray# s -> Int# -> a -> State# s -> State# s + +instance Prim Word where + readByteArray# arr# i# s# = case readWordArray# arr# i# s# of + (# s1#, x# #) -> (# s1#, W# x# #) + writeByteArray# arr# i# (W# x#) s# = writeWordArray# arr# i# x# s# + +class Monad m => PrimMonad m where + type PrimState m + primitive :: (State# (PrimState m) -> (# State# (PrimState m), a #)) -> m a + +instance PrimMonad (ST s) where + type PrimState (ST s) = s + primitive = ST + {-# INLINE primitive #-} + +primitive_ :: PrimMonad 
m + => (State# (PrimState m) -> State# (PrimState m)) -> m () +{-# INLINE primitive_ #-} +primitive_ f = primitive (\s# -> + case f s# of + s'# -> (# s'#, () #))
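On backends that do not maintain a CFG, the patch threads the graph as Maybe CFG, and cmmNativeGen applies updates through the Maybe Applicative (pure f <*> mcfg <*> pure x), so they silently become no-ops when there is no graph. A minimal standalone illustration of that pattern, with a hypothetical helper name that does not appear in the patch:

    -- Apply a CFG update only when a CFG is actually being tracked:
    --   pure f <*> Just cfg <*> pure x  ==  Just (f cfg x)
    --   pure f <*> Nothing  <*> pure x  ==  Nothing
    updateIfPresent :: (cfg -> update -> cfg) -> Maybe cfg -> update -> Maybe cfg
    updateIfPresent f mcfg x = pure f <*> mcfg <*> pure x

    -- e.g. updateIfPresent addNodesBetween livenessCfg fixupBlocks corresponds
    -- to the `pure addNodesBetween <*> livenessCfg <*> pure cfgRegAllocUpdates`
    -- call in cmmNativeGen above.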