30 files changed, 3377 insertions, 1777 deletions
diff --git a/compiler/cmm/BlockId.hs b/compiler/cmm/BlockId.hs
index feeacb553d..d5a8e045bf 100644
--- a/compiler/cmm/BlockId.hs
+++ b/compiler/cmm/BlockId.hs
@@ -31,15 +31,9 @@ compilation unit in which it appears.
 
 type BlockId = Hoopl.Label
 
-instance Uniquable BlockId where
-  getUnique label = getUnique (uniqueToInt $ lblToUnique label)
-
 mkBlockId :: Unique -> BlockId
 mkBlockId unique = uniqueToLbl $ intToUnique $ getKey unique
 
-instance Outputable BlockId where
-  ppr label = ppr (getUnique label)
-
 retPtLbl :: BlockId -> CLabel
 retPtLbl label = mkReturnPtLabel $ getUnique label
 
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index f1318c1dc9..d70fd8c835 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -69,8 +69,6 @@ type CmmGroup = GenCmmGroup CmmStatics CmmTopInfo CmmGraph
 --   (a) C--, i.e. populated with various C-- constructs
 --       (Cmm and RawCmm in OldCmm.hs)
 --   (b) Native code, populated with data/instructions
---
--- A second family of instances based on Hoopl is in Cmm.hs.
 
 -- | A top-level chunk, abstracted over the type of the contents of
 -- the basic blocks (Cmm or instructions are the likely instantiations).
@@ -103,11 +101,15 @@ type CmmBwdRewrite f = BwdRewrite FuelUniqSM CmmNode f
 --     Info Tables
 -----------------------------------------------------------------------------
 
-data CmmTopInfo   = TopInfo {info_tbl :: CmmInfoTable, stack_info :: CmmStackInfo}
+data CmmTopInfo   = TopInfo { info_tbl :: CmmInfoTable
+                            , stack_info :: CmmStackInfo }
 
 data CmmStackInfo
    = StackInfo {
-       arg_space :: ByteOff,            -- XXX: comment?
+       arg_space :: ByteOff,
+               -- number of bytes of arguments on the stack on entry to the
+               -- the proc.  This is filled in by StgCmm.codeGen, and used
+               -- by the stack allocator later.
        updfr_space :: Maybe ByteOff     -- XXX: comment?
    }
 
diff --git a/compiler/cmm/CmmBuildInfoTables.hs b/compiler/cmm/CmmBuildInfoTables.hs
index 011947f55d..2378988b68 100644
--- a/compiler/cmm/CmmBuildInfoTables.hs
+++ b/compiler/cmm/CmmBuildInfoTables.hs
@@ -14,12 +14,10 @@
 {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
 module CmmBuildInfoTables
     ( CAFSet, CAFEnv, cafAnal, localCAFInfo, mkTopCAFInfo
-    , setInfoTableSRT, setInfoTableStackMap
+    , setInfoTableSRT
     , TopSRT, emptySRT, srtToData
     , bundleCAFs
-    , lowerSafeForeignCalls
-    , cafTransfers, liveSlotTransfers
-    , mkLiveness )
+    , cafTransfers )
 where
 
 #include "HsVersions.h"
@@ -39,7 +37,6 @@ import Bitmap
 import CLabel
 import Cmm
 import CmmUtils
-import CmmStackLayout
 import Module
 import FastString
 import ForeignCall
@@ -54,129 +51,25 @@ import Outputable
 import SMRep
 import UniqSupply
 
-import Compiler.Hoopl
+import Hoopl
 
 import Data.Map (Map)
 import qualified Data.Map as Map
+import Data.Set (Set)
+import qualified Data.Set as Set
 import qualified FiniteMap as Map
 
+#if __GLASGOW_HASKELL__ < 704
+foldSet = Set.fold
+#else
+foldSet = Set.foldr
+#endif
+
 ----------------------------------------------------------------
 -- Building InfoTables
 
 
 -----------------------------------------------------------------------
--- Stack Maps
-
--- Given a block ID, we return a representation of the layout of the stack,
--- as suspended before entering that block.
--- (For a return site to a function call, the layout does not include the
---  parameter passing area (or the "return address" on the stack)).
--- If the element is `Nothing`, then it represents a word of the stack that
--- does not contain a live pointer.
--- If the element is `Just` a register, then it represents a live spill slot
--- for a pointer; we assume that a pointer is the size of a word.
--- The head of the list represents the young end of the stack where the infotable
--- pointer for the block `Bid` is stored.
--- The infotable pointer itself is not included in the list.
--- Call areas are also excluded from the list: besides the stuff in the update
--- frame (and the return infotable), call areas should never be live across
--- function calls.
-
--- RTS Invariant: All pointers must be word-aligned because each bit in the bitmap
--- represents a word. Consequently, we have to be careful when we see a live slot
--- on the stack: if we have packed multiple sub-word values into a word,
--- we have to make sure that we only mark the entire word as a non-pointer.
-
--- Also, don't forget to stop at the old end of the stack (oldByte),
--- which may differ depending on whether there is an update frame.
-
-type RegSlotInfo
-   = ( Int	  -- Offset from oldest byte of Old area
-     , LocalReg   -- The register
-     , Int)       -- Width of the register
-
-live_ptrs :: ByteOff -> BlockEnv SubAreaSet -> AreaMap -> BlockId -> StackLayout
-live_ptrs oldByte slotEnv areaMap bid =
-  -- pprTrace "live_ptrs for" (ppr bid <+> text (show oldByte ++ "-" ++ show youngByte) <+>
-  --                           ppr liveSlots) $
-  -- pprTrace ("stack layout for " ++ show bid ++ ": ") (ppr res) $ res
-  res
-  where 
-        res = mkLiveness (reverse $ slotsToList youngByte liveSlots [])
- 
-        slotsToList :: Int -> [RegSlotInfo] -> [Maybe LocalReg] -> [Maybe LocalReg]
-        -- n starts at youngByte and is decremented down to oldByte
-	-- Returns a list, one element per word, with 
-	--    (Just r) meaning 'pointer register r is saved here', 
-	--    Nothing  meaning 'non-pointer or empty'
-
-        slotsToList n [] results | n == oldByte = results -- at old end of stack frame
-
-        slotsToList n (s : _) _  | n == oldByte =
-          pprPanic "slot left off live_ptrs" (ppr s <+> ppr oldByte <+>
-               ppr n <+> ppr liveSlots <+> ppr youngByte)
-
-        slotsToList n _ _ | n < oldByte =
-          panic "stack slots not allocated on word boundaries?"
-
-        slotsToList n l@((n', r, w) : rst) results =
-          if n == (n' + w) then -- slot's young byte is at n
-            ASSERT (not (isPtr r) ||
-                    (n `mod` wORD_SIZE == 0 && w == wORD_SIZE)) -- ptrs must be aligned
-            slotsToList next (dropWhile (non_ptr_younger_than next) rst)
-                        (stack_rep : results)
-          else slotsToList next (dropWhile (non_ptr_younger_than next) l)
-                           (Nothing : results)
-          where next = n - wORD_SIZE
-                stack_rep = if isPtr r then Just r else Nothing
-
-        slotsToList n [] results = slotsToList (n - wORD_SIZE) [] (Nothing : results)
-
-        non_ptr_younger_than next (n', r, w) =
-          n' + w > next &&
-            ASSERT (not (isPtr r))
-            True
-        isPtr = isGcPtrType . localRegType
-
-        liveSlots :: [RegSlotInfo]
-        liveSlots = sortBy (\ (off,_,_) (off',_,_) -> compare off' off)
-                           (Map.foldRightWithKey (\_ -> flip $ foldl add_slot) [] slots)
-                    
-        add_slot :: [RegSlotInfo] -> SubArea -> [RegSlotInfo]
-        add_slot rst (a@(RegSlot r@(LocalReg _ ty)), off, w) = 
-          if off == w && widthInBytes (typeWidth ty) == w then
-            (expectJust "add_slot" (Map.lookup a areaMap), r, w) : rst
-          else panic "live_ptrs: only part of a variable live at a proc point"
-        add_slot rst (CallArea Old, _, _) =
-          rst -- the update frame (or return infotable) should be live
-              -- would be nice to check that only that part of the callarea is live...
-        add_slot rst ((CallArea _), _, _) =
-          rst
-          -- JD: THIS ISN'T CURRENTLY A CORRECTNESS PROBLEM, BUT WE SHOULD REALLY
-          -- MAKE LIVENESS INFO AROUND CALLS MORE PRECISE -- FOR NOW, A 32-BIT
-          -- FLOAT PADS OUT TO 64 BITS, BUT WE ASSUME THE WHOLE PARAMETER-PASSING
-          -- AREA IS LIVE (WHICH IT ISN'T...).  WE SHOULD JUST PUT THE LIVE AREAS
-          -- IN THE CALL NODES, WHICH SHOULD EVENTUALLY HAVE LIVE REGISTER AS WELL,
-          -- SO IT'S ALL GOING IN THE SAME DIRECTION.
-          -- pprPanic "CallAreas must not be live across function calls" (ppr bid <+> ppr c)
-
-        slots :: SubAreaSet	 -- The SubAreaSet for 'bid'
-        slots = expectJust "live_ptrs slots" $ mapLookup bid slotEnv
-        youngByte = expectJust "live_ptrs bid_pos" $ Map.lookup (CallArea (Young bid)) areaMap
-
--- Construct the stack maps for a procedure _if_ it needs an infotable.
--- When wouldn't a procedure need an infotable? If it is a procpoint that
--- is not the successor of a call.
-setInfoTableStackMap :: SlotEnv -> AreaMap -> CmmDecl -> CmmDecl
-setInfoTableStackMap slotEnv areaMap
-     t@(CmmProc (TopInfo {stack_info=StackInfo {updfr_space = Just updfr_off}}) _ 
-                (CmmGraph {g_entry = eid}))
-  = updInfo (const (live_ptrs updfr_off slotEnv areaMap eid)) id t
-setInfoTableStackMap _ _ t = t
-                 
-
-
------------------------------------------------------------------------
 -- SRTs
 
 -- WE NEED AN EXAMPLE HERE.
@@ -191,14 +84,14 @@ setInfoTableStackMap _ _ t = t
 -----------------------------------------------------------------------
 -- Finding the CAFs used by a procedure
 
-type CAFSet = Map CLabel ()
+type CAFSet = Set CLabel
 type CAFEnv = BlockEnv CAFSet
 
 -- First, an analysis to find live CAFs.
 cafLattice :: DataflowLattice CAFSet
-cafLattice = DataflowLattice "live cafs" Map.empty add
-  where add _ (OldFact old) (NewFact new) = case old `Map.union` new of
-                                              new' -> (changeIf $ Map.size new' > Map.size old, new')
+cafLattice = DataflowLattice "live cafs" Set.empty add
+  where add _ (OldFact old) (NewFact new) = case old `Set.union` new of
+                                              new' -> (changeIf $ Set.size new' > Set.size old, new')
 
 cafTransfers :: BwdTransfer CmmNode CAFSet
 cafTransfers = mkBTransfer3 first middle last
@@ -210,11 +103,11 @@ cafTransfers = mkBTransfer3 first middle last
                CmmLit (CmmLabelOff c _)         -> add c set
                CmmLit (CmmLabelDiffOff c1 c2 _) -> add c1 $ add c2 set
                _ -> set
-        add l s = if hasCAF l then Map.insert (toClosureLbl l) () s
+        add l s = if hasCAF l then Set.insert (toClosureLbl l) s
                               else s
 
-cafAnal :: CmmGraph -> FuelUniqSM CAFEnv
-cafAnal g = liftM snd $ dataflowPassBwd g [] $ analBwd cafLattice cafTransfers
+cafAnal :: Platform -> CmmGraph -> CAFEnv
+cafAnal g = dataflowAnalBwd g [] $ analBwd cafLattice cafTransfers
 
 -----------------------------------------------------------------------
 -- Building the SRTs
@@ -266,13 +159,13 @@ srtToData srt = [CmmData RelocatableReadOnlyData (Statics (lbl srt) tbl)]
 buildSRTs :: TopSRT -> Map CLabel CAFSet -> CAFSet ->
              FuelUniqSM (TopSRT, Maybe CmmDecl, C_SRT)
 buildSRTs topSRT topCAFMap cafs =
-  do let liftCAF lbl () z = -- get CAFs for functions without static closures
-           case Map.lookup lbl topCAFMap of Just cafs -> z `Map.union` cafs
-                                            Nothing   -> Map.insert lbl () z
+  do let liftCAF lbl z = -- get CAFs for functions without static closures
+           case Map.lookup lbl topCAFMap of Just cafs -> z `Set.union` cafs
+                                            Nothing   -> Set.insert lbl z
          -- For each label referring to a function f without a static closure,
          -- replace it with the CAFs that are reachable from f.
          sub_srt topSRT localCafs =
-           let cafs = Map.keys (Map.foldRightWithKey liftCAF Map.empty localCafs)
+           let cafs = Set.elems (foldSet liftCAF Set.empty localCafs)
                mkSRT topSRT =
                  do localSRTs <- procpointSRT (lbl topSRT) (elt_map topSRT) cafs
                     return (topSRT, localSRTs)
@@ -373,21 +266,21 @@ mkTopCAFInfo localCAFs = foldl addToTop Map.empty g
           Map.insert l (flatten env cafset) env
         addToTop env (CyclicSCC nodes) =
           let (lbls, cafsets) = unzip nodes
-              cafset  = lbls `Map.deleteList` foldl Map.union Map.empty cafsets
+              cafset  = foldr Set.delete (foldl Set.union Set.empty cafsets) lbls
           in foldl (\env l -> Map.insert l (flatten env cafset) env) env lbls
-        flatten env cafset = Map.foldRightWithKey (lookup env) Map.empty cafset
-        lookup env caf () cafset' =
-          case Map.lookup caf env of Just cafs -> Map.foldRightWithKey add cafset' cafs
-                                     Nothing -> add caf () cafset'
-        add caf () cafset' = Map.insert caf () cafset'
+        flatten env cafset = foldSet (lookup env) Set.empty cafset
+        lookup env caf cafset' =
+          case Map.lookup caf env of Just cafs -> foldSet add cafset' cafs
+                                     Nothing -> add caf cafset'
+        add caf cafset' = Set.insert caf cafset'
         g = stronglyConnCompFromEdgedVertices
-              (map (\n@(l, cafs) -> (n, l, Map.keys cafs)) localCAFs)
+              (map (\n@(l, cafs) -> (n, l, Set.elems cafs)) localCAFs)
 
 -- Bundle the CAFs used at a procpoint.
 bundleCAFs :: CAFEnv -> CmmDecl -> (CAFSet, CmmDecl)
 bundleCAFs cafEnv t@(CmmProc _ _ (CmmGraph {g_entry=entry})) =
   (expectJust "bundleCAFs" (mapLookup entry cafEnv), t)
-bundleCAFs _ t = (Map.empty, t)
+bundleCAFs _ t = (Set.empty, t)
 
 -- Construct the SRTs for the given procedure.
 setInfoTableSRT :: Map CLabel CAFSet -> TopSRT -> (CAFSet, CmmDecl) ->
@@ -418,91 +311,3 @@ updInfoTbl toVars toSrt info_tbl@(CmmInfoTable {})
                            StackRep ls -> StackRep (toVars ls)
                            other       -> other }
 updInfoTbl _ _ t@CmmNonInfoTable = t
-  
-----------------------------------------------------------------
--- Safe foreign calls: We need to insert the code that suspends and resumes
--- the thread before and after a safe foreign call.
--- Why do we do this so late in the pipeline?
--- Because we need this code to appear without interrruption: you can't rely on the
--- value of the stack pointer between the call and resetting the thread state;
--- you need to have an infotable on the young end of the stack both when
--- suspending the thread and making the foreign call.
--- All of this is much easier if we insert the suspend and resume calls here.
-
--- At the same time, we prepare for the stages of the compiler that
--- build the proc points. We have to do this at the same time because
--- the safe foreign calls need special treatment with respect to infotables.
--- A safe foreign call needs an infotable even though it isn't
--- a procpoint. The following datatype captures the information
--- needed to generate the infotables along with the Cmm data and procedures.
-
--- JD: Why not do this while splitting procedures?
-lowerSafeForeignCalls :: AreaMap -> CmmDecl -> FuelUniqSM CmmDecl
-lowerSafeForeignCalls _ t@(CmmData _ _) = return t
-lowerSafeForeignCalls areaMap (CmmProc info l g@(CmmGraph {g_entry=entry})) = do
-  let block b mblocks = mblocks >>= lowerSafeCallBlock entry areaMap b
-  blocks <- foldGraphBlocks block (return mapEmpty) g
-  return $ CmmProc info l (ofBlockMap entry blocks)
-
--- If the block ends with a safe call in the block, lower it to an unsafe
--- call (with appropriate saves and restores before and after).
-lowerSafeCallBlock :: BlockId -> AreaMap -> CmmBlock -> BlockEnv CmmBlock
-                              -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeCallBlock entry areaMap b blocks =
-  case blockToNodeList b of
-    (JustC (CmmEntry id), m, JustC l@(CmmForeignCall {})) -> lowerSafeForeignCall entry areaMap blocks id m l
-    _                                                    -> return $ insertBlock b blocks
-
--- Late in the code generator, we want to insert the code necessary
--- to lower a safe foreign call to a sequence of unsafe calls.
-lowerSafeForeignCall :: BlockId -> AreaMap -> BlockEnv CmmBlock -> BlockId -> [CmmNode O O] -> CmmNode O C
-                                -> FuelUniqSM (BlockEnv CmmBlock)
-lowerSafeForeignCall entry areaMap blocks bid m
-    (CmmForeignCall {tgt=tgt, res=rs, args=as, succ=succ, updfr = updfr_off, intrbl = intrbl}) =
- do let newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
-    -- Both 'id' and 'new_base' are KindNonPtr because they're
-    -- RTS-only objects and are not subject to garbage collection
-    id <- newTemp bWord
-    new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
-    let (caller_save, caller_load) = callerSaveVolatileRegs
-    load_tso <- newTemp gcWord -- TODO FIXME NOW
-    load_stack <- newTemp gcWord -- TODO FIXME NOW
-    let (<**>) = (M.<*>)
-    let suspendThread = foreignLbl "suspendThread"
-        resumeThread  = foreignLbl "resumeThread"
-        foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit name)))
-        suspend = saveThreadState <**>
-                  caller_save <**>
-                  mkUnsafeCall (ForeignTarget suspendThread
-                                (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
-                               [id] [CmmReg (CmmGlobal BaseReg), CmmLit (CmmInt (fromIntegral (fromEnum intrbl)) wordWidth)]
-        midCall = mkUnsafeCall tgt rs as
-        resume  = mkUnsafeCall (ForeignTarget resumeThread
-                                (ForeignConvention CCallConv [AddrHint] [AddrHint]))
-                     [new_base] [CmmReg (CmmLocal id)] <**>
-                  -- Assign the result to BaseReg: we
-                  -- might now have a different Capability!
-                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <**>
-                  caller_load <**>
-                  loadThreadState load_tso load_stack
-        -- We have to save the return value on the stack because its next use
-        -- may appear in a different procedure due to procpoint splitting...
-        saveRetVals = foldl (<**>) emptyAGraph $ map (M.mkMiddle . spill) rs
-        spill r = CmmStore (regSlot r) (CmmReg $ CmmLocal r)
-        regSlot r@(LocalReg _ _) = CmmRegOff (CmmGlobal Sp) (sp_off - offset)
-          where offset = w + expectJust "lowerForeign" (Map.lookup (RegSlot r) areaMap)
-                sp_off = wORD_SIZE + expectJust "lowerForeign" (Map.lookup (CallArea area) areaMap)
-                area = if succ == entry then Old else Young succ
-                w = widthInBytes $ typeWidth $ localRegType r
-        -- Note: The successor must be a procpoint, and we have already split,
-        --       so we use a jump, not a branch.
-        succLbl = CmmLit (CmmLabel (infoTblLbl succ))
-        jump = CmmCall { cml_target  = succLbl, cml_cont = Nothing
-                       , cml_args    = widthInBytes wordWidth ,cml_ret_args = 0
-                       , cml_ret_off = updfr_off}
-    graph' <- liftUniq $ labelAGraph bid $ catAGraphs (map M.mkMiddle m) <**>
-                                           suspend <**> midCall <**>
-                                           resume  <**> saveRetVals <**> M.mkLast jump
-    return $ blocks `mapUnion` toBlockMap graph'
-lowerSafeForeignCall _ _ _ _ _ _ = panic "lowerSafeForeignCall was passed something else"
-
diff --git a/compiler/cmm/CmmCommonBlockElim.hs b/compiler/cmm/CmmCommonBlockElim.hs
index abbfd01156..4df7304acf 100644
--- a/compiler/cmm/CmmCommonBlockElim.hs
+++ b/compiler/cmm/CmmCommonBlockElim.hs
@@ -13,22 +13,23 @@ where
 import BlockId
 import Cmm
 import CmmUtils
+import CmmContFlowOpt
 import Prelude hiding (iterate, succ, unzip, zip)
 
-import Compiler.Hoopl
+import Hoopl hiding (ChangeFlag)
 import Data.Bits
 import qualified Data.List as List
 import Data.Word
 import FastString
-import Control.Monad
 import Outputable
 import UniqFM
-import Unique
 
 my_trace :: String -> SDoc -> a -> a
 my_trace = if False then pprTrace else \_ _ a -> a
 
--- Eliminate common blocks:
+-- -----------------------------------------------------------------------------
+-- Eliminate common blocks
+
 -- If two blocks are identical except for the label on the first node,
 -- then we can eliminate one of the blocks. To ensure that the semantics
 -- of the program are preserved, we have to rewrite each predecessor of the
@@ -42,59 +43,50 @@ my_trace = if False then pprTrace else \_ _ a -> a
 
 -- TODO: Use optimization fuel
 elimCommonBlocks :: CmmGraph -> CmmGraph
-elimCommonBlocks g =
-    upd_graph g . snd $ iterate common_block reset hashed_blocks
-                                (emptyUFM, mapEmpty)
-      where hashed_blocks    = map (\b -> (hash_block b, b)) (reverse (postorderDfs g))
-            reset (_, subst) = (emptyUFM, subst)
+elimCommonBlocks g = replaceLabels env g
+  where
+     env = iterate hashed_blocks mapEmpty
+     hashed_blocks = map (\b -> (hash_block b, b)) $ postorderDfs g
 
 -- Iterate over the blocks until convergence
-iterate :: (t -> a -> (Bool, t)) -> (t -> t) -> [a] -> t -> t
-iterate upd reset blocks state =
-  case foldl upd' (False, state) blocks of
-    (True,  state') -> iterate upd reset blocks (reset state')
-    (False, state') -> state'
-  where upd' (b, s) a = let (b', s') = upd s a in (b || b', s') -- lift to track changes
+iterate :: [(HashCode,CmmBlock)] -> BlockEnv BlockId -> BlockEnv BlockId
+iterate blocks subst =
+  case foldl common_block (False, emptyUFM, subst) blocks of
+    (changed,  _, subst)
+       | changed   -> iterate blocks subst
+       | otherwise -> subst
+
+type State  = (ChangeFlag, UniqFM [CmmBlock], BlockEnv BlockId)
+
+type ChangeFlag = Bool
+type HashCode = Int
 
 -- Try to find a block that is equal (or ``common'') to b.
-type BidMap = BlockEnv BlockId
-type State  = (UniqFM [CmmBlock], BidMap)
-common_block :: (Outputable h, Uniquable h) =>  State -> (h, CmmBlock) -> (Bool, State)
-common_block (bmap, subst) (hash, b) =
+common_block :: State -> (HashCode, CmmBlock) -> State
+common_block (old_change, bmap, subst) (hash, b) =
   case lookupUFM bmap hash of
     Just bs -> case (List.find (eqBlockBodyWith (eqBid subst) b) bs,
                      mapLookup bid subst) of
                  (Just b', Nothing)                         -> addSubst b'
                  (Just b', Just b'') | entryLabel b' /= b'' -> addSubst b'
-                 _ -> (False, (addToUFM bmap hash (b : bs), subst))
-    Nothing -> (False, (addToUFM bmap hash [b], subst))
+                                     | otherwise -> (old_change, bmap, subst)
+                 _ -> (old_change, addToUFM bmap hash (b : bs), subst)
+    Nothing -> (old_change, addToUFM bmap hash [b], subst)
   where bid = entryLabel b
-        addSubst b' = my_trace "found new common block" (ppr (entryLabel b')) $
-                      (True, (bmap, mapInsert bid (entryLabel b') subst))
-
--- Given the map ``subst'' from BlockId -> BlockId, we rewrite the graph.
-upd_graph :: CmmGraph -> BidMap -> CmmGraph
-upd_graph g subst = mapGraphNodes (id, middle, last) g
-  where middle = mapExpDeep exp
-        last l = last' (mapExpDeep exp l)
-        last' :: CmmNode O C -> CmmNode O C
-        last' (CmmBranch bid)              = CmmBranch $ sub bid
-        last' (CmmCondBranch p t f)        = cond p (sub t) (sub f)
-        last' (CmmCall t (Just bid) a r o) = CmmCall t (Just $ sub bid) a r o
-        last' l@(CmmCall _ Nothing _ _ _)  = l
-        last' (CmmForeignCall t r a bid u i) = CmmForeignCall t r a (sub bid) u i
-        last' (CmmSwitch e bs)             = CmmSwitch e $ map (liftM sub) bs
-        cond p t f = if t == f then CmmBranch t else CmmCondBranch p t f
-        exp (CmmStackSlot (CallArea (Young id))       off) =
-             CmmStackSlot (CallArea (Young (sub id))) off
-        exp (CmmLit (CmmBlock id)) = CmmLit (CmmBlock (sub id))
-        exp e = e
-        sub = lookupBid subst
+        addSubst b' = my_trace "found new common block" (ppr bid <> char '=' <> ppr (entryLabel b')) $
+                      (True, bmap, mapInsert bid (entryLabel b') subst)
+
+
+-- -----------------------------------------------------------------------------
+-- Hashing and equality on blocks
+
+-- Below here is mostly boilerplate: hashing blocks ignoring labels,
+-- and comparing blocks modulo a label mapping.
 
 -- To speed up comparisons, we hash each basic block modulo labels.
 -- The hashing is a bit arbitrary (the numbers are completely arbitrary),
 -- but it should be fast and good enough.
-hash_block :: CmmBlock -> Int
+hash_block :: CmmBlock -> HashCode
 hash_block block =
   fromIntegral (foldBlockNodesB3 (hash_fst, hash_mid, hash_lst) block (0 :: Word32) .&. (0x7fffffff :: Word32))
   -- UniqFM doesn't like negative Ints
@@ -103,11 +95,11 @@ hash_block block =
         hash_lst m h = hash_node m + h `shiftL` 1
 
         hash_node :: CmmNode O x -> Word32
-        hash_node (CmmComment (FastString u _ _ _ _)) = cvt u
+        hash_node (CmmComment (FastString u _ _ _ _)) = 0 -- don't care
         hash_node (CmmAssign r e) = hash_reg r + hash_e e
         hash_node (CmmStore e e') = hash_e e + hash_e e'
         hash_node (CmmUnsafeForeignCall t _ as) = hash_tgt t + hash_list hash_e as
-        hash_node (CmmBranch _) = 23 -- would be great to hash these properly
+        hash_node (CmmBranch _) = 23 -- NB. ignore the label
         hash_node (CmmCondBranch p _ _) = hash_e p
         hash_node (CmmCall e _ _ _ _) = hash_e e
         hash_node (CmmForeignCall t _ _ _ _ _) = hash_tgt t
@@ -143,18 +135,60 @@ hash_block block =
 -- Utilities: equality and substitution on the graph.
 
 -- Given a map ``subst'' from BlockID -> BlockID, we define equality.
-eqBid :: BidMap -> BlockId -> BlockId -> Bool
+eqBid :: BlockEnv BlockId -> BlockId -> BlockId -> Bool
 eqBid subst bid bid' = lookupBid subst bid == lookupBid subst bid'
-lookupBid :: BidMap -> BlockId -> BlockId
+lookupBid :: BlockEnv BlockId -> BlockId -> BlockId
 lookupBid subst bid = case mapLookup bid subst of
                         Just bid  -> lookupBid subst bid
                         Nothing -> bid
 
--- Equality on the body of a block, modulo a function mapping block IDs to block IDs.
+-- Middle nodes and expressions can contain BlockIds, in particular in
+-- CmmStackSlot and CmmBlock, so we have to use a special equality for
+-- these.
+--
+eqMiddleWith :: (BlockId -> BlockId -> Bool)
+             -> CmmNode O O -> CmmNode O O -> Bool
+eqMiddleWith eqBid (CmmComment _) (CmmComment _) = True
+eqMiddleWith eqBid (CmmAssign r1 e1) (CmmAssign r2 e2)
+  = r1 == r2 && eqExprWith eqBid e1 e2
+eqMiddleWith eqBid (CmmStore l1 r1) (CmmStore l2 r2)
+  = eqExprWith eqBid l1 l2 && eqExprWith eqBid r1 r2
+eqMiddleWith eqBid (CmmUnsafeForeignCall t1 r1 a1)
+                   (CmmUnsafeForeignCall t2 r2 a2)
+  = t1 == t2 && r1 == r2 && and (zipWith (eqExprWith eqBid) a1 a2)
+eqMiddleWith _ _ _ = False
+
+eqExprWith :: (BlockId -> BlockId -> Bool)
+           -> CmmExpr -> CmmExpr -> Bool
+eqExprWith eqBid = eq
+ where
+  CmmLit l1          `eq` CmmLit l2          = eqLit l1 l2
+  CmmLoad e1 _       `eq` CmmLoad e2 _       = e1 `eq` e2
+  CmmReg r1          `eq` CmmReg r2          = r1==r2
+  CmmRegOff r1 i1    `eq` CmmRegOff r2 i2    = r1==r2 && i1==i2
+  CmmMachOp op1 es1  `eq` CmmMachOp op2 es2  = op1==op2 && es1 `eqs` es2
+  CmmStackSlot a1 i1 `eq` CmmStackSlot a2 i2 = eqArea a1 a2 && i1==i2
+  _e1                `eq` _e2                = False
+
+  xs `eqs` ys = and (zipWith eq xs ys)
+
+  eqLit (CmmBlock id1) (CmmBlock id2) = eqBid id1 id2
+  eqLit l1 l2 = l1 == l2
+
+  eqArea Old Old = True
+  eqArea (Young id1) (Young id2) = eqBid id1 id2
+  eqArea _ _ = False
+
+-- Equality on the body of a block, modulo a function mapping block
+-- IDs to block IDs.
 eqBlockBodyWith :: (BlockId -> BlockId -> Bool) -> CmmBlock -> CmmBlock -> Bool
-eqBlockBodyWith eqBid block block' = middles == middles' && eqLastWith eqBid last last'
-  where (_, middles , JustC last  :: MaybeC C (CmmNode O C)) = blockToNodeList block
-        (_, middles', JustC last' :: MaybeC C (CmmNode O C)) = blockToNodeList block'
+eqBlockBodyWith eqBid block block'
+  = and (zipWith (eqMiddleWith eqBid) (blockToList m) (blockToList m')) &&
+    eqLastWith eqBid l l'
+  where (_,m,l)   = blockSplit block
+        (_,m',l') = blockSplit block'
+
+
 
 eqLastWith :: (BlockId -> BlockId -> Bool) -> CmmNode O C -> CmmNode O C -> Bool
 eqLastWith eqBid (CmmBranch bid1) (CmmBranch bid2) = eqBid bid1 bid2
diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index 73ce57e93f..36e7b8ec62 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -2,19 +2,19 @@
 {-# OPTIONS_GHC -fno-warn-warnings-deprecations -fno-warn-incomplete-patterns #-}
 
 module CmmContFlowOpt
-    ( runCmmContFlowOpts
-    , removeUnreachableBlocks, replaceBranches
+    ( cmmCfgOpts
+    , cmmCfgOptsProc
+    , removeUnreachableBlocks
+    , replaceLabels
     )
 where
 
 import BlockId
 import Cmm
 import CmmUtils
-import Digraph
 import Maybes
-import Outputable
 
-import Compiler.Hoopl
+import Hoopl
 import Control.Monad
 import Prelude hiding (succ, unzip, zip)
 
@@ -24,104 +24,158 @@ import Prelude hiding (succ, unzip, zip)
 --
 -----------------------------------------------------------------------------
 
-runCmmContFlowOpts :: CmmGroup -> CmmGroup
-runCmmContFlowOpts = map (optProc cmmCfgOpts)
-
 cmmCfgOpts :: CmmGraph -> CmmGraph
-cmmCfgOpts = removeUnreachableBlocks . blockConcat . branchChainElim
-        -- Here branchChainElim can ultimately be replaced
-        -- with a more exciting combination of optimisations
+cmmCfgOpts = removeUnreachableBlocks . blockConcat
+
+cmmCfgOptsProc :: CmmDecl -> CmmDecl
+cmmCfgOptsProc = optProc cmmCfgOpts
 
 optProc :: (g -> g) -> GenCmmDecl d h g -> GenCmmDecl d h g
 optProc opt (CmmProc info lbl g) = CmmProc info lbl (opt g)
 optProc _   top                  = top
 
+
 -----------------------------------------------------------------------------
 --
--- Branch Chain Elimination
+-- Block concatenation
 --
 -----------------------------------------------------------------------------
 
--- | Remove any basic block of the form L: goto L', and replace L with
--- L' everywhere else, unless L is the successor of a call instruction
--- and L' is the entry block. You don't want to set the successor of a
--- function call to the entry block because there is no good way to
--- store both the infotables for the call and from the callee, while
--- putting the stack pointer in a consistent place.
+-- This optimisation does two things:
+--   - If a block finishes with an unconditional branch, then we may
+--     be able to concatenate the block it points to and remove the
+--     branch.  We do this either if the destination block is small
+--     (e.g. just another branch), or if this is the only jump to
+--     this particular destination block.
+--
+--   - If a block finishes in a call whose continuation block is a
+--     goto, then we can shortcut the destination, making the
+--     continuation block the destination of the goto.
+--
+-- Both transformations are improved by working from the end of the
+-- graph towards the beginning, because we may be able to perform many
+-- shortcuts in one go.
+
+
+-- We need to walk over the blocks from the end back to the
+-- beginning.  We are going to maintain the "current" graph
+-- (BlockEnv CmmBlock) as we go, and also a mapping from BlockId
+-- to BlockId, representing continuation labels that we have
+-- renamed.  This latter mapping is important because we might
+-- shortcut a CmmCall continuation.  For example:
+--
+--    Sp[0] = L
+--    call g returns to L
+--
+--    L: goto M
 --
--- JD isn't quite sure when it's safe to share continuations for different
--- function calls -- have to think about where the SP will be,
--- so we'll table that problem for now by leaving all call successors alone.
-
-branchChainElim :: CmmGraph -> CmmGraph
-branchChainElim g
-  | null lone_branch_blocks = g    -- No blocks to remove
-  | otherwise               = {- pprTrace "branchChainElim" (ppr forest) $ -}
-                              replaceLabels (mapFromList edges) g
+--    M: ...
+--
+-- So when we shortcut the L block, we need to replace not only
+-- the continuation of the call, but also references to L in the
+-- code (e.g. the assignment Sp[0] = L).  So we keep track of
+-- which labels we have renamed and apply the mapping at the end
+-- with replaceLabels.
+
+blockConcat  :: CmmGraph -> CmmGraph
+blockConcat g@CmmGraph { g_entry = entry_id }
+  = replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks
   where
-    blocks = toBlockList g
-
-    lone_branch_blocks :: [(BlockId, BlockId)]
-      -- each (L,K) is a block of the form
-      --   L : goto K
-    lone_branch_blocks = mapCatMaybes isLoneBranch blocks
-
-    call_succs = foldl add emptyBlockSet blocks
-      where add :: BlockSet -> CmmBlock -> BlockSet
-            add succs b =
-              case lastNode b of
-                (CmmCall _ (Just k) _ _ _) -> setInsert k succs
-                (CmmForeignCall {succ=k})  -> setInsert k succs
-                _                          -> succs
-
-    isLoneBranch :: CmmBlock -> Maybe (BlockId, BlockId)
-    isLoneBranch block
-      | (JustC (CmmEntry id), [], JustC (CmmBranch target)) <- blockToNodeList block
-      , not (setMember id call_succs)
-      = Just (id,target)
-      | otherwise
-      = Nothing
-
-    -- We build a graph from lone_branch_blocks (every node has only
-    -- one out edge).  Then we
-    --   - topologically sort the graph: if from A we can reach B,
-    --     then A occurs before B in the result list.
-    --   - depth-first search starting from the nodes in this list.
-    --     This gives us a [[node]], in which each list is a dependency
-    --     chain.
-    --   - for each list [a1,a2,...an] replace branches to ai with an.
-    --
-    -- This approach nicely deals with cycles by ignoring them.
-    -- Branches in a cycle will be redirected to somewhere in the
-    -- cycle, but we don't really care where.  A cycle should be dead code,
-    -- and so will be eliminated by removeUnreachableBlocks.
-    --
-    fromNode (b,_) = b
-    toNode   a     = (a,a)
-
-    all_block_ids :: LabelSet
-    all_block_ids = setFromList (map fst lone_branch_blocks)
-                      `setUnion`
-                    setFromList (map snd lone_branch_blocks)
-
-    forest = dfsTopSortG $ graphFromVerticesAndAdjacency nodes lone_branch_blocks
-        where nodes = map toNode $ setElems $ all_block_ids
-
-    edges  = [ (fromNode y, fromNode x)
-             | (x:xs) <- map reverse forest, y <- xs ]
+     -- we might be able to shortcut the entry BlockId itself
+     new_entry
+       | Just entry_blk <- mapLookup entry_id new_blocks
+       , Just dest      <- canShortcut entry_blk
+       = dest
+       | otherwise
+       = entry_id
 
-----------------------------------------------------------------
+     blocks = postorderDfs g
+
+     (new_blocks, shortcut_map) =
+           foldr maybe_concat (toBlockMap g, mapEmpty) blocks
+
+     maybe_concat :: CmmBlock
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
+     maybe_concat block unchanged@(blocks, shortcut_map)
+        | CmmBranch b' <- last
+        , Just blk' <- mapLookup b' blocks
+        , shouldConcatWith b' blk'
+        = (mapInsert bid (splice head blk') blocks, shortcut_map)
+
+        -- calls: if we can shortcut the continuation label, then
+        -- we must *also* remember to substitute for the label in the
+        -- code, because we will push it somewhere.
+        | Just b'   <- callContinuation_maybe last
+        , Just blk' <- mapLookup b' blocks
+        , Just dest <- canShortcut blk'
+        = (blocks, mapInsert b' dest shortcut_map)
+           -- replaceLabels will substitute dest for b' everywhere, later
+
+        -- non-calls: see if we can shortcut any of the successors.
+        | Nothing <- callContinuation_maybe last
+        = ( mapInsert bid (blockJoinTail head shortcut_last) blocks
+          , shortcut_map )
+
+        | otherwise
+        = (blocks, shortcut_map)
+        where
+          (head, last) = blockSplitTail block
+          bid = entryLabel block
+          shortcut_last = mapSuccessors shortcut last
+          shortcut l =
+             case mapLookup l blocks of
+               Just b | Just dest <- canShortcut b  -> dest
+               _otherwise -> l
+
+     shouldConcatWith b block
+       | num_preds b == 1    = True  -- only one predecessor: go for it
+       | okToDuplicate block = True  -- short enough to duplicate
+       | otherwise           = False
+       where num_preds bid = mapLookup bid backEdges `orElse` 0
+
+     canShortcut :: CmmBlock -> Maybe BlockId
+     canShortcut block
+       | (_, middle, CmmBranch dest) <- blockSplit block
+       , isEmptyBlock middle
+       = Just dest
+       | otherwise
+       = Nothing
+
+     backEdges :: BlockEnv Int -- number of predecessors for each block
+     backEdges = mapInsertWith (+) entry_id 1 $ -- add 1 for the entry id
+                   mapMap setSize $ predMap blocks
+
+     splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
+     splice head rest = head `blockAppend` snd (blockSplitHead rest)
+
+
+callContinuation_maybe :: CmmNode O C -> Maybe BlockId
+callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b
+callContinuation_maybe (CmmForeignCall { succ = b })   = Just b
+callContinuation_maybe _ = Nothing
+
+okToDuplicate :: CmmBlock -> Bool
+okToDuplicate block
+  = case blockSplit block of (_, m, _) -> isEmptyBlock m
+  -- cheap and cheerful; we might expand this in the future to
+  -- e.g. spot blocks that represent a single instruction or two
+
+------------------------------------------------------------------------
+-- Map over the CmmGraph, replacing each label with its mapping in the
+-- supplied BlockEnv.
 
 replaceLabels :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceLabels env =
-  replace_eid . mapGraphNodes1 txnode
+replaceLabels env g
+  | mapNull env = g
+  | otherwise   = replace_eid $ mapGraphNodes1 txnode g
    where
      replace_eid g = g {g_entry = lookup (g_entry g)}
      lookup id = mapLookup id env `orElse` id
 
      txnode :: CmmNode e x -> CmmNode e x
      txnode (CmmBranch bid)         = CmmBranch (lookup bid)
-     txnode (CmmCondBranch p t f)   = CmmCondBranch (exp p) (lookup t) (lookup f)
+     txnode (CmmCondBranch p t f)   = mkCmmCondBranch (exp p) (lookup t) (lookup f)
      txnode (CmmSwitch e arms)      = CmmSwitch (exp e) (map (liftM lookup) arms)
      txnode (CmmCall t k a res r)   = CmmCall (exp t) (liftM lookup k) a res r
      txnode fc@CmmForeignCall{}     = fc{ args = map exp (args fc)
@@ -130,90 +184,25 @@ replaceLabels env =
 
      exp :: CmmExpr -> CmmExpr
      exp (CmmLit (CmmBlock bid))                = CmmLit (CmmBlock (lookup bid))
-     exp (CmmStackSlot (CallArea (Young id)) i) = CmmStackSlot (CallArea (Young (lookup id))) i
+     exp (CmmStackSlot (Young id) i) = CmmStackSlot (Young (lookup id)) i
      exp e                                      = e
 
-
-replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
-replaceBranches env g = mapGraphNodes (id, id, last) g
-  where
-    last :: CmmNode O C -> CmmNode O C
-    last (CmmBranch id)          = CmmBranch (lookup id)
-    last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
-    last (CmmSwitch e tbl)       = CmmSwitch e (map (fmap lookup) tbl)
-    last l@(CmmCall {})          = l
-    last l@(CmmForeignCall {})   = l
-    lookup id = fmap lookup (mapLookup id env) `orElse` id
-            -- XXX: this is a recursive lookup, it follows chains until the lookup
-            -- returns Nothing, at which point we return the last BlockId
+mkCmmCondBranch :: CmmExpr -> Label -> Label -> CmmNode O C
+mkCmmCondBranch p t f = if t == f then CmmBranch t else CmmCondBranch p t f
 
 ----------------------------------------------------------------
 -- Build a map from a block to its set of predecessors. Very useful.
+
 predMap :: [CmmBlock] -> BlockEnv BlockSet
 predMap blocks = foldr add_preds mapEmpty blocks -- find the back edges
   where add_preds block env = foldl (add (entryLabel block)) env (successors block)
         add bid env b' =
           mapInsert b' (setInsert bid (mapLookup b' env `orElse` setEmpty)) env
 
------------------------------------------------------------------------------
---
--- Block concatenation
---
------------------------------------------------------------------------------
-
--- If a block B branches to a label L, L is not the entry block,
--- and L has no other predecessors,
--- then we can splice the block starting with L onto the end of B.
--- Order matters, so we work bottom up (reverse postorder DFS).
--- This optimization can be inhibited by unreachable blocks, but
--- the reverse postorder DFS returns only reachable blocks.
---
--- To ensure correctness, we have to make sure that the BlockId of the block
--- we are about to eliminate is not named in another instruction.
---
--- Note: This optimization does _not_ subsume branch chain elimination.
-
-blockConcat  :: CmmGraph -> CmmGraph
-blockConcat g@(CmmGraph {g_entry=eid}) =
-  replaceLabels concatMap $ ofBlockMap (g_entry g) blocks'
-  where
-     blocks = postorderDfs g
-
-     (blocks', concatMap) =
-           foldr maybe_concat (toBlockMap g, mapEmpty) $ blocks
-
-     maybe_concat :: CmmBlock -> (LabelMap CmmBlock, LabelMap Label) -> (LabelMap CmmBlock, LabelMap Label)
-     maybe_concat b unchanged@(blocks', concatMap) =
-       let bid = entryLabel b
-       in case blockToNodeList b of
-            (JustC h, m, JustC (CmmBranch b')) ->
-               if canConcatWith b' then
-                 (mapInsert bid (splice blocks' h m b') blocks',
-                  mapInsert b' bid concatMap)
-               else unchanged
-            _ -> unchanged
-
-     num_preds bid = liftM setSize (mapLookup bid backEdges) `orElse` 0
-
-     canConcatWith b' = b' /= eid && num_preds b' == 1
-
-     backEdges = predMap blocks
-
-     splice :: forall map n e x.
-               IsMap map =>
-               map (Block n e x) -> n C O -> [n O O] -> KeyOf map -> Block n C x
-     splice blocks' h m bid' =
-          case mapLookup bid' blocks' of
-            Nothing -> panic "unknown successor block"
-            Just block | (_, m', l') <- blockToNodeList block
-                -> blockOfNodeList (JustC h, (m ++ m'), l')
-
 
 -----------------------------------------------------------------------------
 --
 -- Removing unreachable blocks
---
------------------------------------------------------------------------------
 
 removeUnreachableBlocks :: CmmGraph -> CmmGraph
 removeUnreachableBlocks g
diff --git a/compiler/cmm/CmmCvt.hs b/compiler/cmm/CmmCvt.hs
index 80c6079aac..e72eee041c 100644
--- a/compiler/cmm/CmmCvt.hs
+++ b/compiler/cmm/CmmCvt.hs
@@ -12,29 +12,25 @@ import CmmUtils
 import qualified OldCmm as Old
 import OldPprCmm ()
 
-import Compiler.Hoopl hiding ((<*>), mkLabel, mkBranch)
+import Hoopl hiding ((<*>), mkLabel, mkBranch)
 import Data.Maybe
 import Maybes
 import Outputable
 
 cmmOfZgraph :: CmmGroup -> Old.CmmGroup
 cmmOfZgraph tops = map mapTop tops
-  where mapTop (CmmProc h l g) = CmmProc (Old.CmmInfo Nothing Nothing (info_tbl h)) l (ofZgraph g)
+  where mapTop (CmmProc h l g) = CmmProc (info_tbl h) l (ofZgraph g)
         mapTop (CmmData s ds) = CmmData s ds
 
 data ValueDirection = Arguments | Results
 
-add_hints :: Convention -> ValueDirection -> [a] -> [Old.CmmHinted a]
+add_hints :: ForeignTarget -> ValueDirection -> [a] -> [Old.CmmHinted a]
 add_hints conv vd args = zipWith Old.CmmHinted args (get_hints conv vd)
 
-get_hints :: Convention -> ValueDirection -> [ForeignHint]
-get_hints (Foreign (ForeignConvention _ hints _)) Arguments = hints
-get_hints (Foreign (ForeignConvention _ _ hints)) Results   = hints
-get_hints _other_conv                             _vd       = repeat NoHint
-
-get_conv :: ForeignTarget -> Convention
-get_conv (PrimTarget _)       = NativeNodeCall -- JD: SUSPICIOUS
-get_conv (ForeignTarget _ fc) = Foreign fc
+get_hints :: ForeignTarget -> ValueDirection -> [ForeignHint]
+get_hints (ForeignTarget _ (ForeignConvention _ hints _)) Arguments = hints
+get_hints (ForeignTarget _ (ForeignConvention _ _ hints)) Results   = hints
+get_hints (PrimTarget _) _vd = repeat NoHint
 
 cmm_target :: ForeignTarget -> Old.CmmCallTarget
 cmm_target (PrimTarget op) = Old.CmmPrim op Nothing
@@ -89,8 +85,8 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
                             CmmUnsafeForeignCall (PrimTarget MO_Touch) _ _ -> Old.CmmNop
                             CmmUnsafeForeignCall target ress args          -> 
                               Old.CmmCall (cmm_target target)
-                                          (add_hints (get_conv target) Results   ress)
-                                          (add_hints (get_conv target) Arguments args)
+                                          (add_hints target Results   ress)
+                                          (add_hints target Arguments args)
                                           Old.CmmMayReturn
 
                   last :: CmmNode O C -> () -> [Old.CmmStmt]
diff --git a/compiler/cmm/CmmExpr.hs b/compiler/cmm/CmmExpr.hs
index 6eb91e89ba..939d4b7ca9 100644
--- a/compiler/cmm/CmmExpr.hs
+++ b/compiler/cmm/CmmExpr.hs
@@ -14,11 +14,11 @@ module CmmExpr
     , GlobalReg(..), globalRegType, spReg, hpReg, spLimReg, nodeReg, node, baseReg
     , VGcPtr(..), vgcFlag 	-- Temporary!
     , DefinerOfLocalRegs, UserOfLocalRegs, foldRegsDefd, foldRegsUsed, filterRegsUsed
-    , DefinerOfSlots, UserOfSlots, foldSlotsDefd, foldSlotsUsed
     , RegSet, emptyRegSet, elemRegSet, extendRegSet, deleteFromRegSet, mkRegSet
-            , plusRegSet, minusRegSet, timesRegSet
-    , regUsedIn, regSlot
-    , Area(..), AreaId(..), SubArea, SubAreaSet, AreaMap, isStackSlotOf
+            , plusRegSet, minusRegSet, timesRegSet, sizeRegSet, nullRegSet
+            , regSetToList
+    , regUsedIn
+    , Area(..)
     , module CmmMachOp
     , module CmmType
     )
@@ -31,9 +31,10 @@ import CmmMachOp
 import BlockId
 import CLabel
 import Unique
-import UniqSet
 
 import Data.Map (Map)
+import Data.Set (Set)
+import qualified Data.Set as Set
 
 -----------------------------------------------------------------------------
 --		CmmExpr
@@ -42,11 +43,12 @@ import Data.Map (Map)
 
 data CmmExpr
   = CmmLit CmmLit               -- Literal
-  | CmmLoad CmmExpr CmmType     -- Read memory location
-  | CmmReg CmmReg		-- Contents of register
+  | CmmLoad !CmmExpr !CmmType   -- Read memory location
+  | CmmReg !CmmReg              -- Contents of register
   | CmmMachOp MachOp [CmmExpr]  -- Machine operation (+, -, *, etc.)
-  | CmmStackSlot Area Int       -- addressing expression of a stack slot
-  | CmmRegOff CmmReg Int	
+  | CmmStackSlot Area {-# UNPACK #-} !Int
+                                -- addressing expression of a stack slot
+  | CmmRegOff !CmmReg Int
 	-- CmmRegOff reg i
 	--        ** is shorthand only, meaning **
 	-- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
@@ -62,20 +64,16 @@ instance Eq CmmExpr where	-- Equality ignores the types
   _e1               == _e2               = False
 
 data CmmReg 
-  = CmmLocal  LocalReg
+  = CmmLocal  {-# UNPACK #-} !LocalReg
   | CmmGlobal GlobalReg
   deriving( Eq, Ord )
 
 -- | A stack area is either the stack slot where a variable is spilled
 -- or the stack space where function arguments and results are passed.
 data Area
-  = RegSlot  LocalReg
-  | CallArea AreaId
-  deriving (Eq, Ord)
-
-data AreaId
   = Old            -- See Note [Old Area]
-  | Young BlockId
+  | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
+                   -- See Note [Continuation BlockId] in CmmNode.
   deriving (Eq, Ord)
 
 {- Note [Old Area] 
@@ -94,15 +92,8 @@ necessarily at the young end of the Old area.
 
 End of note -}
 
-type SubArea    = (Area, Int, Int) -- area, offset, width
-type SubAreaSet = Map Area [SubArea]
-
-type AreaMap    = Map Area Int
-     -- Byte offset of the oldest byte of the Area, 
-     -- relative to the oldest byte of the Old Area
-
 data CmmLit
-  = CmmInt Integer  Width
+  = CmmInt !Integer  Width
 	-- Interpretation: the 2's complement representation of the value
 	-- is truncated to the specified size.  This is easier than trying
 	-- to keep the value within range, because we don't know whether
@@ -120,7 +111,11 @@ data CmmLit
         -- It is also used inside the NCG during when generating
         -- position-independent code. 
   | CmmLabelDiffOff CLabel CLabel Int   -- label1 - label2 + offset
-  | CmmBlock BlockId			-- Code label
+
+  | CmmBlock {-# UNPACK #-} !BlockId     -- Code label
+        -- Invariant: must be a continuation BlockId
+        -- See Note [Continuation BlockId] in CmmNode.
+
   | CmmHighStackMark -- stands for the max stack space used during a procedure
   deriving Eq
 
@@ -163,7 +158,7 @@ maybeInvertCmmExpr _ = Nothing
 -----------------------------------------------------------------------------
 
 data LocalReg
-  = LocalReg !Unique CmmType
+  = LocalReg {-# UNPACK #-} !Unique CmmType
     -- ^ Parameters:
     --   1. Identifier
     --   2. Type
@@ -189,22 +184,35 @@ localRegType (LocalReg _ rep) = rep
 -----------------------------------------------------------------------------
 
 -- | Sets of local registers
-type RegSet              =  UniqSet LocalReg
+
+-- These are used for dataflow facts, and a common operation is taking
+-- the union of two RegSets and then asking whether the union is the
+-- same as one of the inputs.  UniqSet isn't good here, because
+-- sizeUniqSet is O(n) whereas Set.size is O(1), so we use ordinary
+-- Sets.
+
+type RegSet              =  Set LocalReg
 emptyRegSet             :: RegSet
+nullRegSet              :: RegSet -> Bool
 elemRegSet              :: LocalReg -> RegSet -> Bool
 extendRegSet            :: RegSet -> LocalReg -> RegSet
 deleteFromRegSet        :: RegSet -> LocalReg -> RegSet
 mkRegSet                :: [LocalReg] -> RegSet
 minusRegSet, plusRegSet, timesRegSet :: RegSet -> RegSet -> RegSet
-
-emptyRegSet      = emptyUniqSet
-elemRegSet       = elementOfUniqSet
-extendRegSet     = addOneToUniqSet
-deleteFromRegSet = delOneFromUniqSet
-mkRegSet         = mkUniqSet
-minusRegSet      = minusUniqSet
-plusRegSet       = unionUniqSets
-timesRegSet      = intersectUniqSets
+sizeRegSet              :: RegSet -> Int
+regSetToList            :: RegSet -> [LocalReg]
+
+emptyRegSet      = Set.empty
+nullRegSet       = Set.null
+elemRegSet       = Set.member
+extendRegSet     = flip Set.insert
+deleteFromRegSet = flip Set.delete
+mkRegSet         = Set.fromList
+minusRegSet      = Set.difference
+plusRegSet       = Set.union
+timesRegSet      = Set.intersection
+sizeRegSet       = Set.size
+regSetToList     = Set.toList
 
 class UserOfLocalRegs a where
   foldRegsUsed :: (b -> LocalReg -> b) -> b -> a -> b
@@ -236,7 +244,7 @@ instance DefinerOfLocalRegs LocalReg where
     foldRegsDefd f z r = f z r
 
 instance UserOfLocalRegs RegSet where
-    foldRegsUsed f = foldUniqSet (flip f)
+    foldRegsUsed f = Set.fold (flip f)
 
 instance UserOfLocalRegs CmmExpr where
   foldRegsUsed f z e = expr z e
@@ -271,49 +279,6 @@ reg `regUsedIn` CmmMachOp _ es   = any (reg `regUsedIn`) es
 _   `regUsedIn` CmmStackSlot _ _ = False
 
 -----------------------------------------------------------------------------
---    Stack slots
------------------------------------------------------------------------------
-
-isStackSlotOf :: CmmExpr -> LocalReg -> Bool
-isStackSlotOf (CmmStackSlot (RegSlot r) _) r' = r == r'
-isStackSlotOf _ _ = False
-
-regSlot :: LocalReg -> CmmExpr
-regSlot r = CmmStackSlot (RegSlot r) (widthInBytes $ typeWidth $ localRegType r)
-
------------------------------------------------------------------------------
---    Stack slot use information for expressions and other types [_$_]
------------------------------------------------------------------------------
-
--- Fold over the area, the offset into the area, and the width of the subarea.
-class UserOfSlots a where
-  foldSlotsUsed :: (b -> SubArea -> b) -> b -> a -> b
-
-class DefinerOfSlots a where
-  foldSlotsDefd :: (b -> SubArea -> b) -> b -> a -> b
-
-instance UserOfSlots CmmExpr where
-  foldSlotsUsed f z e = expr z e
-    where expr z (CmmLit _)          = z
-          expr z (CmmLoad (CmmStackSlot a i) ty) = f z (a, i, widthInBytes $ typeWidth ty)
-          expr z (CmmLoad addr _)    = foldSlotsUsed f z addr
-          expr z (CmmReg _)          = z
-          expr z (CmmMachOp _ exprs) = foldSlotsUsed f z exprs
-          expr z (CmmRegOff _ _)     = z
-          expr z (CmmStackSlot _ _)  = z
-
-instance UserOfSlots a => UserOfSlots [a] where
-  foldSlotsUsed _ set [] = set
-  foldSlotsUsed f set (x:xs) = foldSlotsUsed f (foldSlotsUsed f set x) xs
-
-instance DefinerOfSlots a => DefinerOfSlots [a] where
-  foldSlotsDefd _ set [] = set
-  foldSlotsDefd f set (x:xs) = foldSlotsDefd f (foldSlotsDefd f set x) xs
-
-instance DefinerOfSlots SubArea where
-    foldSlotsDefd f z a = f z a
-
------------------------------------------------------------------------------
 --		Global STG registers
 -----------------------------------------------------------------------------
 
diff --git a/compiler/cmm/CmmInfo.hs b/compiler/cmm/CmmInfo.hs
index fe0c104d1c..a171faa057 100644
--- a/compiler/cmm/CmmInfo.hs
+++ b/compiler/cmm/CmmInfo.hs
@@ -19,6 +19,8 @@ import CmmUtils
 import CLabel
 import SMRep
 import Bitmap
+import Stream (Stream)
+import qualified Stream
 
 import Maybes
 import Constants
@@ -40,10 +42,16 @@ mkEmptyContInfoTable info_lbl
                  , cit_prof = NoProfilingInfo
                  , cit_srt  = NoC_SRT }
 
-cmmToRawCmm :: Platform -> [Old.CmmGroup] -> IO [Old.RawCmmGroup]
+cmmToRawCmm :: Platform -> Stream IO Old.CmmGroup ()
+            -> IO (Stream IO Old.RawCmmGroup ())
 cmmToRawCmm platform cmms
   = do { uniqs <- mkSplitUniqSupply 'i'
-       ; return (initUs_ uniqs (mapM (concatMapM (mkInfoTable platform)) cmms)) }
+       ; let do_one uniqs cmm = do
+                case initUs uniqs $ concatMapM (mkInfoTable platform) cmm of
+                  (b,uniqs') -> return (uniqs',b)
+                  -- NB. strictness fixes a space leak.  DO NOT REMOVE.
+       ; return (Stream.mapAccumL do_one uniqs cmms >> return ())
+       }
 
 -- Make a concrete info table, represented as a list of CmmStatic
 -- (it can't be simply a list of Word, because the SRT field is
@@ -82,7 +90,7 @@ mkInfoTable :: Platform -> CmmDecl -> UniqSM [RawCmmDecl]
 mkInfoTable _ (CmmData sec dat) 
   = return [CmmData sec dat]
 
-mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
+mkInfoTable platform (CmmProc info entry_label blocks)
   | CmmNonInfoTable <- info   -- Code without an info table.  Easy.
   = return [CmmProc Nothing entry_label blocks]
                                
@@ -91,7 +99,8 @@ mkInfoTable platform (CmmProc (CmmInfo _ _ info) entry_label blocks)
        ; return (top_decls  ++
                  mkInfoTableAndCode info_lbl info_cts
                                     entry_label blocks) }
-  | otherwise = panic "mkInfoTable"  -- Patern match overlap check not clever enough
+  | otherwise = panic "mkInfoTable"
+                  -- Patern match overlap check not clever enough
 
 -----------------------------------------------------
 type InfoTableContents = ( [CmmLit]	     -- The standard part
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
new file mode 100644
index 0000000000..660506e7dc
--- /dev/null
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -0,0 +1,1045 @@
+{-# LANGUAGE RecordWildCards, GADTs #-}
+module CmmLayoutStack (
+       cmmLayoutStack, setInfoTableStackMap, cmmSink
+  ) where
+
+import StgCmmUtils      ( callerSaveVolatileRegs ) -- XXX
+import StgCmmForeign    ( saveThreadState, loadThreadState ) -- XXX
+
+import Cmm
+import BlockId
+import CLabel
+import CmmUtils
+import MkGraph
+import Module
+import ForeignCall
+import CmmLive
+import CmmProcPoint
+import SMRep
+import Hoopl hiding ((<*>), mkLast, mkMiddle)
+import OptimizationFuel
+import Constants
+import UniqSupply
+import Maybes
+import UniqFM
+import Util
+
+import FastString
+import Outputable
+import Data.Map (Map)
+import qualified Data.Map as Map
+import qualified Data.Set as Set
+import Control.Monad.Fix
+import Data.Array as Array
+import Data.Bits
+import Data.List (nub, partition)
+import Control.Monad (liftM)
+
+#include "HsVersions.h"
+
+
+data StackSlot = Occupied | Empty
+     -- Occupied: a return address or part of an update frame
+
+instance Outputable StackSlot where
+  ppr Occupied = ptext (sLit "XXX")
+  ppr Empty    = ptext (sLit "---")
+
+-- All stack locations are expressed as positive byte offsets from the
+-- "base", which is defined to be the address above the return address
+-- on the stack on entry to this CmmProc.
+--
+-- Lower addresses have higher StackLocs.
+--
+type StackLoc = ByteOff
+
+{-
+ A StackMap describes the stack at any given point.  At a continuation
+ it has a particular layout, like this:
+
+         |             | <- base
+         |-------------|
+         |     ret0    | <- base + 8
+         |-------------|
+         .  upd frame  . <- base + sm_ret_off
+         |-------------|
+         |             |
+         .    vars     .
+         . (live/dead) .
+         |             | <- base + sm_sp - sm_args
+         |-------------|
+         |    ret1     |
+         .  ret vals   . <- base + sm_sp    (<--- Sp points here)
+         |-------------|
+
+Why do we include the final return address (ret0) in our stack map?  I
+have absolutely no idea, but it seems to be done that way consistently
+in the rest of the code generator, so I played along here. --SDM
+
+Note that we will be constructing an info table for the continuation
+(ret1), which needs to describe the stack down to, but not including,
+the update frame (or ret0, if there is no update frame).
+-}
+
+data StackMap = StackMap
+ {  sm_sp   :: StackLoc
+       -- ^ the offset of Sp relative to the base on entry
+       -- to this block.
+ ,  sm_args :: ByteOff
+       -- ^ the number of bytes of arguments in the area for this block
+       -- Defn: the offset of young(L) relative to the base is given by
+       -- (sm_sp - sm_args) of the StackMap for block L.
+ ,  sm_ret_off :: ByteOff
+       -- ^ Number of words of stack that we do not describe with an info
+       -- table, because it contains an update frame.
+ ,  sm_regs :: UniqFM (LocalReg,StackLoc)
+       -- ^ regs on the stack
+ }
+
+instance Outputable StackMap where
+  ppr StackMap{..} =
+     text "Sp = " <> int sm_sp $$
+     text "sm_args = " <> int sm_args $$
+     text "sm_ret_off = " <> int sm_ret_off $$
+     text "sm_regs = " <> ppr (eltsUFM sm_regs)
+
+
+cmmLayoutStack :: ProcPointSet -> ByteOff -> CmmGraph
+               -> FuelUniqSM (CmmGraph, BlockEnv StackMap)
+cmmLayoutStack procpoints entry_args
+               graph0@(CmmGraph { g_entry = entry })
+  = do
+    pprTrace "cmmLayoutStack" (ppr entry_args) $ return ()
+    (graph, liveness) <- removeDeadAssignments graph0
+    pprTrace "liveness" (ppr liveness) $ return ()
+    let blocks = postorderDfs graph
+
+    (final_stackmaps, final_high_sp, new_blocks) <- liftUniq $
+          mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) ->
+            layout procpoints liveness entry entry_args
+                   rec_stackmaps rec_high_sp blocks
+
+    new_blocks' <- liftUniq $ mapM lowerSafeForeignCall new_blocks
+
+    pprTrace ("Sp HWM") (ppr final_high_sp) $
+       return (ofBlockList entry new_blocks', final_stackmaps)
+
+
+
+layout :: BlockSet                      -- proc points
+       -> BlockEnv CmmLive              -- liveness
+       -> BlockId                       -- entry
+       -> ByteOff                       -- stack args on entry
+
+       -> BlockEnv StackMap             -- [final] stack maps
+       -> ByteOff                       -- [final] Sp high water mark
+
+       -> [CmmBlock]                    -- [in] blocks
+
+       -> UniqSM
+          ( BlockEnv StackMap           -- [out] stack maps
+          , ByteOff                     -- [out] Sp high water mark
+          , [CmmBlock]                  -- [out] new blocks
+          )
+
+layout procpoints liveness entry entry_args final_stackmaps final_hwm blocks
+  = go blocks init_stackmap entry_args []
+  where
+    (updfr, cont_info)  = collectContInfo blocks
+
+    init_stackmap = mapSingleton entry StackMap{ sm_sp   = entry_args
+                                               , sm_args = entry_args
+                                               , sm_ret_off = updfr
+                                               , sm_regs = emptyUFM
+                                               }
+
+    go [] acc_stackmaps acc_hwm acc_blocks
+      = return (acc_stackmaps, acc_hwm, acc_blocks)
+
+    go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
+      = do
+       let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0
+    
+       let stack0@StackMap { sm_sp = sp0 }
+               = mapFindWithDefault
+                     (pprPanic "no stack map for" (ppr entry_lbl))
+                     entry_lbl acc_stackmaps
+    
+       pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()
+    
+       -- (a) Update the stack map to include the effects of
+       --     assignments in this block
+       let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
+    
+       -- (b) Insert assignments to reload all the live variables if this
+       --     block is a proc point
+       let middle1 = if entry_lbl `setMember` procpoints
+                        then foldr blockCons middle0 (insertReloads stack0)
+                        else middle0
+    
+       -- (c) Look at the last node and if we are making a call or
+       --     jumping to a proc point, we must save the live
+       --     variables, adjust Sp, and construct the StackMaps for
+       --     each of the successor blocks.  See handleLastNode for
+       --     details.
+       (middle2, sp_off, last1, fixup_blocks, out)
+           <- handleLastNode procpoints liveness cont_info
+                             acc_stackmaps stack1 middle0 last0
+    
+       pprTrace "layout(out)" (ppr out) $ return ()
+
+       -- (d) Manifest Sp: run over the nodes in the block and replace
+       --     CmmStackSlot with CmmLoad from Sp with a concrete offset.
+       --
+       -- our block:
+       --    middle1          -- the original middle nodes
+       --    middle2          -- live variable saves from handleLastNode
+       --    Sp = Sp + sp_off -- Sp adjustment goes here
+       --    last1            -- the last node
+       --
+       let middle_pre = blockToList $ foldl blockSnoc middle1 middle2
+
+           sp_high = final_hwm - entry_args
+              -- The stack check value is adjusted by the Sp offset on
+              -- entry to the proc, which is entry_args.  We are
+              -- assuming that we only do a stack check at the
+              -- beginning of a proc, and we don't modify Sp before the
+              -- check.
+
+           final_blocks = manifestSp final_stackmaps stack0 sp0 sp_high entry0
+                              middle_pre sp_off last1 fixup_blocks
+
+           acc_stackmaps' = mapUnion acc_stackmaps out
+
+           hwm' = maximum (acc_hwm : (sp0 - sp_off) : map sm_sp (mapElems out))
+
+       go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks)
+
+
+-- -----------------------------------------------------------------------------
+
+-- This doesn't seem right somehow.  We need to find out whether this
+-- proc will push some update frame material at some point, so that we
+-- can avoid using that area of the stack for spilling.  The
+-- updfr_space field of the CmmProc *should* tell us, but it doesn't
+-- (I think maybe it gets filled in later when we do proc-point
+-- splitting).
+--
+-- So we'll just take the max of all the cml_ret_offs.  This could be
+-- unnecessarily pessimistic, but probably not in the code we
+-- generate.
+
+collectContInfo :: [CmmBlock] -> (ByteOff, BlockEnv ByteOff)
+collectContInfo blocks
+  = (maximum ret_offs, mapFromList (catMaybes mb_argss))
+ where
+  (mb_argss, ret_offs) = mapAndUnzip get_cont blocks
+
+  get_cont b =
+     case lastNode b of
+        CmmCall { cml_cont = Just l, .. }
+           -> (Just (l, cml_ret_args), cml_ret_off)
+        CmmForeignCall { .. }
+           -> (Just (succ, 0), updfr) -- ??
+        _other -> (Nothing, 0)
+
+
+-- -----------------------------------------------------------------------------
+-- Updating the StackMap from middle nodes
+
+-- Look for loads from stack slots, and update the StackMap.  This is
+-- purelyu for optimisation reasons, so that we can avoid saving a
+-- variable back to a different stack slot if it is already on the
+-- stack.
+--
+-- This happens a lot: for example when function arguments are passed
+-- on the stack and need to be immediately saved across a call, we
+-- want to just leave them where they are on the stack.
+--
+procMiddle :: BlockEnv StackMap -> CmmNode e x -> StackMap -> StackMap
+procMiddle stackmaps node sm
+  = case node of
+     CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _)
+       -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) }
+        where loc = getStackLoc area off stackmaps
+     CmmAssign (CmmLocal r) _other
+       -> sm { sm_regs = delFromUFM (sm_regs sm) r }
+     _other
+       -> sm
+
+getStackLoc :: Area -> ByteOff -> BlockEnv StackMap -> StackLoc
+getStackLoc Old       n _         = n
+getStackLoc (Young l) n stackmaps =
+  case mapLookup l stackmaps of
+    Nothing -> pprPanic "getStackLoc" (ppr l)
+    Just sm -> sm_sp sm - sm_args sm + n
+
+
+-- -----------------------------------------------------------------------------
+-- Handling stack allocation for a last node
+
+-- We take a single last node and turn it into:
+--
+--    C1 (some statements)
+--    Sp = Sp + N
+--    C2 (some more statements)
+--    call f()          -- the actual last node
+--
+-- plus possibly some more blocks (we may have to add some fixup code
+-- between the last node and the continuation).
+--
+-- C1: is the code for saving the variables across this last node onto
+-- the stack, if the continuation is a call or jumps to a proc point.
+--
+-- C2: if the last node is a safe foreign call, we have to inject some
+-- extra code that goes *after* the Sp adjustment.
+
+handleLastNode
+   :: ProcPointSet -> BlockEnv CmmLive -> BlockEnv ByteOff
+   -> BlockEnv StackMap -> StackMap
+   -> Block CmmNode O O
+   -> CmmNode O C
+   -> UniqSM
+      ( [CmmNode O O]      -- nodes to go *before* the Sp adjustment
+      , ByteOff            -- amount to adjust Sp
+      , CmmNode O C        -- new last node
+      , [CmmBlock]         -- new blocks
+      , BlockEnv StackMap  -- stackmaps for the continuations
+      )
+
+handleLastNode procpoints liveness cont_info stackmaps
+               stack0@StackMap { sm_sp = sp0 } middle last
+ = case last of
+    --  At each return / tail call,
+    --  adjust Sp to point to the last argument pushed, which
+    --  is cml_args, after popping any other junk from the stack.
+    CmmCall{ cml_cont = Nothing, .. } -> do
+      let sp_off = sp0 - cml_args
+      return ([], sp_off, last, [], mapEmpty)
+
+    --  At each CmmCall with a continuation:
+    CmmCall{ cml_cont = Just cont_lbl, .. } ->
+       return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off
+
+    CmmForeignCall{ succ = cont_lbl, .. } -> do
+       return $ lastCall cont_lbl wORD_SIZE wORD_SIZE (sm_ret_off stack0)
+            -- one word each for args and results: the return address
+
+    CmmBranch{..}     ->  handleProcPoints
+    CmmCondBranch{..} ->  handleProcPoints
+    CmmSwitch{..}     ->  handleProcPoints
+
+  where
+     -- Calls and ForeignCalls are handled the same way:
+     lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff
+              -> ( [CmmNode O O]
+                 , ByteOff
+                 , CmmNode O C
+                 , [CmmBlock]
+                 , BlockEnv StackMap
+                 )
+     lastCall lbl cml_args cml_ret_args cml_ret_off
+      =  ( assignments
+         , spOffsetForCall sp0 cont_stack cml_args
+         , last
+         , [] -- no new blocks
+         , mapSingleton lbl cont_stack )
+      where
+         (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off
+
+
+     prepareStack lbl cml_ret_args cml_ret_off
+       | Just cont_stack <- mapLookup lbl stackmaps
+             -- If we have already seen this continuation before, then
+             -- we just have to make the stack look the same:
+       = (fixupStack stack0 cont_stack, cont_stack)
+             -- Otherwise, we have to allocate the stack frame
+       | otherwise
+       = (save_assignments, new_cont_stack)
+       where
+        (new_cont_stack, save_assignments)
+           = setupStackFrame lbl liveness cml_ret_off cml_ret_args stack0
+
+
+     -- proc point, we have to set up the stack to match what the proc
+     -- point is expecting.
+     --
+     handleProcPoints :: UniqSM ( [CmmNode O O]
+                                , ByteOff
+                                , CmmNode O C
+                                , [CmmBlock]
+                                , BlockEnv StackMap )
+
+     handleProcPoints
+         -- Note [diamond proc point]
+       | Just l <- futureContinuation middle
+       , (nub $ filter (`setMember` procpoints) $ successors last) == [l]
+       = do
+         let cont_args = mapFindWithDefault 0 l cont_info
+             (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0)
+             out = mapFromList [ (l', cont_stack)
+                               | l' <- successors last ]
+         return ( assigs
+                , spOffsetForCall sp0 cont_stack wORD_SIZE
+                , last
+                , []
+                , out)
+
+        | otherwise = do
+          pps <- mapM handleProcPoint (successors last)
+          let lbl_map :: LabelMap Label
+              lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ]
+              fix_lbl l = mapLookup l lbl_map `orElse` l
+          return ( []
+                 , 0
+                 , mapSuccessors fix_lbl last
+                 , concat [ blk | (_,_,_,blk) <- pps ]
+                 , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] )
+
+     -- For each proc point that is a successor of this block
+     --   (a) if the proc point already has a stackmap, we need to
+     --       shuffle the current stack to make it look the same.
+     --       We have to insert a new block to make this happen.
+     --   (b) otherwise, call "allocate live stack0" to make the
+     --       stack map for the proc point
+     handleProcPoint :: BlockId
+                     -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock])
+     handleProcPoint l
+        | not (l `setMember` procpoints) = return (l, l, stack0, [])
+        | otherwise = do
+           tmp_lbl <- liftM mkBlockId $ getUniqueM
+           let
+               (stack2, assigs) =
+                  case mapLookup l stackmaps of
+                    Just pp_sm -> (pp_sm, fixupStack stack0 pp_sm)
+                    Nothing    ->
+                      pprTrace "first visit to proc point"
+                                   (ppr l <+> ppr stack1) $
+                      (stack1, assigs)
+                      where
+                       cont_args = mapFindWithDefault 0 l cont_info
+                       (stack1, assigs) =
+                           setupStackFrame l liveness (sm_ret_off stack0)
+                                                       cont_args stack0
+
+               sp_off = sp0 - sm_sp stack2
+
+               block = blockJoin (CmmEntry tmp_lbl)
+                                 (maybeAddSpAdj sp_off (blockFromList assigs))
+                                 (CmmBranch l)
+           --
+           return (l, tmp_lbl, stack2, [block])
+
+
+  
+-- Sp is currently pointing to current_sp,
+-- we want it to point to
+--    (sm_sp cont_stack - sm_args cont_stack + args)
+-- so the difference is
+--    sp0 - (sm_sp cont_stack - sm_args cont_stack + args)
+spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff
+spOffsetForCall current_sp cont_stack args
+  = current_sp - (sm_sp cont_stack - sm_args cont_stack + args)
+
+
+-- | create a sequence of assignments to establish the new StackMap,
+-- given the old StackMap.
+fixupStack :: StackMap -> StackMap -> [CmmNode O O]
+fixupStack old_stack new_stack = concatMap move new_locs
+ where
+     old_map :: Map LocalReg ByteOff
+     old_map  = Map.fromList (stackSlotRegs old_stack)
+     new_locs = stackSlotRegs new_stack
+
+     move (r,n)
+       | Just m <- Map.lookup r old_map, n == m = []
+       | otherwise = [CmmStore (CmmStackSlot Old n)
+                               (CmmReg (CmmLocal r))]
+
+
+
+setupStackFrame
+             :: BlockId                 -- label of continuation
+             -> BlockEnv CmmLive        -- liveness
+             -> ByteOff      -- updfr
+             -> ByteOff      -- bytes of return values on stack
+             -> StackMap     -- current StackMap
+             -> (StackMap, [CmmNode O O])
+
+setupStackFrame lbl liveness updfr_off ret_args stack0
+  = (cont_stack, assignments)
+  where
+      -- get the set of LocalRegs live in the continuation
+      live = mapFindWithDefault Set.empty lbl liveness
+
+      -- the stack from the base to updfr_off is off-limits.
+      -- our new stack frame contains:
+      --   * saved live variables
+      --   * the return address [young(C) + 8]
+      --   * the args for the call,
+      --     which are replaced by the return values at the return
+      --     point.
+
+      -- everything up to updfr_off is off-limits
+      -- stack1 contains updfr_off, plus everything we need to save
+      (stack1, assignments) = allocate updfr_off live stack0
+
+      -- And the Sp at the continuation is:
+      --   sm_sp stack1 + ret_args
+      cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args
+                         , sm_args = ret_args
+                         , sm_ret_off = updfr_off
+                         }
+
+
+-- -----------------------------------------------------------------------------
+-- Note [diamond proc point]
+--
+-- This special case looks for the pattern we get from a typical
+-- tagged case expression:
+--
+--    Sp[young(L1)] = L1
+--    if (R1 & 7) != 0 goto L1 else goto L2
+--  L2:
+--    call [R1] returns to L1
+--  L1: live: {y}
+--    x = R1
+--
+-- If we let the generic case handle this, we get
+--
+--    Sp[-16] = L1
+--    if (R1 & 7) != 0 goto L1a else goto L2
+--  L2:
+--    Sp[-8] = y
+--    Sp = Sp - 16
+--    call [R1] returns to L1
+--  L1a:
+--    Sp[-8] = y
+--    Sp = Sp - 16
+--    goto L1
+--  L1:
+--    x = R1
+--
+-- The code for saving the live vars is duplicated in each branch, and
+-- furthermore there is an extra jump in the fast path (assuming L1 is
+-- a proc point, which it probably is if there is a heap check).
+--
+-- So to fix this we want to set up the stack frame before the
+-- conditional jump.  How do we know when to do this, and when it is
+-- safe?  The basic idea is, when we see the assignment
+-- 
+--   Sp[young(L)] = L
+-- 
+-- we know that
+--   * we are definitely heading for L
+--   * there can be no more reads from another stack area, because young(L)
+--     overlaps with it.
+--
+-- We don't necessarily know that everything live at L is live now
+-- (some might be assigned between here and the jump to L).  So we
+-- simplify and only do the optimisation when we see
+--
+--   (1) a block containing an assignment of a return address L
+--   (2) ending in a branch where one (and only) continuation goes to L,
+--       and no other continuations go to proc points.
+--
+-- then we allocate the stack frame for L at the end of the block,
+-- before the branch.
+--
+-- We could generalise (2), but that would make it a bit more
+-- complicated to handle, and this currently catches the common case.
+
+futureContinuation :: Block CmmNode O O -> Maybe BlockId
+futureContinuation middle = foldBlockNodesB f middle Nothing
+   where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId
+         f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _
+               = Just l
+         f _ r = r
+
+-- -----------------------------------------------------------------------------
+-- Saving live registers
+
+-- | Given a set of live registers and a StackMap, save all the registers
+-- on the stack and return the new StackMap and the assignments to do
+-- the saving.
+--
+allocate :: ByteOff -> RegSet -> StackMap -> (StackMap, [CmmNode O O])
+allocate ret_off live stackmap@StackMap{ sm_sp = sp0
+                                       , sm_regs = regs0 }
+ =
+  pprTrace "allocate" (ppr live $$ ppr stackmap) $
+
+   -- we only have to save regs that are not already in a slot
+   let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live)
+       regs1   = filterUFM (\(r,_) -> elemRegSet r live) regs0
+   in
+
+   -- make a map of the stack
+   let stack = reverse $ Array.elems $
+               accumArray (\_ x -> x) Empty (1, toWords (max sp0 ret_off)) $
+                 ret_words ++ live_words
+            where ret_words =
+                   [ (x, Occupied)
+                   | x <- [ 1 .. toWords ret_off] ]
+                  live_words =
+                   [ (toWords x, Occupied)
+                   | (r,off) <- eltsUFM regs1,
+                     let w = localRegBytes r,
+                     x <- [ off, off-wORD_SIZE .. off - w + 1] ]
+   in
+
+   -- Pass over the stack: find slots to save all the new live variables,
+   -- choosing the oldest slots first (hence a foldr).
+   let
+       save slot ([], stack, n, assigs, regs) -- no more regs to save
+          = ([], slot:stack, n `plusW` 1, assigs, regs)
+       save slot (to_save, stack, n, assigs, regs)
+          = case slot of
+               Occupied ->  (to_save, Occupied:stack, n `plusW` 1, assigs, regs)
+               Empty
+                 | Just (stack', r, to_save') <-
+                       select_save to_save (slot:stack)
+                 -> let assig = CmmStore (CmmStackSlot Old n')
+                                         (CmmReg (CmmLocal r))
+                        n' = n `plusW` 1
+                   in
+                        (to_save', stack', n', assig : assigs, (r,(r,n')):regs)
+
+                 | otherwise
+                 -> (to_save, slot:stack, n `plusW` 1, assigs, regs)
+
+       -- we should do better here: right now we'll fit the smallest first,
+       -- but it would make more sense to fit the biggest first.
+       select_save :: [LocalReg] -> [StackSlot]
+                   -> Maybe ([StackSlot], LocalReg, [LocalReg])
+       select_save regs stack = go regs []
+         where go []     _no_fit = Nothing
+               go (r:rs) no_fit
+                 | Just rest <- dropEmpty words stack
+                 = Just (replicate words Occupied ++ rest, r, rs++no_fit)
+                 | otherwise
+                 = go rs (r:no_fit)
+                 where words = localRegWords r
+
+       -- fill in empty slots as much as possible
+       (still_to_save, save_stack, n, save_assigs, save_regs)
+          = foldr save (to_save, [], 0, [], []) stack
+
+       -- push any remaining live vars on the stack
+       (push_sp, push_assigs, push_regs)
+          = foldr push (n, [], []) still_to_save
+          where
+              push r (n, assigs, regs)
+                = (n', assig : assigs, (r,(r,n')) : regs)
+                where
+                  n' = n + localRegBytes r
+                  assig = CmmStore (CmmStackSlot Old n')
+                                   (CmmReg (CmmLocal r))
+
+       trim_sp
+          | not (null push_regs) = push_sp
+          | otherwise
+          = n `plusW` (- length (takeWhile isEmpty save_stack))
+
+       final_regs = regs1 `addListToUFM` push_regs
+                          `addListToUFM` save_regs
+
+   in
+  -- XXX should be an assert
+   if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else
+
+   if (trim_sp .&. (wORD_SIZE - 1)) /= 0  then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else
+
+   ( stackmap { sm_regs = final_regs , sm_sp = trim_sp }
+   , push_assigs ++ save_assigs )
+
+
+-- -----------------------------------------------------------------------------
+-- Manifesting Sp
+
+-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp.  The
+-- block looks like this:
+--
+--    middle_pre       -- the middle nodes
+--    Sp = Sp + sp_off -- Sp adjustment goes here
+--    last             -- the last node
+--
+-- And we have some extra blocks too (that don't contain Sp adjustments)
+--
+-- The adjustment for middle_pre will be different from that for
+-- middle_post, because the Sp adjustment intervenes.
+--
+manifestSp
+   :: BlockEnv StackMap  -- StackMaps for other blocks
+   -> StackMap           -- StackMap for this block
+   -> ByteOff            -- Sp on entry to the block
+   -> ByteOff            -- SpHigh
+   -> CmmNode C O        -- first node
+   -> [CmmNode O O]      -- middle
+   -> ByteOff            -- sp_off
+   -> CmmNode O C        -- last node
+   -> [CmmBlock]         -- new blocks
+   -> [CmmBlock]         -- final blocks with Sp manifest
+
+manifestSp stackmaps stack0 sp0 sp_high
+           first middle_pre sp_off last fixup_blocks
+  = final_block : fixup_blocks'
+  where
+    area_off = getAreaOff stackmaps
+
+    adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x
+    adj_pre_sp  = mapExpDeep (areaToSp sp0            sp_high area_off)
+    adj_post_sp = mapExpDeep (areaToSp (sp0 - sp_off) sp_high area_off)
+
+    final_middle = maybeAddSpAdj sp_off $
+                   blockFromList $
+                   map adj_pre_sp $
+                   elimStackStores stack0 stackmaps area_off $
+                   middle_pre
+
+    final_last    = optStackCheck (adj_post_sp last)
+
+    final_block   = blockJoin first final_middle final_last
+
+    fixup_blocks' = map (blockMapNodes3 (id, adj_post_sp, id)) fixup_blocks
+
+
+getAreaOff :: BlockEnv StackMap -> (Area -> StackLoc)
+getAreaOff _ Old = 0
+getAreaOff stackmaps (Young l) =
+  case mapLookup l stackmaps of
+    Just sm -> sm_sp sm - sm_args sm
+    Nothing -> pprPanic "getAreaOff" (ppr l)
+
+
+maybeAddSpAdj :: ByteOff -> Block CmmNode O O -> Block CmmNode O O
+maybeAddSpAdj 0      block = block
+maybeAddSpAdj sp_off block
+   = block `blockSnoc` CmmAssign spReg (cmmOffset (CmmReg spReg) sp_off)
+
+
+{-
+Sp(L) is the Sp offset on entry to block L relative to the base of the
+OLD area.
+
+SpArgs(L) is the size of the young area for L, i.e. the number of
+arguments.
+
+ - in block L, each reference to [old + N] turns into
+   [Sp + Sp(L) - N]
+
+ - in block L, each reference to [young(L') + N] turns into
+   [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
+
+ - be careful with the last node of each block: Sp has already been adjusted
+   to be Sp + Sp(L) - Sp(L')
+-}
+
+areaToSp :: ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
+areaToSp sp_old _sp_hwm area_off (CmmStackSlot area n) =
+  cmmOffset (CmmReg spReg) (sp_old - area_off area - n)
+areaToSp _ sp_hwm _ (CmmLit CmmHighStackMark) = CmmLit (mkIntCLit sp_hwm)
+areaToSp _ _ _ (CmmMachOp (MO_U_Lt _)  -- Note [null stack check]
+                      [CmmMachOp (MO_Sub _)
+                              [ CmmReg (CmmGlobal Sp)
+                              , CmmLit (CmmInt 0 _)],
+                       CmmReg (CmmGlobal SpLim)]) = CmmLit (CmmInt 0 wordWidth)
+areaToSp _ _ _ other = other
+
+-- -----------------------------------------------------------------------------
+-- Note [null stack check]
+--
+-- If the high-water Sp is zero, then we end up with
+--
+--   if (Sp - 0 < SpLim) then .. else ..
+--
+-- and possibly some dead code for the failure case.  Optimising this
+-- away depends on knowing that SpLim <= Sp, so it is really the job
+-- of the stack layout algorithm, hence we do it now.  This is also
+-- convenient because control-flow optimisation later will drop the
+-- dead code.
+
+optStackCheck :: CmmNode O C -> CmmNode O C
+optStackCheck n = -- Note [null stack check]
+ case n of
+   CmmCondBranch (CmmLit (CmmInt 0 _)) _true false -> CmmBranch false
+   other -> other
+
+
+-- -----------------------------------------------------------------------------
+
+-- | Eliminate stores of the form
+--
+--    Sp[area+n] = r
+--
+-- when we know that r is already in the same slot as Sp[area+n].  We
+-- could do this in a later optimisation pass, but that would involve
+-- a separate analysis and we already have the information to hand
+-- here.  It helps clean up some extra stack stores in common cases.
+--
+-- Note that we may have to modify the StackMap as we walk through the
+-- code using procMiddle, since an assignment to a variable in the
+-- StackMap will invalidate its mapping there.
+--
+elimStackStores :: StackMap
+                -> BlockEnv StackMap
+                -> (Area -> ByteOff)
+                -> [CmmNode O O]
+                -> [CmmNode O O]
+elimStackStores stackmap stackmaps area_off nodes
+  = go stackmap nodes
+  where
+    go _stackmap [] = []
+    go stackmap (n:ns)
+     = case n of
+         CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r))
+            | Just (_,off) <- lookupUFM (sm_regs stackmap) r
+            , area_off area + m == off
+            -> pprTrace "eliminated a node!" (ppr r) $ go stackmap ns
+         _otherwise
+            -> n : go (procMiddle stackmaps n stackmap) ns
+
+
+-- -----------------------------------------------------------------------------
+-- Update info tables to include stack liveness
+
+
+setInfoTableStackMap :: BlockEnv StackMap -> CmmDecl -> CmmDecl
+setInfoTableStackMap stackmaps
+    (CmmProc top_info@TopInfo{..} l g@CmmGraph{g_entry = eid})
+  = CmmProc top_info{ info_tbl = fix_info info_tbl } l g
+  where
+    fix_info info_tbl@CmmInfoTable{ cit_rep = StackRep _ } =
+       info_tbl { cit_rep = StackRep (get_liveness eid) }
+    fix_info other = other
+
+    get_liveness :: BlockId -> Liveness
+    get_liveness lbl
+      = case mapLookup lbl stackmaps of
+          Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl)
+          Just sm -> stackMapToLiveness sm
+
+setInfoTableStackMap _ d = d
+
+
+stackMapToLiveness :: StackMap -> Liveness
+stackMapToLiveness StackMap{..} =
+   reverse $ Array.elems $
+        accumArray (\_ x -> x) True (toWords sm_ret_off + 1,
+                                     toWords (sm_sp - sm_args)) live_words
+   where
+     live_words =  [ (toWords off, False)
+                   | (r,off) <- eltsUFM sm_regs, isGcPtrType (localRegType r) ]
+
+
+-- -----------------------------------------------------------------------------
+-- Lowering safe foreign calls
+
+{-
+Note [lower safe foreign calls]
+
+We start with
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | r1 = foo(x,y,z) returns to L1
+ '-----------------------
+ L1:
+   R1 = r1 -- copyIn, inserted by mkSafeCall
+   ...
+
+the stack layout algorithm will arrange to save and reload everything
+live across the call.  Our job now is to expand the call so we get
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | SAVE_THREAD_STATE()
+ | token = suspendThread(BaseReg, interruptible)
+ | r = foo(x,y,z)
+ | BaseReg = resumeThread(token)
+ | LOAD_THREAD_STATE()
+ | R1 = r  -- copyOut
+ | jump L1
+ '-----------------------
+ L1:
+   r = R1 -- copyIn, inserted by mkSafeCall
+   ...
+
+Note the copyOut, which saves the results in the places that L1 is
+expecting them (see Note {safe foreign call convention]).
+-}
+
+lowerSafeForeignCall :: CmmBlock -> UniqSM CmmBlock
+lowerSafeForeignCall block
+  | (entry, middle, CmmForeignCall { .. }) <- blockSplit block
+  = do
+    -- Both 'id' and 'new_base' are KindNonPtr because they're
+    -- RTS-only objects and are not subject to garbage collection
+    id <- newTemp bWord
+    new_base <- newTemp (cmmRegType (CmmGlobal BaseReg))
+    let (caller_save, caller_load) = callerSaveVolatileRegs
+    load_tso <- newTemp gcWord
+    load_stack <- newTemp gcWord
+    let suspend = saveThreadState <*>
+                  caller_save <*>
+                  mkMiddle (callSuspendThread id intrbl)
+        midCall = mkUnsafeCall tgt res args
+        resume  = mkMiddle (callResumeThread new_base id) <*>
+                  -- Assign the result to BaseReg: we
+                  -- might now have a different Capability!
+                  mkAssign (CmmGlobal BaseReg) (CmmReg (CmmLocal new_base)) <*>
+                  caller_load <*>
+                  loadThreadState load_tso load_stack
+        -- Note: The successor must be a procpoint, and we have already split,
+        --       so we use a jump, not a branch.
+        succLbl = CmmLit (CmmLabel (infoTblLbl succ))
+
+        (ret_args, copyout) = copyOutOflow NativeReturn Jump (Young succ)
+                                           (map (CmmReg . CmmLocal) res)
+                                           updfr (0, [])
+
+        jump = CmmCall { cml_target   = succLbl
+                       , cml_cont     = Just succ
+                       , cml_args     = widthInBytes wordWidth
+                       , cml_ret_args = ret_args
+                       , cml_ret_off  = updfr }
+
+    graph' <- lgraphOfAGraph $ suspend <*>
+                               midCall <*>
+                               resume  <*>
+                               copyout <*>
+                               mkLast jump
+
+    case toBlockList graph' of
+      [one] -> let (_, middle', last) = blockSplit one
+               in return (blockJoin entry (middle `blockAppend` middle') last)
+      _ -> panic "lowerSafeForeignCall0"
+
+  -- Block doesn't end in a safe foreign call:
+  | otherwise = return block
+
+
+foreignLbl :: FastString -> CmmExpr
+foreignLbl name = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name))
+
+newTemp :: CmmType -> UniqSM LocalReg
+newTemp rep = getUniqueM >>= \u -> return (LocalReg u rep)
+
+callSuspendThread :: LocalReg -> Bool -> CmmNode O O
+callSuspendThread id intrbl =
+  CmmUnsafeForeignCall
+       (ForeignTarget (foreignLbl (fsLit "suspendThread"))
+             (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
+       [id] [CmmReg (CmmGlobal BaseReg), CmmLit (mkIntCLit (fromEnum intrbl))]
+
+callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
+callResumeThread new_base id =
+  CmmUnsafeForeignCall
+       (ForeignTarget (foreignLbl (fsLit "resumeThread"))
+            (ForeignConvention CCallConv [AddrHint] [AddrHint]))
+       [new_base] [CmmReg (CmmLocal id)]
+
+-- -----------------------------------------------------------------------------
+
+plusW :: ByteOff -> WordOff -> ByteOff
+plusW b w = b + w * wORD_SIZE
+
+dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot]
+dropEmpty 0 ss           = Just ss
+dropEmpty n (Empty : ss) = dropEmpty (n-1) ss
+dropEmpty _ _            = Nothing
+
+isEmpty :: StackSlot -> Bool
+isEmpty Empty = True
+isEmpty _ = False
+
+localRegBytes :: LocalReg -> ByteOff
+localRegBytes r = roundUpToWords (widthInBytes (typeWidth (localRegType r)))
+
+localRegWords :: LocalReg -> WordOff
+localRegWords = toWords . localRegBytes
+
+toWords :: ByteOff -> WordOff
+toWords x = x `quot` wORD_SIZE
+
+
+insertReloads :: StackMap -> [CmmNode O O]
+insertReloads stackmap =
+   [ CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot Old sp)
+                                     (localRegType r))
+   | (r,sp) <- stackSlotRegs stackmap
+   ]
+
+
+stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)]
+stackSlotRegs sm = eltsUFM (sm_regs sm)
+
+-- -----------------------------------------------------------------------------
+
+-- If we do this *before* stack layout, we might be able to avoid
+-- saving some things across calls/procpoints.
+--
+-- *but*, that will invalidate the liveness analysis, and we'll have
+-- to re-do it.
+
+cmmSink :: CmmGraph -> FuelUniqSM CmmGraph
+cmmSink graph = do
+  let liveness = cmmLiveness graph
+  return $ cmmSink' liveness graph
+
+cmmSink' :: BlockEnv CmmLive -> CmmGraph -> CmmGraph
+cmmSink' liveness graph
+  = ofBlockList (g_entry graph) $ sink mapEmpty $ postorderDfs graph
+  where
+
+  sink :: BlockEnv [(LocalReg, CmmExpr)] -> [CmmBlock] -> [CmmBlock]
+  sink _ [] = []
+  sink sunk (b:bs) =
+    pprTrace "sink" (ppr l) $
+    blockJoin first final_middle last : sink sunk' bs
+    where
+      l = entryLabel b
+      (first, middle, last) = blockSplit b
+      (middle', assigs) = walk (blockToList middle) emptyBlock
+                               (mapFindWithDefault [] l sunk)
+
+      (dropped_last, assigs') = partition (`conflictsWithLast` last) assigs
+
+      final_middle = foldl blockSnoc middle' (toNodes dropped_last)
+
+      sunk' = mapUnion sunk $
+                 mapFromList [ (l, filt assigs' (getLive l))
+                             | l <- successors last ]
+           where
+               getLive l = mapFindWithDefault Set.empty l liveness
+               filt as live = [ (r,e) | (r,e) <- as, r `Set.member` live ]
+
+
+walk :: [CmmNode O O] -> Block CmmNode O O -> [(LocalReg, CmmExpr)]
+     -> (Block CmmNode O O, [(LocalReg, CmmExpr)])
+
+walk []     acc as = (acc, as)
+walk (n:ns) acc as
+  | Just a <- collect_it  = walk ns acc (a:as)
+  | otherwise             = walk ns (foldr (flip blockSnoc) acc (n:drop_nodes)) as'
+  where
+    collect_it = case n of
+                   CmmAssign (CmmLocal r) e@(CmmReg (CmmGlobal _)) -> Just (r,e)
+--                   CmmAssign (CmmLocal r) e@(CmmLoad addr _) |
+--                      foldRegsUsed (\b r -> False) True addr -> Just (r,e)
+                   _ -> Nothing
+
+    drop_nodes = toNodes dropped
+    (dropped, as') = partition should_drop as
+       where should_drop a = a `conflicts` n
+
+toNodes as = [ CmmAssign (CmmLocal r) rhs | (r,rhs) <- as ]
+
+-- We only sink "r = G" assignments right now, so conflicts is very simple:
+(r, rhs) `conflicts` CmmAssign reg  _  | reg `regUsedIn` rhs = True
+--(r, CmmLoad _ _) `conflicts` CmmStore _ _ = True
+(r, _)   `conflicts` node
+  = foldRegsUsed (\b r' -> r == r' || b) False node
+
+(r, _) `conflictsWithLast` node
+  = foldRegsUsed (\b r' -> r == r' || b) False node
diff --git a/compiler/cmm/CmmLint.hs b/compiler/cmm/CmmLint.hs
index 01ebac6254..cd0558616e 100644
--- a/compiler/cmm/CmmLint.hs
+++ b/compiler/cmm/CmmLint.hs
@@ -1,67 +1,71 @@
 -----------------------------------------------------------------------------
 --
--- (c) The University of Glasgow 2004-2006
+-- (c) The University of Glasgow 2011
 --
 -- CmmLint: checking the correctness of Cmm statements and expressions
 --
 -----------------------------------------------------------------------------
-
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
+{-# LANGUAGE GADTs #-}
 module CmmLint (
-  cmmLint, cmmLintTop
+    cmmLint, cmmLintDecl, cmmLintGraph
   ) where
 
+import Hoopl
+import Cmm
+import CmmUtils
+import PprCmm ()
 import BlockId
-import OldCmm
+import FastString
 import CLabel
 import Outputable
-import OldPprCmm()
 import Constants
-import FastString
-import Platform
 
 import Data.Maybe
 
+-- Things to check:
+--     - invariant on CmmBlock in CmmExpr (see comment there)
+--     - check for branches to blocks that don't exist
+--     - check types
+
 -- -----------------------------------------------------------------------------
 -- Exported entry points:
 
 cmmLint :: (Outputable d, Outputable h)
-        => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+        => GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLint tops = runCmmLint (mapM_ lintCmmDecl) tops
 
-cmmLintTop :: (Outputable d, Outputable h)
-           => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
-cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+cmmLintGraph :: CmmGraph -> Maybe SDoc
+cmmLintGraph g = runCmmLint lintCmmGraph g
 
-runCmmLint :: Outputable a
-           => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
-runCmmLint _ l p =
+runCmmLint :: Outputable a => (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint l p =
    case unCL (l p) of
-   Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
-                           nest 2 err,
-                           ptext $ sLit ("Program was:"),
-                           nest 2 (ppr p)])
-   Right _  -> Nothing
-
-lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
-lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
-  = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
-        let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
-	in  mapM_ (lintCmmBlock platform labels) blocks
-
-lintCmmDecl _ (CmmData {})
+     Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+                             nest 2 err,
+                             ptext $ sLit ("Program was:"),
+                             nest 2 (ppr p)])
+     Right _  -> Nothing
+
+lintCmmDecl :: GenCmmDecl h i CmmGraph -> CmmLint ()
+lintCmmDecl (CmmProc _ lbl g)
+  = addLintInfo (text "in proc " <> ppr lbl) $ lintCmmGraph g
+lintCmmDecl (CmmData {})
   = return ()
 
-lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
-lintCmmBlock platform labels (BasicBlock id stmts)
-  = addLintInfo (text "in basic block " <> ppr id) $
-	mapM_ (lintCmmStmt platform labels) stmts
+
+lintCmmGraph :: CmmGraph -> CmmLint ()
+lintCmmGraph g = mapM_ (lintCmmBlock labels) blocks
+  where
+       blocks = toBlockList g
+       labels = setFromList (map entryLabel blocks)
+
+
+lintCmmBlock :: BlockSet -> CmmBlock -> CmmLint ()
+lintCmmBlock labels block
+  = addLintInfo (\_ -> text "in basic block " <> ppr (entryLabel block)) $ do
+        let (_, middle, last) = blockSplit block
+        mapM_ lintCmmMiddle (blockToList middle)
+        lintCmmLast labels last
 
 -- -----------------------------------------------------------------------------
 -- lintCmmExpr
@@ -69,24 +73,24 @@ lintCmmBlock platform labels (BasicBlock id stmts)
 -- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
 -- byte/word mismatches.
 
-lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
-lintCmmExpr platform (CmmLoad expr rep) = do
-  _ <- lintCmmExpr platform expr
+lintCmmExpr :: CmmExpr -> CmmLint CmmType
+lintCmmExpr (CmmLoad expr rep) = do
+  _ <- lintCmmExpr expr
   -- Disabled, if we have the inlining phase before the lint phase,
   -- we can have funny offsets due to pointer tagging. -- EZY
   -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
   --   cmmCheckWordAddress expr
   return rep
-lintCmmExpr platform expr@(CmmMachOp op args) = do
-  tys <- mapM (lintCmmExpr platform) args
+lintCmmExpr expr@(CmmMachOp op args) = do
+  tys <- mapM lintCmmExpr args
   if map (typeWidth . cmmExprType) args == machOpArgReps op
-  	then cmmCheckMachOp op args tys
-	else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
-lintCmmExpr platform (CmmRegOff reg offset)
-  = lintCmmExpr platform (CmmMachOp (MO_Add rep)
-		[CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+        then cmmCheckMachOp op args tys
+        else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr (CmmRegOff reg offset)
+  = lintCmmExpr (CmmMachOp (MO_Add rep)
+                [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
   where rep = typeWidth (cmmRegType reg)
-lintCmmExpr _ expr =
+lintCmmExpr expr =
   return (cmmExprType expr)
 
 -- Check for some common byte/word mismatches (eg. Sp + 1)
@@ -119,43 +123,61 @@ notNodeReg :: CmmExpr -> Bool
 notNodeReg (CmmReg reg) | reg == nodeReg = False
 notNodeReg _                             = True
 
-lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
-lintCmmStmt platform labels = lint
-    where lint (CmmNop) = return ()
-          lint (CmmComment {}) = return ()
-          lint stmt@(CmmAssign reg expr) = do
-            erep <- lintCmmExpr platform expr
-	    let reg_ty = cmmRegType reg
+lintCmmMiddle :: CmmNode O O -> CmmLint ()
+lintCmmMiddle node = case node of
+  CmmComment _ -> return ()
+
+  CmmAssign reg expr -> do
+            erep <- lintCmmExpr expr
+            let reg_ty = cmmRegType reg
             if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
                 then return ()
-                else cmmLintAssignErr stmt erep reg_ty
-          lint (CmmStore l r) = do
-            _ <- lintCmmExpr platform l
-            _ <- lintCmmExpr platform r
+                else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
+
+  CmmStore l r -> do
+            _ <- lintCmmExpr l
+            _ <- lintCmmExpr r
             return ()
-          lint (CmmCall target _res args _) =
-              do lintTarget platform labels target
-                 mapM_ (lintCmmExpr platform . hintlessCmm) args
-          lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
-          lint (CmmSwitch e branches) = do
+
+  CmmUnsafeForeignCall target _formals actuals -> do
+            lintTarget target
+            mapM_ lintCmmExpr actuals
+
+
+lintCmmLast :: BlockSet -> CmmNode O C -> CmmLint ()
+lintCmmLast labels node = case node of
+  CmmBranch id -> checkTarget id
+
+  CmmCondBranch e t f -> do
+            mapM_ checkTarget [t,f]
+            _ <- lintCmmExpr e
+            checkCond e
+
+  CmmSwitch e branches -> do
             mapM_ checkTarget $ catMaybes branches
-            erep <- lintCmmExpr platform e
+            erep <- lintCmmExpr e
             if (erep `cmmEqType_ignoring_ptrhood` bWord)
               then return ()
-              else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
-                               text " :: " <> ppr erep)
-          lint (CmmJump e _) = lintCmmExpr platform e >> return ()
-          lint (CmmReturn) = return ()
-          lint (CmmBranch id) = checkTarget id
-          checkTarget id = if setMember id labels then return ()
-                           else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
-
-lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
-lintTarget platform _      (CmmCallee e _) = do _ <- lintCmmExpr platform e
-                                                return ()
-lintTarget _        _      (CmmPrim _ Nothing) = return ()
-lintTarget platform labels (CmmPrim _ (Just stmts))
-    = mapM_ (lintCmmStmt platform labels) stmts
+              else cmmLintErr (text "switch scrutinee is not a word: " <>
+                               ppr e <> text " :: " <> ppr erep)
+
+  CmmCall { cml_target = target, cml_cont = cont } -> do
+          _ <- lintCmmExpr target
+          maybe (return ()) checkTarget cont
+
+  CmmForeignCall tgt _ args succ _ _ -> do
+          lintTarget tgt
+          mapM_ lintCmmExpr args
+          checkTarget succ
+ where
+  checkTarget id
+     | setMember id labels = return ()
+     | otherwise = cmmLintErr (\_ -> text "Branch to nonexistent id" <+> ppr id)
+
+
+lintTarget :: ForeignTarget -> CmmLint ()
+lintTarget (ForeignTarget e _) = lintCmmExpr e >> return ()
+lintTarget (PrimTarget {})     = return ()
 
 
 checkCond :: CmmExpr -> CmmLint ()
@@ -163,7 +185,7 @@ checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
 checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
 checkCond expr
     = cmmLintErr (hang (text "expression is not a conditional:") 2
-                       (ppr expr))
+                         (ppr expr))
 
 -- -----------------------------------------------------------------------------
 -- CmmLint monad
@@ -173,37 +195,36 @@ checkCond expr
 newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
 
 instance Monad CmmLint where
-  CmmLint m >>= k = CmmLint $ case m of 
-				Left e -> Left e
-				Right a -> unCL (k a)
-  return a = CmmLint (Right a)
+  CmmLint m >>= k = CmmLint $ \p -> case m p of
+                                      Left e -> Left e
+                                      Right a -> unCL (k a) p
+  return a = CmmLint (\_ -> Right a)
 
 cmmLintErr :: SDoc -> CmmLint a
-cmmLintErr msg = CmmLint (Left msg)
+cmmLintErr msg = CmmLint (\p -> Left (msg p))
 
 addLintInfo :: SDoc -> CmmLint a -> CmmLint a
-addLintInfo info thing = CmmLint $ 
-   case unCL thing of
-	Left err -> Left (hang info 2 err)
-	Right a  -> Right a
+addLintInfo info thing = CmmLint $ \p ->
+   case unCL thing p of
+        Left err -> Left (hang (info p) 2 err)
+        Right a  -> Right a
 
 cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
 cmmLintMachOpErr expr argsRep opExpectsRep
-     = cmmLintErr (text "in MachOp application: " $$ 
-					nest 2 (ppr expr) $$
-				        (text "op is expecting: " <+> ppr opExpectsRep) $$
-					(text "arguments provide: " <+> ppr argsRep))
+     = cmmLintErr (text "in MachOp application: " $$
+                   nest 2 (ppr  expr) $$
+                      (text "op is expecting: " <+> ppr opExpectsRep) $$
+                      (text "arguments provide: " <+> ppr argsRep))
 
-cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr :: CmmNode e x -> CmmType -> CmmType -> CmmLint a
 cmmLintAssignErr stmt e_ty r_ty
-  = cmmLintErr (text "in assignment: " $$ 
-		nest 2 (vcat [ppr stmt, 
-			      text "Reg ty:" <+> ppr r_ty,
-			      text "Rhs ty:" <+> ppr e_ty]))
-			 
-					
+  = cmmLintErr (text "in assignment: " $$
+                nest 2 (vcat [ppr stmt,
+                              text "Reg ty:" <+> ppr r_ty,
+                              text "Rhs ty:" <+> ppr e_ty]))
+
 
 cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
 cmmLintDubiousWordOffset expr
    = cmmLintErr (text "offset is not a multiple of words: " $$
-			nest 2 (ppr expr))
+                 nest 2 (ppr expr))
diff --git a/compiler/cmm/CmmLive.hs b/compiler/cmm/CmmLive.hs
index 9a5bb2d5ae..ac9c38b448 100644
--- a/compiler/cmm/CmmLive.hs
+++ b/compiler/cmm/CmmLive.hs
@@ -18,10 +18,9 @@ import Control.Monad
 import OptimizationFuel
 import PprCmmExpr ()
 
-import Compiler.Hoopl
+import Hoopl
 import Maybes
 import Outputable
-import UniqSet
 
 -----------------------------------------------------------------------------
 -- Calculating what variables are live on entry to a basic block
@@ -33,8 +32,10 @@ type CmmLive = RegSet
 -- | The dataflow lattice
 liveLattice :: DataflowLattice CmmLive
 liveLattice = DataflowLattice "live LocalReg's" emptyRegSet add
-    where add _ (OldFact old) (NewFact new) = case unionUniqSets old new of
-            join -> (changeIf $ sizeUniqSet join > sizeUniqSet old, join)
+    where add _ (OldFact old) (NewFact new) =
+               (changeIf $ sizeRegSet join > sizeRegSet old, join)
+              where !join = plusRegSet old new
+
 
 -- | A mapping from block labels to the variables live on entry
 type BlockEntryLiveness = BlockEnv CmmLive
@@ -43,16 +44,17 @@ type BlockEntryLiveness = BlockEnv CmmLive
 -- | Calculated liveness info for a CmmGraph
 -----------------------------------------------------------------------------
 
-cmmLiveness :: CmmGraph -> FuelUniqSM BlockEntryLiveness
+cmmLiveness :: CmmGraph -> BlockEntryLiveness
 cmmLiveness graph =
-  liftM check $ liftM snd $ dataflowPassBwd graph [] $ analBwd liveLattice xferLive
+  check $ dataflowAnalBwd graph [] $ analBwd liveLattice xferLive
   where entry = g_entry graph
-        check facts = noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts
+        check facts = noLiveOnEntry entry
+                        (expectJust "check" $ mapLookup entry facts) facts
 
 -- | On entry to the procedure, there had better not be any LocalReg's live-in.
 noLiveOnEntry :: BlockId -> CmmLive -> a -> a
 noLiveOnEntry bid in_fact x =
-  if isEmptyUniqSet in_fact then x
+  if nullRegSet in_fact then x
   else pprPanic "LocalReg's live-in to graph" (ppr bid <+> ppr in_fact)
 
 -- | The transfer equations use the traditional 'gen' and 'kill'
@@ -60,42 +62,42 @@ noLiveOnEntry bid in_fact x =
 gen  :: UserOfLocalRegs a    => a -> RegSet -> RegSet
 gen  a live = foldRegsUsed extendRegSet      live a
 kill :: DefinerOfLocalRegs a => a -> RegSet -> RegSet
-kill a live = foldRegsDefd delOneFromUniqSet live a
+kill a live = foldRegsDefd deleteFromRegSet live a
 
-gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a) => a -> CmmLive -> CmmLive
+gen_kill :: (DefinerOfLocalRegs a, UserOfLocalRegs a)
+          => a -> CmmLive -> CmmLive
 gen_kill a = gen a . kill a
 
 -- | The transfer function
--- EZY: Bits of this analysis are duplicated in CmmSpillReload, though
--- it's not really easy to efficiently reuse all of this.  Keep in mind
--- if you need to update this analysis.
 xferLive :: BwdTransfer CmmNode CmmLive
 xferLive = mkBTransfer3 fst mid lst
   where fst _ f = f
         mid :: CmmNode O O -> CmmLive -> CmmLive
         mid n f = gen_kill n f
         lst :: CmmNode O C -> FactBase CmmLive -> CmmLive
-        -- slightly inefficient: kill is unnecessary for emptyRegSet
-        lst n f = gen_kill n
-                $ case n of CmmCall{}        -> emptyRegSet
-                            CmmForeignCall{} -> emptyRegSet
-                            _                -> joinOutFacts liveLattice n f
+        lst n f = gen_kill n $ joinOutFacts liveLattice n f
 
 -----------------------------------------------------------------------------
 -- Removing assignments to dead variables
 -----------------------------------------------------------------------------
 
-removeDeadAssignments :: CmmGraph -> FuelUniqSM CmmGraph
+removeDeadAssignments :: CmmGraph -> FuelUniqSM (CmmGraph, BlockEnv CmmLive)
 removeDeadAssignments g =
-   liftM fst $ dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
-   where rewrites = deepBwdRw3 nothing middle nothing
-         -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
-         -- but GHC panics while compiling, see bug #4045.
+   dataflowPassBwd g [] $ analRewBwd liveLattice xferLive rewrites
+   where rewrites = mkBRewrite3 nothing middle nothing
+         -- SDM: no need for deepBwdRw here, we only rewrite to empty
+         -- Beware: deepBwdRw with one polymorphic function seems more
+         -- reasonable here, but GHC panics while compiling, see bug
+         -- #4045.
          middle :: CmmNode O O -> Fact O CmmLive -> CmmReplGraph O O
-         middle (CmmAssign (CmmLocal reg') _) live | not (reg' `elemRegSet` live) = return $ Just emptyGraph
+         middle (CmmAssign (CmmLocal reg') _) live
+                 | not (reg' `elemRegSet` live)
+                 = return $ Just emptyGraph
          -- XXX maybe this should be somewhere else...
-         middle (CmmAssign lhs (CmmReg rhs))   _ | lhs == rhs = return $ Just emptyGraph
-         middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs = return $ Just emptyGraph
+         middle (CmmAssign lhs (CmmReg rhs))   _ | lhs == rhs
+                 = return $ Just emptyGraph
+         middle (CmmStore lhs (CmmLoad rhs _)) _ | lhs == rhs
+                 = return $ Just emptyGraph
          middle _ _ = return Nothing
 
          nothing :: CmmNode e x -> Fact x CmmLive -> CmmReplGraph e x
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index 4844af9d9a..cd46794580 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -18,7 +18,7 @@ module CmmNode (
      CmmNode(..), ForeignHint(..), CmmFormal, CmmActual,
      UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..),
      mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf,
-     mapExpM, mapExpDeepM, wrapRecExpM
+     mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors
   ) where
 
 import CmmExpr
@@ -35,15 +35,17 @@ import Prelude hiding (succ)
 ------------------------
 -- CmmNode
 
+#define ULabel {-# UNPACK #-} !Label
+
 data CmmNode e x where
-  CmmEntry :: Label -> CmmNode C O
+  CmmEntry :: ULabel -> CmmNode C O
 
   CmmComment :: FastString -> CmmNode O O
 
-  CmmAssign :: CmmReg -> CmmExpr -> CmmNode O O
+  CmmAssign :: !CmmReg -> !CmmExpr -> CmmNode O O
     -- Assign to register
 
-  CmmStore :: CmmExpr -> CmmExpr -> CmmNode O O
+  CmmStore :: !CmmExpr -> !CmmExpr -> CmmNode O O
     -- Assign to memory location.  Size is
     -- given by cmmExprType of the rhs.
 
@@ -60,11 +62,12 @@ data CmmNode e x where
       --            bug for what can be put in arguments, see
       --            Note [Register Parameter Passing]
 
-  CmmBranch :: Label -> CmmNode O C  -- Goto another block in the same procedure
+  CmmBranch :: ULabel -> CmmNode O C
+                                   -- Goto another block in the same procedure
 
   CmmCondBranch :: {                 -- conditional branch
       cml_pred :: CmmExpr,
-      cml_true, cml_false :: Label
+      cml_true, cml_false :: ULabel
   } -> CmmNode O C
 
   CmmSwitch :: CmmExpr -> [Maybe Label] -> CmmNode O C -- Table branch
@@ -78,6 +81,11 @@ data CmmNode e x where
 
       cml_cont :: Maybe Label,
           -- Label of continuation (Nothing for return or tail call)
+          --
+          -- Note [Continuation BlockId]: these BlockIds are called
+          -- Continuation BlockIds, and are the only BlockIds that can
+          -- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or
+          -- (CmmStackSlot (Young b) _).
 
 -- ToDO: add this:
 --       cml_args_regs :: [GlobalReg],
@@ -117,7 +125,7 @@ data CmmNode e x where
       tgt   :: ForeignTarget,   -- call target and convention
       res   :: [CmmFormal],     -- zero or more results
       args  :: [CmmActual],     -- zero or more arguments; see Note [Register parameter passing]
-      succ  :: Label,           -- Label of continuation
+      succ  :: ULabel,          -- Label of continuation
       updfr :: UpdFrameOffset,  -- where the update frame is (for building infotable)
       intrbl:: Bool             -- whether or not the call is interruptible
   } -> CmmNode O C
@@ -218,14 +226,6 @@ data Convention
   | GC               -- Entry to the garbage collector: uses the node reg!
   | PrimOpCall       -- Calling prim ops
   | PrimOpReturn     -- Returning from prim ops
-  | Foreign          -- Foreign call/return
-        ForeignConvention
-  | Private
-        -- Used for control transfers within a (pre-CPS) procedure All
-        -- jump sites known, never pushed on the stack (hence no SRT)
-        -- You can choose whatever calling convention you please
-        -- (provided you make sure all the call sites agree)!
-        -- This data type eventually to be extended to record the convention.
   deriving( Eq )
 
 data ForeignConvention
@@ -283,37 +283,6 @@ instance DefinerOfLocalRegs (CmmNode e x) where
           fold f z n = foldRegsDefd f z n
 
 
-instance UserOfSlots (CmmNode e x) where
-  foldSlotsUsed f z n = case n of
-    CmmAssign _ expr -> fold f z expr
-    CmmStore addr rval -> fold f (fold f z addr) rval
-    CmmUnsafeForeignCall _ _ args -> fold f z args
-    CmmCondBranch expr _ _ -> fold f z expr
-    CmmSwitch expr _ -> fold f z expr
-    CmmCall {cml_target=tgt} -> fold f z tgt
-    CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args
-    _ -> z
-    where fold :: forall a b.
-                       UserOfSlots a =>
-                       (b -> SubArea -> b) -> b -> a -> b
-          fold f z n = foldSlotsUsed f z n
-
-instance UserOfSlots ForeignTarget where
-  foldSlotsUsed  f z (ForeignTarget e _) = foldSlotsUsed f z e
-  foldSlotsUsed _f z (PrimTarget _)      = z
-
-instance DefinerOfSlots (CmmNode e x) where
-  foldSlotsDefd f z n = case n of
-    CmmStore (CmmStackSlot a i) expr -> f z (a, i, widthInBytes $ typeWidth $ cmmExprType expr)
-    CmmForeignCall {res=res} -> fold f z $ map foreign_call_slot res
-    _ -> z
-    where
-          fold :: forall a b.
-                  DefinerOfSlots a =>
-                  (b -> SubArea -> b) -> b -> a -> b
-          fold f z n = foldSlotsDefd f z n
-          foreign_call_slot r = case widthInBytes $ typeWidth $ localRegType r of w -> (RegSlot r, w, w)
-
 -----------------------------------
 -- mapping Expr in CmmNode
 
@@ -416,4 +385,20 @@ foldExp f (CmmCall {cml_target=tgt}) z            = f tgt z
 foldExp f (CmmForeignCall {tgt=tgt, args=args}) z = foldr f (foldExpForeignTarget f tgt z) args
 
 foldExpDeep :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z
-foldExpDeep f = foldExp $ wrapRecExpf f
+foldExpDeep f = foldExp go
+  where -- go :: CmmExpr -> z -> z
+        go e@(CmmMachOp _ es) z = gos es $! f e z
+        go e@(CmmLoad addr _) z = go addr $! f e z
+        go e                  z = f e z
+
+        gos [] z = z
+        gos (e:es) z = gos es $! f e z
+
+-- -----------------------------------------------------------------------------
+
+mapSuccessors :: (Label -> Label) -> CmmNode O C -> CmmNode O C
+mapSuccessors f (CmmBranch bid)        = CmmBranch (f bid)
+mapSuccessors f (CmmCondBranch p y n)  = CmmCondBranch p (f y) (f n)
+mapSuccessors f (CmmSwitch e arms)     = CmmSwitch e (map (fmap f) arms)
+mapSuccessors f n = n
+
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 8cc18fc1ca..7c7ed393d9 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -145,7 +145,7 @@ To inline _smi:
 -}
 
 countUses :: UserOfLocalRegs a => a -> UniqFM Int
-countUses a = foldRegsUsed (\m r -> addToUFM m r (count m r + 1)) emptyUFM a
+countUses a = foldRegsUsed (\m r -> addToUFM_C (+) m r 1) emptyUFM a
   where count m r = lookupWithDefaultUFM m (0::Int) r
 
 cmmMiniInline :: DynFlags -> [CmmBasicBlock] -> [CmmBasicBlock]
@@ -157,25 +157,16 @@ cmmMiniInlineStmts :: DynFlags -> UniqFM Int -> [CmmStmt] -> [CmmStmt]
 cmmMiniInlineStmts _      _    [] = []
 cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr) : stmts)
         -- not used: just discard this assignment
-  | Nothing <- lookupUFM uses u
-  = cmmMiniInlineStmts dflags uses stmts
+  | 0 <- lookupWithDefaultUFM uses 0 u
+  = cmmMiniInlineStmts uses stmts
 
-        -- used (literal): try to inline at all the use sites
-  | Just n <- lookupUFM uses u, isLit expr
-  =
-     ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
-     case lookForInlineLit u expr stmts of
-         (m, stmts')
-             | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
-             | otherwise ->
-                 stmt : cmmMiniInlineStmts dflags (adjustUFM (\x -> x - m) uses u) stmts'
-
-        -- used (foldable to literal): try to inline at all the use sites
+        -- used (foldable to small thing): try to inline at all the use sites
   | Just n <- lookupUFM uses u,
-    e@(CmmLit _) <- wrapRecExp foldExp expr
+    e <- wrapRecExp foldExp expr,
+    isTiny e
   =
-     ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
-     case lookForInlineLit u e stmts of
+     ncgDebugTrace ("nativeGen: inlining " ++ showSDoc (pprStmt stmt)) $
+     case lookForInlineMany u e stmts of
          (m, stmts')
              | n == m -> cmmMiniInlineStmts dflags (delFromUFM uses u) stmts'
              | otherwise ->
@@ -188,6 +179,11 @@ cmmMiniInlineStmts dflags uses (stmt@(CmmAssign (CmmLocal (LocalReg u _)) expr)
      ncgDebugTrace ("nativeGen: inlining " ++ showSDoc dflags (pprStmt stmt)) $
      cmmMiniInlineStmts dflags uses stmts'
  where
+  isTiny (CmmLit _) = True
+  isTiny (CmmReg (CmmGlobal _)) = True
+         -- not CmmLocal: that might invalidate the usage analysis results
+  isTiny _ = False
+
   platform = targetPlatform dflags
   foldExp (CmmMachOp op args) = cmmMachOpFold platform op args
   foldExp e = e
@@ -201,26 +197,28 @@ cmmMiniInlineStmts platform uses (stmt:stmts)
 -- register, and a list of statements.  Inlines the expression at all
 -- use sites of the register.  Returns the number of substituations
 -- made and the, possibly modified, list of statements.
-lookForInlineLit :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
-lookForInlineLit _ _ [] = (0, [])
-lookForInlineLit u expr stmts@(stmt : rest)
-  | Just n <- lookupUFM (countUses stmt) u
-  = case lookForInlineLit u expr rest of
-      (m, stmts) -> let z = n + m
-                    in z `seq` (z, inlineStmt u expr stmt : stmts)
-
-  | ok_to_skip
-  = case lookForInlineLit u expr rest of
+lookForInlineMany :: Unique -> CmmExpr -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany u expr stmts = lookForInlineMany' u expr regset stmts
+    where regset = foldRegsUsed extendRegSet emptyRegSet expr
+
+lookForInlineMany' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> (Int, [CmmStmt])
+lookForInlineMany' _ _ _ [] = (0, [])
+lookForInlineMany' u expr regset stmts@(stmt : rest)
+  | Just n <- lookupUFM (countUses stmt) u, okToInline expr stmt
+  = let stmt' = inlineStmt u expr stmt in
+    if okToSkip stmt' u expr regset
+       then case lookForInlineMany' u expr regset rest of
+                       (m, stmts) -> let z = n + m
+                                     in z `seq` (z, stmt' : stmts)
+       else (n, stmt' : rest)
+
+  | okToSkip stmt u expr regset
+  = case lookForInlineMany' u expr regset rest of
       (n, stmts) -> (n, stmt : stmts)
 
   | otherwise
   = (0, stmts)
-  where
-    -- We skip over assignments to registers, unless the register
-    -- being assigned to is the one we're inlining.
-    ok_to_skip = case stmt of
-        CmmAssign (CmmLocal (LocalReg u' _)) _ | u' == u -> False
-        _other -> True
+
 
 lookForInline :: Unique -> CmmExpr -> [CmmStmt] -> Maybe [CmmStmt]
 lookForInline u expr stmts = lookForInline' u expr regset stmts
@@ -229,10 +227,10 @@ lookForInline u expr stmts = lookForInline' u expr regset stmts
 lookForInline' :: Unique -> CmmExpr -> RegSet -> [CmmStmt] -> Maybe [CmmStmt]
 lookForInline' _ _    _      [] = panic "lookForInline' []"
 lookForInline' u expr regset (stmt : rest)
-  | Just 1 <- lookupUFM (countUses stmt) u, ok_to_inline
+  | Just 1 <- lookupUFM (countUses stmt) u, okToInline expr stmt
   = Just (inlineStmt u expr stmt : rest)
 
-  | ok_to_skip
+  | okToSkip stmt u expr regset
   = case lookForInline' u expr regset rest of
            Nothing    -> Nothing
            Just stmts -> Just (stmt:stmts)
@@ -240,31 +238,36 @@ lookForInline' u expr regset (stmt : rest)
   | otherwise 
   = Nothing
 
-  where
-        -- we don't inline into CmmCall if the expression refers to global
-        -- registers.  This is a HACK to avoid global registers clashing with
-        -- C argument-passing registers, really the back-end ought to be able
-        -- to handle it properly, but currently neither PprC nor the NCG can
-        -- do it.  See also CgForeignCall:load_args_into_temps.
-    ok_to_inline = case stmt of
-                     CmmCall{} -> hasNoGlobalRegs expr
-                     _ -> True
-
-   -- Expressions aren't side-effecting.  Temporaries may or may not
-   -- be single-assignment depending on the source (the old code
-   -- generator creates single-assignment code, but hand-written Cmm
-   -- and Cmm from the new code generator is not single-assignment.)
-   -- So we do an extra check to make sure that the register being
-   -- changed is not one we were relying on.  I don't know how much of a
-   -- performance hit this is (we have to create a regset for every
-   -- instruction.) -- EZY
-    ok_to_skip = case stmt of
-                 CmmNop -> True
-                 CmmComment{} -> True
-                 CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
-                 CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
-                 _other -> False
 
+-- we don't inline into CmmCall if the expression refers to global
+-- registers.  This is a HACK to avoid global registers clashing with
+-- C argument-passing registers, really the back-end ought to be able
+-- to handle it properly, but currently neither PprC nor the NCG can
+-- do it.  See also CgForeignCall:load_args_into_temps.
+okToInline :: CmmExpr -> CmmStmt -> Bool
+okToInline expr CmmCall{} = hasNoGlobalRegs expr
+okToInline _ _ = True
+
+-- Expressions aren't side-effecting.  Temporaries may or may not
+-- be single-assignment depending on the source (the old code
+-- generator creates single-assignment code, but hand-written Cmm
+-- and Cmm from the new code generator is not single-assignment.)
+-- So we do an extra check to make sure that the register being
+-- changed is not one we were relying on.  I don't know how much of a
+-- performance hit this is (we have to create a regset for every
+-- instruction.) -- EZY
+okToSkip stmt u expr regset
+   = case stmt of
+         CmmNop -> True
+         CmmComment{} -> True
+         CmmAssign (CmmLocal r@(LocalReg u' _)) _rhs | u' /= u && not (r `elemRegSet` regset) -> True
+         CmmAssign g@(CmmGlobal _) _rhs -> not (g `regUsedIn` expr)
+         CmmStore _ _ -> not_a_load expr
+         _other -> False
+  where
+    not_a_load (CmmMachOp _ args) = all not_a_load args
+    not_a_load (CmmLoad _ _) = False
+    not_a_load _ = True
 
 inlineStmt :: Unique -> CmmExpr -> CmmStmt -> CmmStmt
 inlineStmt u a (CmmAssign r e) = CmmAssign r (inlineExpr u a e)
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 075ed22ea9..f46d49e022 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -230,35 +230,31 @@ lits	:: { [ExtFCode CmmExpr] }
 
 cmmproc :: { ExtCode }
 -- TODO: add real SRT/info tables to parsed Cmm
-	: info maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
-		{ do ((entry_ret_label, info, live, formals, gc_block, frame), stmts) <-
+        : info maybe_formals_without_hints '{' body '}'
+                { do ((entry_ret_label, info, live, formals), stmts) <-
 		       getCgStmtsEC' $ loopDecls $ do {
 		         (entry_ret_label, info, live) <- $1;
 		         formals <- sequence $2;
-		         gc_block <- $3;
-		         frame <- $4;
-		         $6;
-		         return (entry_ret_label, info, live, formals, gc_block, frame) }
+                         $4;
+                         return (entry_ret_label, info, live, formals) }
 		     blks <- code (cgStmtsToBlocks stmts)
-		     code (emitInfoTableAndCode entry_ret_label (CmmInfo gc_block frame info) formals blks) }
+                     code (emitInfoTableAndCode entry_ret_label info formals blks) }
 
 	| info maybe_formals_without_hints ';'
 		{ do (entry_ret_label, info, live) <- $1;
 		     formals <- sequence $2;
-		     code (emitInfoTableAndCode entry_ret_label (CmmInfo Nothing Nothing info) formals []) }
+                     code (emitInfoTableAndCode entry_ret_label info formals []) }
 
-	| NAME maybe_formals_without_hints maybe_gc_block maybe_frame '{' body '}'
+        | NAME maybe_formals_without_hints '{' body '}'
 		{% withThisPackage $ \pkg ->
 		   do	newFunctionName $1 pkg
-		   	((formals, gc_block, frame), stmts) <-
+                        (formals, stmts) <-
 			 	getCgStmtsEC' $ loopDecls $ do {
 		          		formals <- sequence $2;
-		          		gc_block <- $3;
-			  		frame <- $4;
-		          		$6;
-		          		return (formals, gc_block, frame) }
+                                        $4;
+                                        return formals }
 			blks <- code (cgStmtsToBlocks stmts)
-			code (emitProc (CmmInfo gc_block frame CmmNonInfoTable) (mkCmmCodeLabel pkg $1) formals blks) }
+                        code (emitProc CmmNonInfoTable (mkCmmCodeLabel pkg $1) formals blks) }
 
 info	:: { ExtFCode (CLabel, CmmInfoTable, [Maybe LocalReg]) }
 	: 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')'
@@ -599,18 +595,7 @@ formals_without_hints :: { [ExtFCode LocalReg] }
 formal_without_hint :: { ExtFCode LocalReg }
 	: type NAME		{ newLocal $1 $2 }
 
-maybe_frame :: { ExtFCode (Maybe UpdateFrame) }
-	: {- empty -}			{ return Nothing }
-	| 'jump' expr '(' exprs0 ')'	{ do { target <- $2;
-					       args <- sequence $4;
-					       return $ Just (UpdateFrame target args) } }
-
-maybe_gc_block :: { ExtFCode (Maybe BlockId) }
-	: {- empty -}			{ return Nothing }
-	| 'goto' NAME
-		{ do l <- lookupLabel $2; return (Just l) }
-
-type	:: { CmmType }
+type    :: { CmmType }
 	: 'bits8'		{ b8 }
 	| typenot8		{ $1 }
 
@@ -1073,7 +1058,8 @@ parseCmmFile dflags filename = do
         let msg = mkPlainErrMsg dflags span err
         return ((emptyBag, unitBag msg), Nothing)
     POk pst code -> do
-        cmm <- initC dflags no_module (getCmm (unEC code initEnv [] >> return ()))
+        st <- initC
+        let (cmm,_) = runC dflags no_module st (getCmm (unEC code initEnv [] >> return ()))
         let ms = getMessages pst
         if (errorsFound dflags ms)
          then return (ms, Nothing)
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index 7361bbf385..adc27ab1ff 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -11,15 +11,17 @@ module CmmPipeline (
 
 import CLabel
 import Cmm
+import CmmLint
 import CmmLive
 import CmmBuildInfoTables
 import CmmCommonBlockElim
 import CmmProcPoint
-import CmmSpillReload
 import CmmRewriteAssignments
-import CmmStackLayout
 import CmmContFlowOpt
 import OptimizationFuel
+import CmmLayoutStack
+import Hoopl
+import CmmUtils
 
 import DynFlags
 import ErrUtils
@@ -28,6 +30,8 @@ import Data.Maybe
 import Control.Monad
 import Data.Map (Map)
 import qualified Data.Map as Map
+import Data.Set (Set)
+import qualified Data.Set as Set
 import Outputable
 import StaticFlags
 
@@ -53,32 +57,28 @@ import StaticFlags
 --    we actually need to do the initial pass.
 cmmPipeline  :: HscEnv -- Compilation env including
                        -- dynamic flags: -dcmm-lint -ddump-cps-cmm
-             -> (TopSRT, [CmmGroup])    -- SRT table and accumulating list of compiled procs
+             -> TopSRT     -- SRT table and accumulating list of compiled procs
              -> CmmGroup             -- Input C-- with Procedures
-             -> IO (TopSRT, [CmmGroup]) -- Output CPS transformed C--
-cmmPipeline hsc_env (topSRT, rst) prog =
+             -> IO (TopSRT, CmmGroup) -- Output CPS transformed C--
+cmmPipeline hsc_env topSRT prog =
   do let dflags = hsc_dflags hsc_env
      --
      showPass dflags "CPSZ"
 
-     let tops = runCmmContFlowOpts prog
-     (cafEnvs, tops) <- liftM unzip $ mapM (cpsTop hsc_env) tops
+     (cafEnvs, tops) <- {-# SCC "tops" #-} liftM unzip $ mapM (cpsTop hsc_env) prog
      -- tops :: [[(CmmDecl,CAFSet]]  (one list per group)
 
-     let topCAFEnv = mkTopCAFInfo (concat cafEnvs)
+     let topCAFEnv = {-# SCC "topCAFEnv" #-} mkTopCAFInfo (concat cafEnvs)
 
      -- folding over the groups
-     (topSRT, tops) <- foldM (toTops hsc_env topCAFEnv) (topSRT, []) tops
+     (topSRT, tops) <- {-# SCC "toTops" #-} foldM (toTops hsc_env topCAFEnv) (topSRT, []) tops
 
      let cmms :: CmmGroup
          cmms = reverse (concat tops)
 
      dumpIfSet_dyn dflags Opt_D_dump_cps_cmm "Post CPS Cmm" (ppr cmms)
 
-     -- SRT is not affected by control flow optimization pass
-     let prog' = runCmmContFlowOpts cmms
-
-     return (topSRT, prog' : rst)
+     return (topSRT, cmms)
 
 {- [Note global fuel]
 ~~~~~~~~~~~~~~~~~~~~~
@@ -92,75 +92,63 @@ global to one compiler session.
 -- -ddump-cmmz
 
 cpsTop :: HscEnv -> CmmDecl -> IO ([(CLabel, CAFSet)], [(CAFSet, CmmDecl)])
-cpsTop _ p@(CmmData {}) = return ([], [(Map.empty, p)])
+cpsTop _ p@(CmmData {}) = return ([], [(Set.empty, p)])
 cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
     do
-       -- Why bother doing these early: dualLivenessWithInsertion,
-       -- insertLateReloads, rewriteAssignments?
+       ----------- Control-flow optimisations ---------------
+       g <- {-# SCC "cmmCfgOpts(1)" #-} return $ cmmCfgOpts g
+       dump Opt_D_dump_cmmz_cfg "Post control-flow optimsations" g
 
        ----------- Eliminate common blocks -------------------
-       g <- return $ elimCommonBlocks g
+       g <- {-# SCC "elimCommonBlocks" #-} return $ elimCommonBlocks g
        dump Opt_D_dump_cmmz_cbe "Post common block elimination" g
-       -- Any work storing block Labels must be performed _after_ elimCommonBlocks
+       -- Any work storing block Labels must be performed _after_
+       -- elimCommonBlocks
 
        ----------- Proc points -------------------
-       let callPPs = callProcPoints g
-       procPoints <- run $ minimalProcPointSet (targetPlatform dflags) callPPs g
-       g <- run $ addProcPointProtocols callPPs procPoints g
-       dump Opt_D_dump_cmmz_proc "Post Proc Points Added" g
-
-       ----------- Spills and reloads -------------------
-       g <- run $ dualLivenessWithInsertion procPoints g
-       dump Opt_D_dump_cmmz_spills "Post spills and reloads" g
-
-       ----------- Sink and inline assignments -------------------
-       g <- runOptimization $ rewriteAssignments platform g
-       dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
-
-       ----------- Eliminate dead assignments -------------------
-       g <- runOptimization $ removeDeadAssignments g
-       dump Opt_D_dump_cmmz_dead "Post remove dead assignments" g
-
-       ----------- Zero dead stack slots (Debug only) ---------------
-       -- Debugging: stubbing slots on death can cause crashes early
-       g <- if opt_StubDeadValues
-                then run $ stubSlotsOnDeath g
-                else return g
-       dump Opt_D_dump_cmmz_stub "Post stub dead stack slots" g
-
-       --------------- Stack layout ----------------
-       slotEnv <- run $ liveSlotAnal g
-       let spEntryMap = getSpEntryMap entry_off g
-       mbpprTrace "live slot analysis results: " (ppr slotEnv) $ return ()
-       let areaMap = layout procPoints spEntryMap slotEnv entry_off g
-       mbpprTrace "areaMap" (ppr areaMap) $ return ()
-
-       ------------  Manifest the stack pointer --------
-       g  <- run $ manifestSP spEntryMap areaMap entry_off g
-       dump Opt_D_dump_cmmz_sp "Post manifestSP" g
-       -- UGH... manifestSP can require updates to the procPointMap.
-       -- We can probably do something quicker here for the update...
+       let callPPs = {-# SCC "callProcPoints" #-} callProcPoints g
+       procPoints <- {-# SCC "minimalProcPointSet" #-} run $
+                     minimalProcPointSet (targetPlatform dflags) callPPs g
+
+       ----------- Layout the stack and manifest Sp ---------------
+       -- (also does: removeDeadAssignments, and lowerSafeForeignCalls)
+       (g, stackmaps) <- {-# SCC "layoutStack" #-}
+                         run $ cmmLayoutStack procPoints entry_off g
+       dump Opt_D_dump_cmmz_sp "Layout Stack" g
+
+       g <- {-# SCC "sink" #-} run $ cmmSink g
+       dump Opt_D_dump_cmmz_rewrite "Sink assignments" g
+
+--       ----------- Sink and inline assignments -------------------
+--       g <- {-# SCC "rewriteAssignments" #-} runOptimization $
+--            rewriteAssignments platform g
+--       dump Opt_D_dump_cmmz_rewrite "Post rewrite assignments" g
 
        ------------- Split into separate procedures ------------
-       procPointMap  <- run $ procPointAnalysis procPoints g
-       dump Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
-       gs <- run $ splitAtProcPoints l callPPs procPoints procPointMap
-                                       (CmmProc h l g)
-       mapM_ (dump Opt_D_dump_cmmz_split "Post splitting") gs
-
-       ------------- More CAFs and foreign calls ------------
-       cafEnv <- run $ cafAnal g
-       let localCAFs = catMaybes $ map (localCAFInfo cafEnv) gs
+       procPointMap  <- {-# SCC "procPointAnalysis" #-} run $
+                        procPointAnalysis procPoints g
+       dumpWith dflags Opt_D_dump_cmmz_procmap "procpoint map" procPointMap
+       gs <- {-# SCC "splitAtProcPoints" #-} run $
+             splitAtProcPoints l callPPs procPoints procPointMap (CmmProc h l g)
+       dumps Opt_D_dump_cmmz_split "Post splitting" gs
+
+       ------------- More CAFs ------------------------------
+       let cafEnv = {-# SCC "cafAnal" #-} cafAnal platform g
+       let localCAFs = {-# SCC "localCAFs" #-} catMaybes $ map (localCAFInfo platform cafEnv) gs
        mbpprTrace "localCAFs" (ppr localCAFs) $ return ()
 
-       gs <- run $ mapM (lowerSafeForeignCalls areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz_lower "Post lowerSafeForeignCalls") gs
-
        -- NO MORE GRAPH TRANSFORMATION AFTER HERE -- JUST MAKING INFOTABLES
-       gs <- return $ map (setInfoTableStackMap slotEnv areaMap) gs
-       mapM_ (dump Opt_D_dump_cmmz_info "after setInfoTableStackMap") gs
-       gs <- return $ map (bundleCAFs cafEnv) gs
-       mapM_ (dump Opt_D_dump_cmmz_cafs "after bundleCAFs") gs
+       gs <- {-# SCC "setInfoTableStackMap" #-}
+             return $ map (setInfoTableStackMap stackmaps) gs
+       dumps Opt_D_dump_cmmz_info "after setInfoTableStackMap" gs
+
+       ----------- Control-flow optimisations ---------------
+       gs <- {-# SCC "cmmCfgOpts(2)" #-} return $ map cmmCfgOptsProc gs
+       dumps Opt_D_dump_cmmz_cfg "Post control-flow optimsations" gs
+
+       gs <- {-# SCC "bundleCAFs" #-} return $ map (bundleCAFs cafEnv) gs
+       dumps Opt_D_dump_cmmz_cafs "after bundleCAFs" gs
+
        return (localCAFs, gs)
 
               -- gs        :: [ (CAFSet, CmmDecl) ]
@@ -168,21 +156,40 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
 
   where dflags = hsc_dflags hsc_env
         platform = targetPlatform dflags
-        mbpprTrace x y z = if dopt Opt_D_dump_cmmz dflags then pprTrace x y z else z
-        dump f = dumpWith ppr f
-        dumpWith pprFun f txt g = do
-            -- ToDo: No easy way of say "dump all the cmmz, *and* split
-            -- them into files."  Also, -ddump-cmmz doesn't play nicely
-            -- with -ddump-to-file, since the headers get omitted.
-            dumpIfSet_dyn dflags f txt (pprFun g)
-            when (not (dopt f dflags)) $
-                dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (pprFun g)
+        mbpprTrace x y z | dopt Opt_D_dump_cmmz dflags = pprTrace x y z
+                         | otherwise = z
+        dump = dumpGraph dflags
+
+        dumps flag name
+           = mapM_ (dumpWith dflags flag name)
+
         -- Runs a required transformation/analysis
         run = runInfiniteFuelIO (hsc_OptFuel hsc_env)
         -- Runs an optional transformation/analysis (and should
         -- thus be subject to optimization fuel)
         runOptimization = runFuelIO (hsc_OptFuel hsc_env)
 
+
+dumpGraph :: DynFlags -> DynFlag -> String -> CmmGraph -> IO ()
+dumpGraph dflags flag name g = do
+  when (dopt Opt_DoCmmLinting dflags) $ do_lint g
+  dumpWith dflags flag name g
+ where
+  do_lint g = case cmmLintGraph (targetPlatform dflags) g of
+                 Just err -> do { printDump err
+                                ; ghcExit dflags 1
+                                }
+                 Nothing  -> return ()
+
+dumpWith :: Outputable a => DynFlags -> DynFlag -> String -> a -> IO ()
+dumpWith dflags flag txt g = do
+         -- ToDo: No easy way of say "dump all the cmmz, *and* split
+         -- them into files."  Also, -ddump-cmmz doesn't play nicely
+         -- with -ddump-to-file, since the headers get omitted.
+   dumpIfSet_dyn dflags flag txt (ppr g)
+   when (not (dopt flag dflags)) $
+      dumpIfSet_dyn dflags Opt_D_dump_cmmz txt (ppr g)
+
 -- This probably belongs in CmmBuildInfoTables?
 -- We're just finishing the job here: once we know what CAFs are defined
 -- in non-static closures, we can build the SRTs.
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index f50d850b3a..8dda51b9b7 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -4,7 +4,7 @@
 module CmmProcPoint
     ( ProcPointSet, Status(..)
     , callProcPoints, minimalProcPointSet
-    , addProcPointProtocols, splitAtProcPoints, procPointAnalysis
+    , splitAtProcPoints, procPointAnalysis
     )
 where
 
@@ -28,7 +28,7 @@ import Platform
 import UniqSet
 import UniqSupply
 
-import Compiler.Hoopl
+import Hoopl
 
 import qualified Data.Map as Map
 
@@ -103,34 +103,50 @@ instance Outputable Status where
                     (hsep $ punctuate comma $ map ppr $ setElems ps)
   ppr ProcPoint = text "<procpt>"
 
-lattice :: DataflowLattice Status
-lattice = DataflowLattice "direct proc-point reachability" unreached add_to
-    where unreached = ReachedBy setEmpty
-          add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
-          add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint) -- because of previous case
-          add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) =
-              let union = setUnion p' p
-              in  if setSize union > setSize p then (SomeChange, ReachedBy union)
-                                               else (NoChange, ReachedBy p)
 --------------------------------------------------
+-- Proc point analysis
+
+procPointAnalysis :: ProcPointSet -> CmmGraph -> FuelUniqSM (BlockEnv Status)
+-- Once you know what the proc-points are, figure out
+-- what proc-points each block is reachable from
+procPointAnalysis procPoints g =
+  -- pprTrace "procPointAnalysis" (ppr procPoints) $
+  dataflowAnalFwdBlocks g initProcPoints $ analFwd lattice forward
+  where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+
 -- transfer equations
 
 forward :: FwdTransfer CmmNode Status
-forward = mkFTransfer3 first middle ((mkFactBase lattice . ) . last)
-    where first :: CmmNode C O -> Status -> Status
-          first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
-          first  _ x = x
+forward = mkFTransfer3 first middle last
+    where
+      first :: CmmNode C O -> Status -> Status
+      first (CmmEntry id) ProcPoint = ReachedBy $ setSingleton id
+      first  _ x = x
 
-          middle _ x = x
+      middle _ x = x
 
-          last :: CmmNode O C -> Status -> [(Label, Status)]
-          last (CmmCall {cml_cont = Just k}) _ = [(k, ProcPoint)]
-          last (CmmForeignCall {succ = k})   _ = [(k, ProcPoint)]
-          last l x = map (\id -> (id, x)) (successors l)
+      last :: CmmNode O C -> Status -> FactBase Status
+      last l x = mkFactBase lattice $ map (\id -> (id, x)) (successors l)
 
--- It is worth distinguishing two sets of proc points:
--- those that are induced by calls in the original graph
--- and those that are introduced because they're reachable from multiple proc points.
+lattice :: DataflowLattice Status
+lattice = DataflowLattice "direct proc-point reachability" unreached add_to
+    where unreached = ReachedBy setEmpty
+          add_to _ (OldFact ProcPoint) _ = (NoChange, ProcPoint)
+          add_to _ _ (NewFact ProcPoint) = (SomeChange, ProcPoint)
+                       -- because of previous case
+          add_to _ (OldFact (ReachedBy p)) (NewFact (ReachedBy p'))
+             | setSize union > setSize p = (SomeChange, ReachedBy union)
+             | otherwise                 = (NoChange, ReachedBy p)
+           where
+             union = setUnion p' p
+
+----------------------------------------------------------------------
+
+-- It is worth distinguishing two sets of proc points: those that are
+-- induced by calls in the original graph and those that are
+-- introduced because they're reachable from multiple proc points.
+--
+-- Extract the set of Continuation BlockIds, see Note [Continuation BlockIds].
 callProcPoints      :: CmmGraph -> ProcPointSet
 callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
   where add :: CmmBlock -> BlockSet -> BlockSet
@@ -139,21 +155,17 @@ callProcPoints g = foldGraphBlocks add (setSingleton (g_entry g)) g
                       CmmForeignCall {succ=k}     -> setInsert k set
                       _ -> set
 
-minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph -> FuelUniqSM ProcPointSet
+minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph
+                    -> FuelUniqSM ProcPointSet
 -- Given the set of successors of calls (which must be proc-points)
 -- figure out the minimal set of necessary proc-points
-minimalProcPointSet platform callProcPoints g = extendPPSet platform g (postorderDfs g) callProcPoints
-
-procPointAnalysis :: ProcPointSet -> CmmGraph -> FuelUniqSM (BlockEnv Status)
--- Once you know what the proc-points are, figure out
--- what proc-points each block is reachable from
-procPointAnalysis procPoints g =
-  liftM snd $ dataflowPassFwd g initProcPoints $ analFwd lattice forward
-  where initProcPoints = [(id, ProcPoint) | id <- setElems procPoints]
+minimalProcPointSet platform callProcPoints g
+  = extendPPSet platform g (postorderDfs g) callProcPoints
 
 extendPPSet :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> FuelUniqSM ProcPointSet
 extendPPSet platform g blocks procPoints =
     do env <- procPointAnalysis procPoints g
+       -- pprTrace "extensPPSet" (ppr env) $ return ()
        let add block pps = let id = entryLabel block
                            in  case mapLookup id env of
                                  Just ProcPoint -> setInsert id pps
@@ -179,183 +191,13 @@ extendPPSet platform g blocks procPoints =
            pps -> extendPPSet g blocks
                     (foldl extendBlockSet procPoints' pps)
 -}
-       case newPoint of Just id ->
-                          if setMember id procPoints' then panic "added old proc pt"
-                          else extendPPSet platform g blocks (setInsert id procPoints')
-                        Nothing -> return procPoints'
-
-
-------------------------------------------------------------------------
---                    Computing Proc-Point Protocols                  --
-------------------------------------------------------------------------
+       case newPoint of
+         Just id ->
+             if setMember id procPoints'
+                then panic "added old proc pt"
+                else extendPPSet platform g blocks (setInsert id procPoints')
+         Nothing -> return procPoints'
 
-{-
-
-There is one major trick, discovered by Michael Adams, which is that
-we want to choose protocols in a way that enables us to optimize away
-some continuations.  The optimization is very much like branch-chain
-elimination, except that it involves passing results as well as
-control.  The idea is that if a call's continuation k does nothing but
-CopyIn its results and then goto proc point P, the call's continuation
-may be changed to P, *provided* P's protocol is identical to the
-protocol for the CopyIn.  We choose protocols to make this so.
-
-Here's an explanatory example; we begin with the source code (lines
-separate basic blocks):
-
-  ..1..;
-  x, y = g();
-  goto P;
-  -------
-  P: ..2..;
-
-Zipperization converts this code as follows:
-
-  ..1..;
-  call g() returns to k;
-  -------
-  k: CopyIn(x, y);
-     goto P;
-  -------
-  P: ..2..;
-
-What we'd like to do is assign P the same CopyIn protocol as k, so we
-can eliminate k:
-
-  ..1..;
-  call g() returns to P;
-  -------
-  P: CopyIn(x, y); ..2..;
-
-Of course, P may be the target of more than one continuation, and
-different continuations may have different protocols.  Michael Adams
-implemented a voting mechanism, but he thinks a simple greedy
-algorithm would be just as good, so that's what we do.
-
--}
-
-data Protocol = Protocol Convention [CmmFormal] Area
-  deriving Eq
-instance Outputable Protocol where
-  ppr (Protocol c fs a) = text "Protocol" <+> ppr c <+> ppr fs <+> ppr a
-
--- | Function 'optimize_calls' chooses protocols only for those proc
--- points that are relevant to the optimization explained above.
--- The others are assigned by 'add_unassigned', which is not yet clever.
-
-addProcPointProtocols :: ProcPointSet -> ProcPointSet -> CmmGraph -> FuelUniqSM CmmGraph
-addProcPointProtocols callPPs procPoints g =
-  do liveness <- cmmLiveness g
-     (protos, g') <- optimize_calls liveness g
-     blocks'' <- add_CopyOuts protos procPoints g'
-     return $ ofBlockMap (g_entry g) blocks''
-    where optimize_calls liveness g =  -- see Note [Separate Adams optimization]
-            do let (protos, blocks') =
-                       foldGraphBlocks maybe_add_call (mapEmpty, mapEmpty) g
-                   protos' = add_unassigned liveness procPoints protos
-               let g' = ofBlockMap (g_entry g) (add_CopyIns callPPs protos' blocks')
-               return (protos', removeUnreachableBlocks g')
-          maybe_add_call :: CmmBlock -> (BlockEnv Protocol, BlockEnv CmmBlock)
-                         -> (BlockEnv Protocol, BlockEnv CmmBlock)
-          -- ^ If the block is a call whose continuation goes to a proc point
-          -- whose protocol either matches the continuation's or is not yet set,
-          -- redirect the call (cf 'newblock') and set the protocol if necessary
-          maybe_add_call block (protos, blocks) =
-              case lastNode block of
-                CmmCall tgt (Just k) args res s
-                    | Just proto <- mapLookup k protos,
-                      Just pee   <- branchesToProcPoint k
-                    -> let newblock = replaceLastNode block (CmmCall tgt (Just pee)
-                                                                     args res s)
-                           changed_blocks   = insertBlock newblock blocks
-                           unchanged_blocks = insertBlock block    blocks
-                       in case mapLookup pee protos of
-                            Nothing -> (mapInsert pee proto protos, changed_blocks)
-                            Just proto' ->
-                              if proto == proto' then (protos, changed_blocks)
-                              else (protos, unchanged_blocks)
-                _ -> (protos, insertBlock block blocks)
-
-          branchesToProcPoint :: BlockId -> Maybe BlockId
-          -- ^ Tells whether the named block is just a branch to a proc point
-          branchesToProcPoint id =
-              let block = mapLookup id (toBlockMap g) `orElse`
-                                    panic "branch out of graph"
-              in case blockToNodeList block of
-                   (_, [], JustC (CmmBranch pee)) | setMember pee procPoints -> Just pee
-                   _                                                         -> Nothing
-
--- | For now, following a suggestion by Ben Lippmeier, we pass all
--- live variables as arguments, hoping that a clever register
--- allocator might help.
-
-add_unassigned :: BlockEnv CmmLive -> ProcPointSet -> BlockEnv Protocol ->
-                  BlockEnv Protocol
-add_unassigned = pass_live_vars_as_args
-
-pass_live_vars_as_args :: BlockEnv CmmLive -> ProcPointSet ->
-                          BlockEnv Protocol -> BlockEnv Protocol
-pass_live_vars_as_args _liveness procPoints protos = protos'
-  where protos' = setFold addLiveVars protos procPoints
-        addLiveVars :: BlockId -> BlockEnv Protocol -> BlockEnv Protocol
-        addLiveVars id protos =
-            case mapLookup id protos of
-              Just _  -> protos
-              Nothing -> let live = emptyRegSet
-                                    --lookupBlockEnv _liveness id `orElse`
-                                    --panic ("no liveness at block " ++ show id)
-                             formals = uniqSetToList live
-                             prot = Protocol Private formals $ CallArea $ Young id
-                         in  mapInsert id prot protos
-
-
--- | Add copy-in instructions to each proc point that did not arise from a call
--- instruction. (Proc-points that arise from calls already have their copy-in instructions.)
-
-add_CopyIns :: ProcPointSet -> BlockEnv Protocol -> BlockEnv CmmBlock -> BlockEnv CmmBlock
-add_CopyIns callPPs protos blocks = mapFold maybe_insert_CopyIns mapEmpty blocks
-    where maybe_insert_CopyIns block blocks
-             | not $ setMember bid callPPs
-             , Just (Protocol c fs _area) <- mapLookup bid protos
-             = let nodes     = copyInSlot c fs
-                   (h, m, l) = blockToNodeList block
-               in insertBlock (blockOfNodeList (h, nodes ++ m, l)) blocks
-             | otherwise = insertBlock block blocks
-           where bid = entryLabel block
-
-
--- | Add a CopyOut node before each procpoint.
--- If the predecessor is a call, then the copy outs should already be done by the callee.
--- Note: If we need to add copy-out instructions, they may require stack space,
--- so we accumulate a map from the successors to the necessary stack space,
--- then update the successors after we have finished inserting the copy-outs.
-
-add_CopyOuts :: BlockEnv Protocol -> ProcPointSet -> CmmGraph ->
-                FuelUniqSM (BlockEnv CmmBlock)
-add_CopyOuts protos procPoints g = foldGraphBlocks mb_copy_out (return mapEmpty) g
-    where mb_copy_out :: CmmBlock -> FuelUniqSM (BlockEnv CmmBlock) ->
-                                     FuelUniqSM (BlockEnv CmmBlock)
-          mb_copy_out b z | entryLabel b == g_entry g = skip b z
-          mb_copy_out b z =
-            case lastNode b of
-              CmmCall {}        -> skip b z -- copy out done by callee
-              CmmForeignCall {} -> skip b z -- copy out done by callee
-              _ -> copy_out b z
-          copy_out b z = foldr trySucc init (successors b) >>= finish
-            where init = (\bmap -> (b, bmap)) `liftM` z
-                  trySucc succId z =
-                    if setMember succId procPoints then
-                      case mapLookup succId protos of
-                        Nothing -> z
-                        Just (Protocol c fs _area) -> insert z succId $ copyOutSlot c fs
-                    else z
-                  insert z succId m =
-                    do (b, bmap) <- z
-                       (b, bs)   <- insertBetween b m succId
-                       -- pprTrace "insert for succ" (ppr succId <> ppr m) $ do
-                       return $ (b, foldl (flip insertBlock) bmap bs)
-                  finish (b, bmap) = return $ insertBlock b bmap
-          skip b bs = insertBlock b `liftM` bs
 
 -- At this point, we have found a set of procpoints, each of which should be
 -- the entry point of a procedure.
@@ -384,15 +226,18 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  []   -> graphEnv
                  [id] -> add graphEnv id bid b 
                  _    -> panic "Each block should be reachable from only one ProcPoint"
-             Nothing -> pprPanic "block not reached by a proc point?" (ppr bid)
+             Nothing -> graphEnv
            where bid = entryLabel b
          add graphEnv procId bid b = mapInsert procId graph' graphEnv
                where graph  = mapLookup procId graphEnv `orElse` mapEmpty
                      graph' = mapInsert bid b graph
+
      graphEnv <- return $ foldGraphBlocks addBlock emptyBlockMap g
+
      -- Build a map from proc point BlockId to pairs of:
      --  * Labels for their new procedures
-     --  * Labels for the info tables of their new procedures (only if the proc point is a callPP)
+     --  * Labels for the info tables of their new procedures (only if
+     --    the proc point is a callPP)
      -- Due to common blockification, we may overestimate the set of procpoints.
      let add_label map pp = Map.insert pp lbls map
            where lbls | pp == entry = (entry_label, Just entry_info_lbl)
@@ -401,30 +246,15 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  entry_info_lbl = cit_lbl info_tbl
          procLabels = foldl add_label Map.empty
                             (filter (flip mapMember (toBlockMap g)) (setElems procPoints))
-     -- For each procpoint, we need to know the SP offset on entry.
-     -- If the procpoint is:
-     --  - continuation of a call, the SP offset is in the call
-     --  - otherwise, 0 (and left out of the spEntryMap)
-     let add_sp_off :: CmmBlock -> BlockEnv CmmStackInfo -> BlockEnv CmmStackInfo
-         add_sp_off b env =
-           case lastNode b of
-             CmmCall {cml_cont = Just succ, cml_ret_args = off, cml_ret_off = updfr_off} ->
-               mapInsert succ (StackInfo { arg_space = off, updfr_space = Just updfr_off}) env
-             CmmForeignCall {succ = succ, updfr = updfr_off} ->
-               mapInsert succ (StackInfo { arg_space = wORD_SIZE, updfr_space = Just updfr_off}) env
-             _ -> env
-         spEntryMap = foldGraphBlocks add_sp_off (mapInsert entry stack_info emptyBlockMap) g
-         getStackInfo id = mapLookup id spEntryMap `orElse` StackInfo {arg_space = 0, updfr_space = Nothing}
      -- In each new graph, add blocks jumping off to the new procedures,
      -- and replace branches to procpoints with branches to the jump-off blocks
      let add_jump_block (env, bs) (pp, l) =
            do bid <- liftM mkBlockId getUniqueM
-              let b = blockOfNodeList (JustC (CmmEntry bid), [], JustC jump)
-                  StackInfo {arg_space = argSpace, updfr_space = off} = getStackInfo pp
-                  jump = CmmCall (CmmLit (CmmLabel l)) Nothing argSpace 0
-                                 (off `orElse` 0) -- Jump's shouldn't need the offset...
+              let b = blockJoin (CmmEntry bid) emptyBlock jump
+                  jump = CmmCall (CmmLit (CmmLabel l)) Nothing 0 0 0
               return (mapInsert pp bid env, b : bs)
-         add_jumps (newGraphEnv) (ppId, blockEnv) =
+
+         add_jumps newGraphEnv (ppId, blockEnv) =
            do let needed_jumps = -- find which procpoints we currently branch to
                     mapFold add_if_branch_to_pp [] blockEnv
                   add_if_branch_to_pp :: CmmBlock -> [(BlockId, CLabel)] -> [(BlockId, CLabel)]
@@ -441,17 +271,16 @@ splitAtProcPoints entry_label callPPs procPoints procMap
                  foldM add_jump_block (mapEmpty, []) needed_jumps
                   -- update the entry block
               let b = expectJust "block in env" $ mapLookup ppId blockEnv
-                  off = getStackInfo ppId
                   blockEnv' = mapInsert ppId b blockEnv
                   -- replace branches to procpoints with branches to jumps
                   blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv'
                   -- add the jump blocks to the graph
                   blockEnv''' = foldl (flip insertBlock) blockEnv'' jumpBlocks
-              let g' = (off, ofBlockMap ppId blockEnv''')
+              let g' = ofBlockMap ppId blockEnv'''
               -- pprTrace "g' pre jumps" (ppr g') $ do
               return (mapInsert ppId g' newGraphEnv)
      graphEnv <- foldM add_jumps emptyBlockMap $ mapToList graphEnv
-     let to_proc (bid, (stack_info, g)) = case expectJust "pp label" $ Map.lookup bid procLabels of
+     let to_proc (bid, g) = case expectJust "pp label" $ Map.lookup bid procLabels of
              (lbl, Just info_lbl)
                | bid == entry
                -> CmmProc (TopInfo {info_tbl=info_tbl, stack_info=stack_info})
@@ -462,15 +291,22 @@ splitAtProcPoints entry_label callPPs procPoints procMap
              (lbl, Nothing)
                -> CmmProc (TopInfo {info_tbl=CmmNonInfoTable, stack_info=stack_info})
                           lbl (replacePPIds g)
-         -- References to procpoint IDs can now be replaced with the infotable's label
-         replacePPIds g = mapGraphNodes (id, mapExp repl, mapExp repl) g
+            where
+             stack_info = panic "No StackInfo"
+
+         -- References to procpoint IDs can now be replaced with the
+         -- infotable's label
+         replacePPIds g = {-# SCC "replacePPIds" #-}
+                          mapGraphNodes (id, mapExp repl, mapExp repl) g
            where repl e@(CmmLit (CmmBlock bid)) =
                    case Map.lookup bid procLabels of
                      Just (_, Just info_lbl)  -> CmmLit (CmmLabel info_lbl)
                      _ -> e
                  repl e = e
-     -- The C back end expects to see return continuations before the call sites.
-     -- Here, we sort them in reverse order -- it gets reversed later.
+
+     -- The C back end expects to see return continuations before the
+     -- call sites.  Here, we sort them in reverse order -- it gets
+     -- reversed later.
      let (_, block_order) = foldl add_block_num (0::Int, emptyBlockMap) (postorderDfs g)
          add_block_num (i, map) block = (i+1, mapInsert (entryLabel block) i map)
          sort_fn (bid, _) (bid', _) =
@@ -482,6 +318,27 @@ splitAtProcPoints entry_label callPPs procPoints procMap
             procs
 splitAtProcPoints _ _ _ _ t@(CmmData _ _) = return [t]
 
+
+-- Only called from CmmProcPoint.splitAtProcPoints. NB. does a
+-- recursive lookup, see comment below.
+replaceBranches :: BlockEnv BlockId -> CmmGraph -> CmmGraph
+replaceBranches env cmmg
+  = {-# SCC "replaceBranches" #-}
+    ofBlockMap (g_entry cmmg) $ mapMap f $ toBlockMap cmmg
+  where
+    f block = replaceLastNode block $ last (lastNode block)
+
+    last :: CmmNode O C -> CmmNode O C
+    last (CmmBranch id)          = CmmBranch (lookup id)
+    last (CmmCondBranch e ti fi) = CmmCondBranch e (lookup ti) (lookup fi)
+    last (CmmSwitch e tbl)       = CmmSwitch e (map (fmap lookup) tbl)
+    last l@(CmmCall {})          = l
+    last l@(CmmForeignCall {})   = l
+    lookup id = fmap lookup (mapLookup id env) `orElse` id
+            -- XXX: this is a recursive lookup, it follows chains
+            -- until the lookup returns Nothing, at which point we
+            -- return the last BlockId
+
 ----------------------------------------------------------------
 
 {-
diff --git a/compiler/cmm/CmmRewriteAssignments.hs b/compiler/cmm/CmmRewriteAssignments.hs
index ecf3f7e0c3..2c33b7b5ac 100644
--- a/compiler/cmm/CmmRewriteAssignments.hs
+++ b/compiler/cmm/CmmRewriteAssignments.hs
@@ -27,7 +27,7 @@ import UniqFM
 import Unique
 import BlockId
 
-import Compiler.Hoopl hiding (Unique)
+import Hoopl
 import Data.Maybe
 import Prelude hiding (succ, zip)
 
@@ -404,8 +404,8 @@ clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
 -- ToDo: Also catch MachOp case
 clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
     | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
-clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
-    where f (CmmLoad (CmmStackSlot (CallArea a') o') t)
+clobbers (CmmStackSlot a o, rhs) (_, expr) = f expr
+    where f (CmmLoad (CmmStackSlot a' o') t)
             = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
           f (CmmLoad e _)    = containsStackSlot e
           f (CmmMachOp _ es) = or (map f es)
@@ -416,9 +416,6 @@ clobbers (CmmStackSlot (CallArea a) o, rhs) (_, expr) = f expr
           containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
           containsStackSlot (CmmStackSlot{}) = True
           containsStackSlot _ = False
-clobbers (CmmStackSlot (RegSlot l) _, _) (_, expr) = f expr
-    where f (CmmLoad (CmmStackSlot (RegSlot l') _) _) = l == l'
-          f _ = False
 clobbers _ (_, e) = f e
     where f (CmmLoad (CmmStackSlot _ _) _) = False
           f (CmmLoad{}) = True -- conservative
@@ -432,7 +429,7 @@ clobbers _ (_, e) = f e
 --      [ I32  ]
 --      [    F64     ]
 --      s'   -w'-    o'
-type CallSubArea = (AreaId, Int, Int) -- area, offset, width
+type CallSubArea = (Area, Int, Int) -- area, offset, width
 overlaps :: CallSubArea -> CallSubArea -> Bool
 overlaps (a, _, _) (a', _, _) | a /= a' = False
 overlaps (_, o, w) (_, o', w') =
@@ -457,7 +454,7 @@ invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
 invalidateVolatile k m = mapUFM p m
   where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
             where exp CmmLit{} = True
-                  exp (CmmLoad (CmmStackSlot (CallArea (Young k')) _) _)
+                  exp (CmmLoad (CmmStackSlot (Young k') _) _)
                     | k' == k = False
                   exp (CmmLoad (CmmStackSlot _ _) _) = True
                   exp (CmmMachOp _ es) = and (map exp es)
@@ -596,10 +593,6 @@ assignmentRewrite = mkFRewrite3 first middle last
                           where rep = typeWidth (localRegType r)
               _ -> old
         -- See Note [Soundness of store rewriting]
-        inlineExp assign old@(CmmLoad (CmmStackSlot (RegSlot r) _) _)
-          = case lookupUFM assign r of
-              Just (AlwaysInline x) -> x
-              _ -> old
         inlineExp _ old = old
 
         inlinable :: CmmNode e x -> Bool
diff --git a/compiler/cmm/CmmSpillReload.hs b/compiler/cmm/CmmSpillReload.hs
deleted file mode 100644
index 9e762fe48a..0000000000
--- a/compiler/cmm/CmmSpillReload.hs
+++ /dev/null
@@ -1,166 +0,0 @@
-{-# LANGUAGE GADTs, NoMonoLocalBinds, FlexibleContexts #-}
--- Norman likes local bindings
--- If this module lives on I'd like to get rid of this flag in due course
-
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
-
--- TODO: Get rid of this flag:
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
-module CmmSpillReload
-  ( dualLivenessWithInsertion
-  )
-where
-
-import BlockId
-import Cmm
-import CmmUtils
-import CmmLive
-import OptimizationFuel
-
-import Control.Monad
-import Outputable hiding (empty)
-import qualified Outputable as PP
-import UniqSet
-
-import Compiler.Hoopl hiding (Unique)
-import Data.Maybe
-import Prelude hiding (succ, zip)
-
-{- Note [Overview of spill/reload]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The point of this module is to insert spills and reloads to establish
-the invariant that at a call or any proc point with an established
-protocol all live variables not expected in registers are sitting on the
-stack.  We use a backward dual liveness analysis (both traditional
-register liveness as well as register slot liveness on the stack) to
-insert spills and reloads.  It should be followed by a forward
-transformation to sink reloads as deeply as possible, so as to reduce
-register pressure: this transformation is performed by
-CmmRewriteAssignments.
-
-A variable can be expected to be live in a register, live on the
-stack, or both.  This analysis ensures that spills and reloads are
-inserted as needed to make sure that every live variable needed
-after a call is available on the stack.  Spills are placed immediately
-after their reaching definitions, but reloads are placed immediately
-after a return from a call (the entry point.)
-
-Note that we offer no guarantees about the consistency of the value
-in memory and the value in the register, except that they are
-equal across calls/procpoints.  If the variable is changed, this
-mapping breaks: but as the original value of the register may still
-be useful in a different context, the memory location is not updated.
--}
-
-data DualLive = DualLive { on_stack :: RegSet, in_regs :: RegSet }
-
-changeStack, changeRegs :: (RegSet -> RegSet) -> DualLive -> DualLive
-changeStack f live = live { on_stack = f (on_stack live) }
-changeRegs  f live = live { in_regs  = f (in_regs  live) }
-
-dualLiveLattice :: DataflowLattice DualLive
-dualLiveLattice = DataflowLattice "variables live in registers and on stack" empty add
-    where empty = DualLive emptyRegSet emptyRegSet
-          add _ (OldFact old) (NewFact new) = (changeIf $ change1 || change2, DualLive stack regs)
-            where (change1, stack) = add1 (on_stack old) (on_stack new)
-                  (change2, regs)  = add1 (in_regs old)  (in_regs new)
-          add1 old new = if sizeUniqSet join > sizeUniqSet old then (True, join) else (False, old)
-            where join = unionUniqSets old new
-
-dualLivenessWithInsertion :: BlockSet -> CmmGraph -> FuelUniqSM CmmGraph
-dualLivenessWithInsertion procPoints g =
-  liftM fst $ dataflowPassBwd g [] $ analRewBwd dualLiveLattice
-                                                (dualLiveTransfers (g_entry g) procPoints)
-                                                (insertSpillsAndReloads g procPoints)
-
--- Note [Live registers on entry to procpoints]
--- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- Remember that the transfer function is only ever run on the rewritten
--- version of a graph, and the rewrite function for spills and reloads
--- enforces the invariant that no local registers are live on entry to
--- a procpoint.  Accordingly, we check for this invariant here.  An old
--- version of this code incorrectly claimed that any live registers were
--- live on the stack before entering the function: this is wrong, but
--- didn't cause bugs because it never actually was invoked.
-
-dualLiveTransfers :: BlockId -> BlockSet -> (BwdTransfer CmmNode DualLive)
-dualLiveTransfers entry procPoints = mkBTransfer3 first middle last
-    where first :: CmmNode C O -> DualLive -> DualLive
-          first (CmmEntry id) live -- See Note [Live registers on entry to procpoints]
-            | id == entry || setMember id procPoints = noLiveOnEntry id (in_regs live) live
-            | otherwise                              = live
-
-          middle :: CmmNode O O -> DualLive -> DualLive
-          middle m = changeStack updSlots
-                   . changeRegs  updRegs
-            where -- Reuse middle of liveness analysis from CmmLive
-                  updRegs = case getBTransfer3 xferLive of (_, middle, _) -> middle m
-
-                  updSlots live = foldSlotsUsed reload (foldSlotsDefd spill live m) m
-                  spill  live s@(RegSlot r, _, _) = check s $ deleteFromRegSet live r
-                  spill  live _ = live
-                  reload live s@(RegSlot r, _, _) = check s $ extendRegSet live r
-                  reload live _ = live
-                  -- Ensure the assignment refers to the entirety of the
-                  -- register slot (and not just a slice).
-                  check (RegSlot (LocalReg _ ty), o, w) x
-                     | o == w && w == widthInBytes (typeWidth ty) = x
-                  check _ _ = panic "dualLiveTransfers: slices unsupported"
-
-          -- Register analysis is identical to liveness analysis from CmmLive.
-          last :: CmmNode O C -> FactBase DualLive -> DualLive
-          last l fb = changeRegs (gen_kill l) $ case l of
-            CmmCall {cml_cont=Nothing} -> empty
-            CmmCall {cml_cont=Just k}  -> keep_stack_only k
-            CmmForeignCall {succ=k}    -> keep_stack_only k
-            _                          -> joinOutFacts dualLiveLattice l fb
-            where empty = fact_bot dualLiveLattice
-                  lkp k = fromMaybe empty (lookupFact k fb)
-                  keep_stack_only k = DualLive (on_stack (lkp k)) emptyRegSet
-
-insertSpillsAndReloads :: CmmGraph -> BlockSet -> CmmBwdRewrite DualLive
-insertSpillsAndReloads graph procPoints = deepBwdRw3 first middle nothing
-  -- Beware: deepBwdRw with one polymorphic function seems more reasonable here,
-  -- but GHC miscompiles it, see bug #4044.
-    where first :: CmmNode C O -> Fact O DualLive -> CmmReplGraph C O
-          first e@(CmmEntry id) live = return $
-            if id /= (g_entry graph) && setMember id procPoints then
-              case map reload (uniqSetToList (in_regs live)) of
-                [] -> Nothing
-                is -> Just $ mkFirst e <*> mkMiddles is
-            else Nothing
-          -- EZY: There was some dead code for handling the case where
-          -- we were not splitting procedures.  Check Git history if
-          -- you're interested (circa e26ea0f41).
-
-          middle :: CmmNode O O -> Fact O DualLive -> CmmReplGraph O O
-          -- Don't add spills next to reloads.
-          middle (CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg') _) _)) _ | reg == reg' = return Nothing
-          -- Spill if register is live on stack.
-          middle m@(CmmAssign (CmmLocal reg) _) live
-            | reg `elemRegSet` on_stack live = return (Just (mkMiddles [m, spill reg]))
-          middle _ _ = return Nothing
-
-          nothing _ _ = return Nothing
-
-spill, reload :: LocalReg -> CmmNode O O
-spill  r = CmmStore  (regSlot r) (CmmReg $ CmmLocal r)
-reload r = CmmAssign (CmmLocal r) (CmmLoad (regSlot r) $ localRegType r)
-
----------------------
--- prettyprinting
-
-ppr_regs :: String -> RegSet -> SDoc
-ppr_regs s regs = text s <+> commafy (map ppr $ uniqSetToList regs)
-  where commafy xs = hsep $ punctuate comma xs
-
-instance Outputable DualLive where
-  ppr (DualLive {in_regs = regs, on_stack = stack}) =
-      if isEmptyUniqSet regs && isEmptyUniqSet stack then
-          text "<nothing-live>"
-      else
-          nest 2 $ fsep [if isEmptyUniqSet regs then PP.empty
-                         else (ppr_regs "live in regs =" regs),
-                         if isEmptyUniqSet stack then PP.empty
-                         else (ppr_regs "live on stack =" stack)]
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index 0756c87583..d831a8aba5 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -60,13 +60,14 @@ module CmmUtils(
         -- * Operations that probably don't belong here
         modifyGraph,
 
-        lastNode, replaceLastNode, insertBetween,
+        lastNode, replaceLastNode,
         ofBlockMap, toBlockMap, insertBlock,
         ofBlockList, toBlockList, bodyToBlockList,
         foldGraphBlocks, mapGraphNodes, postorderDfs, mapGraphNodes1,
       
         analFwd, analBwd, analRewFwd, analRewBwd,
-        dataflowPassFwd, dataflowPassBwd
+        dataflowPassFwd, dataflowPassBwd, dataflowAnalFwd, dataflowAnalBwd,
+        dataflowAnalFwdBlocks
   ) where
 
 #include "HsVersions.h"
@@ -89,7 +90,7 @@ import Data.Word
 import Data.Maybe
 import Data.Bits
 import Control.Monad
-import Compiler.Hoopl hiding ( Unique )
+import Hoopl
 
 ---------------------------------------------------
 --
@@ -402,13 +403,13 @@ mkLiveness (reg:regs)
 modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n'
 modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)}
 
-toBlockMap :: CmmGraph -> LabelMap CmmBlock
+toBlockMap :: CmmGraph -> BlockEnv CmmBlock
 toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body
 
-ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph
+ofBlockMap :: BlockId -> BlockEnv CmmBlock -> CmmGraph
 ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO}
 
-insertBlock :: CmmBlock -> LabelMap CmmBlock -> LabelMap CmmBlock
+insertBlock :: CmmBlock -> BlockEnv CmmBlock -> BlockEnv CmmBlock
 insertBlock block map =
   ASSERT (isNothing $ mapLookup id map)
   mapInsert id block map
@@ -418,7 +419,8 @@ toBlockList :: CmmGraph -> [CmmBlock]
 toBlockList g = mapElems $ toBlockMap g
 
 ofBlockList :: BlockId -> [CmmBlock] -> CmmGraph
-ofBlockList entry blocks = CmmGraph {g_entry=entry, g_graph=GMany NothingO body NothingO}
+ofBlockList entry blocks = CmmGraph { g_entry = entry
+                                    , g_graph = GMany NothingO body NothingO }
   where body = foldr addBlock emptyBody blocks
 
 bodyToBlockList :: Body CmmNode -> [CmmBlock]
@@ -439,87 +441,67 @@ foldGraphBlocks :: (CmmBlock -> a -> a) -> a -> CmmGraph -> a
 foldGraphBlocks k z g = mapFold k z $ toBlockMap g
 
 postorderDfs :: CmmGraph -> [CmmBlock]
-postorderDfs g = postorder_dfs_from (toBlockMap g) (g_entry g)
-
--------------------------------------------------
--- Manipulating CmmBlocks
-
-lastNode :: CmmBlock -> CmmNode O C
-lastNode block = foldBlockNodesF3 (nothing, nothing, const) block ()
-  where nothing :: a -> b -> ()
-        nothing _ _ = ()
-
-replaceLastNode :: Block CmmNode e C -> CmmNode O C -> Block CmmNode e C
-replaceLastNode block last = blockOfNodeList (first, middle, JustC last)
-  where (first, middle, _) = blockToNodeList block
-
-----------------------------------------------------------------------
------ Splicing between blocks
--- Given a middle node, a block, and a successor BlockId,
--- we can insert the middle node between the block and the successor.
--- We return the updated block and a list of new blocks that must be added
--- to the graph.
--- The semantics is a bit tricky. We consider cases on the last node:
--- o For a branch, we can just insert before the branch,
---   but sometimes the optimizer does better if we actually insert
---   a fresh basic block, enabling some common blockification.
--- o For a conditional branch, switch statement, or call, we must insert
---   a new basic block.
--- o For a jump or return, this operation is impossible.
-
-insertBetween :: MonadUnique m => CmmBlock -> [CmmNode O O] -> BlockId -> m (CmmBlock, [CmmBlock])
-insertBetween b ms succId = insert $ lastNode b
-  where insert :: MonadUnique m => CmmNode O C -> m (CmmBlock, [CmmBlock])
-        insert (CmmBranch bid) =
-          if bid == succId then
-            do (bid', bs) <- newBlocks
-               return (replaceLastNode b (CmmBranch bid'), bs)
-          else panic "tried invalid block insertBetween"
-        insert (CmmCondBranch c t f) =
-          do (t', tbs) <- if t == succId then newBlocks else return $ (t, [])
-             (f', fbs) <- if f == succId then newBlocks else return $ (f, [])
-             return (replaceLastNode b (CmmCondBranch c t' f'), tbs ++ fbs)
-        insert (CmmSwitch e ks) =
-          do (ids, bs) <- mapAndUnzipM mbNewBlocks ks
-             return (replaceLastNode b (CmmSwitch e ids), join bs)
-        insert (CmmCall {}) =
-          panic "unimp: insertBetween after a call -- probably not a good idea"
-        insert (CmmForeignCall {}) =
-          panic "unimp: insertBetween after a foreign call -- probably not a good idea"
-
-        newBlocks :: MonadUnique m => m (BlockId, [CmmBlock])
-        newBlocks = do id <- liftM mkBlockId $ getUniqueM
-                       return $ (id, [blockOfNodeList (JustC (CmmEntry id), ms, JustC (CmmBranch succId))])
-        mbNewBlocks :: MonadUnique m => Maybe BlockId -> m (Maybe BlockId, [CmmBlock])
-        mbNewBlocks (Just k) = if k == succId then liftM fstJust newBlocks
-                               else return (Just k, [])
-        mbNewBlocks Nothing  = return (Nothing, [])
-        fstJust (id, bs) = (Just id, bs)
+postorderDfs g = {-# SCC "postorderDfs" #-} postorder_dfs_from (toBlockMap g) (g_entry g)
 
 -------------------------------------------------
 -- Running dataflow analysis and/or rewrites
 
 -- Constructing forward and backward analysis-only pass
-analFwd    :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdPass m n f
-analBwd    :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdPass m n f
+analFwd    :: DataflowLattice f -> FwdTransfer n f -> FwdPass FuelUniqSM n f
+analBwd    :: DataflowLattice f -> BwdTransfer n f -> BwdPass FuelUniqSM n f
 
 analFwd lat xfer = analRewFwd lat xfer noFwdRewrite
 analBwd lat xfer = analRewBwd lat xfer noBwdRewrite
 
 -- Constructing forward and backward analysis + rewrite pass
-analRewFwd :: Monad m => DataflowLattice f -> FwdTransfer n f -> FwdRewrite m n f -> FwdPass m n f
-analRewBwd :: Monad m => DataflowLattice f -> BwdTransfer n f -> BwdRewrite m n f -> BwdPass m n f
+analRewFwd :: DataflowLattice f -> FwdTransfer n f
+           -> FwdRewrite FuelUniqSM n f
+           -> FwdPass FuelUniqSM n f
+
+analRewBwd :: DataflowLattice f
+           -> BwdTransfer n f
+           -> BwdRewrite FuelUniqSM n f
+           -> BwdPass FuelUniqSM n f
 
 analRewFwd lat xfer rew = FwdPass {fp_lattice = lat, fp_transfer = xfer, fp_rewrite = rew}
 analRewBwd lat xfer rew = BwdPass {bp_lattice = lat, bp_transfer = xfer, bp_rewrite = rew}
 
 -- Running forward and backward dataflow analysis + optional rewrite
-dataflowPassFwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> FwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowPassFwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass FuelUniqSM n f
+                -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
 
-dataflowPassBwd :: NonLocal n => GenCmmGraph n -> [(BlockId, f)] -> BwdPass FuelUniqSM n f -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
+dataflowAnalFwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass FuelUniqSM n f
+                -> BlockEnv f
+dataflowAnalFwd (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd =
+  analyzeFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+
+dataflowAnalFwdBlocks :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> FwdPass FuelUniqSM n f
+                -> FuelUniqSM (BlockEnv f)
+dataflowAnalFwdBlocks (CmmGraph {g_entry=entry, g_graph=graph}) facts fwd = do
+--  (graph, facts, NothingO) <- analyzeAndRewriteFwd fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts)
+--  return facts
+  return (analyzeFwdBlocks fwd (JustC [entry]) graph (mkFactBase (fp_lattice fwd) facts))
+
+dataflowAnalBwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> BwdPass FuelUniqSM n f
+                -> BlockEnv f
+dataflowAnalBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd =
+  analyzeBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
+
+dataflowPassBwd :: NonLocal n =>
+                   GenCmmGraph n -> [(BlockId, f)]
+                -> BwdPass FuelUniqSM n f
+                -> FuelUniqSM (GenCmmGraph n, BlockEnv f)
 dataflowPassBwd (CmmGraph {g_entry=entry, g_graph=graph}) facts bwd = do
   (graph, facts, NothingO) <- analyzeAndRewriteBwd bwd (JustC [entry]) graph (mkFactBase (bp_lattice bwd) facts)
   return (CmmGraph {g_entry=entry, g_graph=graph}, facts)
diff --git a/compiler/cmm/Hoopl.hs b/compiler/cmm/Hoopl.hs
new file mode 100644
index 0000000000..404482e047
--- /dev/null
+++ b/compiler/cmm/Hoopl.hs
@@ -0,0 +1,124 @@
+module Hoopl (
+    module Compiler.Hoopl,
+    module Hoopl.Dataflow,
+    deepBwdRw3, deepBwdRw,
+    thenFwdRw
+  ) where
+
+import Compiler.Hoopl hiding
+  ( Unique,
+    FwdTransfer(..), FwdRewrite(..), FwdPass(..),
+    BwdTransfer(..), BwdRewrite(..), BwdPass(..),
+    noFwdRewrite, noBwdRewrite,
+--    analyzeAndRewriteFwd, analyzeAndRewriteBwd,
+    mkFactBase, Fact,
+    mkBRewrite, mkBRewrite3, mkBTransfer, mkBTransfer3,
+    mkFRewrite, mkFRewrite3, mkFTransfer, mkFTransfer3,
+    deepFwdRw, deepFwdRw3, thenFwdRw, iterFwdRw,
+    deepBwdRw, deepBwdRw3, thenBwdRw, iterBwdRw
+  )
+
+import Hoopl.Dataflow
+import OptimizationFuel
+import Control.Monad
+
+deepFwdRw3 :: (n C O -> f -> FuelUniqSM (Maybe (Graph n C O)))
+           -> (n O O -> f -> FuelUniqSM (Maybe (Graph n O O)))
+           -> (n O C -> f -> FuelUniqSM (Maybe (Graph n O C)))
+           -> (FwdRewrite FuelUniqSM n f)
+deepFwdRw :: (forall e x . n e x -> f -> FuelUniqSM (Maybe (Graph n e x))) -> FwdRewrite FuelUniqSM n f
+deepFwdRw3 f m l = iterFwdRw $ mkFRewrite3 f m l
+deepFwdRw f = deepFwdRw3 f f f
+
+-- N.B. rw3, rw3', and rw3a are triples of functions.
+-- But rw and rw' are single functions.
+thenFwdRw :: forall n f.
+             FwdRewrite FuelUniqSM n f
+          -> FwdRewrite FuelUniqSM n f 
+          -> FwdRewrite FuelUniqSM n f
+thenFwdRw rw3 rw3' = wrapFR2 thenrw rw3 rw3'
+ where
+  thenrw :: forall e x t t1.
+               (t -> t1 -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f)))
+            -> (t -> t1 -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f)))
+            -> t
+            -> t1
+            -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f))
+  thenrw rw rw' n f = rw n f >>= fwdRes
+     where fwdRes Nothing   = rw' n f
+           fwdRes (Just gr) = return $ Just $ fadd_rw rw3' gr
+
+iterFwdRw :: forall m n f.
+             FwdRewrite FuelUniqSM n f
+          -> FwdRewrite FuelUniqSM n f
+iterFwdRw rw3 = wrapFR iter rw3
+ where iter :: forall a e x t.
+               (t -> a -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f)))
+               -> t
+               -> a
+               -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f))
+       iter rw n = (liftM $ liftM $ fadd_rw (iterFwdRw rw3)) . rw n
+
+-- | Function inspired by 'rew' in the paper
+_frewrite_cps :: ((Graph n e x, FwdRewrite FuelUniqSM n f) -> FuelUniqSM a)
+             -> FuelUniqSM a
+             -> (forall e x . n e x -> f -> FuelUniqSM (Maybe (Graph n e x, FwdRewrite FuelUniqSM n f)))
+             -> n e x
+             -> f
+             -> FuelUniqSM a
+_frewrite_cps j n rw node f =
+    do mg <- rw node f
+       case mg of Nothing -> n
+                  Just gr -> j gr
+
+
+
+-- | Function inspired by 'add' in the paper
+fadd_rw :: FwdRewrite FuelUniqSM n f
+        -> (Graph n e x, FwdRewrite FuelUniqSM n f)
+        -> (Graph n e x, FwdRewrite FuelUniqSM n f)
+fadd_rw rw2 (g, rw1) = (g, rw1 `thenFwdRw` rw2)
+
+
+
+deepBwdRw3 ::
+              (n C O -> f          -> FuelUniqSM (Maybe (Graph n C O)))
+           -> (n O O -> f          -> FuelUniqSM (Maybe (Graph n O O)))
+           -> (n O C -> FactBase f -> FuelUniqSM (Maybe (Graph n O C)))
+           -> (BwdRewrite FuelUniqSM n f)
+deepBwdRw  :: (forall e x . n e x -> Fact x f -> FuelUniqSM (Maybe (Graph n e x)))
+           -> BwdRewrite FuelUniqSM n f
+deepBwdRw3 f m l = iterBwdRw $ mkBRewrite3 f m l
+deepBwdRw  f = deepBwdRw3 f f f
+
+
+thenBwdRw :: forall n f. BwdRewrite FuelUniqSM n f -> BwdRewrite FuelUniqSM n f -> BwdRewrite FuelUniqSM n f
+thenBwdRw rw1 rw2 = wrapBR2 f rw1 rw2
+  where f :: forall t t1 t2 e x.
+             t
+             -> (t1 -> t2 -> FuelUniqSM (Maybe (Graph n e x, BwdRewrite FuelUniqSM n f)))
+             -> (t1 -> t2 -> FuelUniqSM (Maybe (Graph n e x, BwdRewrite FuelUniqSM n f)))
+             -> t1
+             -> t2
+             -> FuelUniqSM (Maybe (Graph n e x, BwdRewrite FuelUniqSM n f))
+        f _ rw1 rw2' n f = do
+          res1 <- rw1 n f
+          case res1 of
+            Nothing -> rw2' n f
+            Just gr -> return $ Just $ badd_rw rw2 gr
+
+iterBwdRw :: forall n f. BwdRewrite FuelUniqSM n f -> BwdRewrite FuelUniqSM n f
+iterBwdRw rw = wrapBR f rw
+  where f :: forall t e x t1 t2.
+             t
+             -> (t1 -> t2 -> FuelUniqSM (Maybe (Graph n e x, BwdRewrite FuelUniqSM n f)))
+             -> t1
+             -> t2
+             -> FuelUniqSM (Maybe (Graph n e x, BwdRewrite FuelUniqSM n f))
+        f _ rw' n f = liftM (liftM (badd_rw (iterBwdRw rw))) (rw' n f)
+
+-- | Function inspired by 'add' in the paper
+badd_rw :: BwdRewrite FuelUniqSM n f
+        -> (Graph n e x, BwdRewrite FuelUniqSM n f)
+        -> (Graph n e x, BwdRewrite FuelUniqSM n f)
+badd_rw rw2 (g, rw1) = (g, rw1 `thenBwdRw` rw2)
diff --git a/compiler/cmm/Hoopl/Dataflow.hs b/compiler/cmm/Hoopl/Dataflow.hs
new file mode 100644
index 0000000000..cdab2cd2fe
--- /dev/null
+++ b/compiler/cmm/Hoopl/Dataflow.hs
@@ -0,0 +1,890 @@
+{-# LANGUAGE RankNTypes, ScopedTypeVariables, GADTs, EmptyDataDecls, PatternGuards, TypeFamilies, MultiParamTypeClasses #-}
+#if __GLASGOW_HASKELL__ >= 703
+{-# OPTIONS_GHC -fprof-auto-top #-}
+#endif
+#if __GLASGOW_HASKELL__ >= 701
+{-# LANGUAGE Trustworthy #-}
+#endif
+
+module Hoopl.Dataflow
+  ( DataflowLattice(..), OldFact(..), NewFact(..), Fact, mkFactBase
+  , ChangeFlag(..)
+  , FwdPass(..), FwdTransfer, mkFTransfer, mkFTransfer3, getFTransfer3
+  -- * Respecting Fuel
+
+  -- $fuel
+  , FwdRewrite,  mkFRewrite,  mkFRewrite3,  getFRewrite3, noFwdRewrite
+  , wrapFR, wrapFR2
+  , BwdPass(..), BwdTransfer, mkBTransfer, mkBTransfer3, getBTransfer3
+  , wrapBR, wrapBR2
+  , BwdRewrite,  mkBRewrite,  mkBRewrite3,  getBRewrite3, noBwdRewrite
+  , analyzeAndRewriteFwd,  analyzeAndRewriteBwd
+  , analyzeFwd, analyzeFwdBlocks, analyzeBwd
+  )
+where
+
+import OptimizationFuel
+
+import Data.Maybe
+import Data.Array
+
+import Compiler.Hoopl.Collections
+import Compiler.Hoopl.Fuel
+import Compiler.Hoopl.Graph hiding (Graph) -- hiding so we can redefine
+                                           -- and include definition in paper
+import qualified Compiler.Hoopl.GraphUtil as U
+import Compiler.Hoopl.Label
+import Compiler.Hoopl.Dataflow (JoinFun)
+import Compiler.Hoopl.Util
+
+import Compiler.Hoopl.Dataflow (
+    DataflowLattice(..), OldFact(..), NewFact(..), Fact
+  , ChangeFlag(..), mkFactBase
+  , FwdPass(..), FwdRewrite(..), FwdTransfer(..), mkFRewrite,  getFRewrite3, mkFTransfer, mkFTransfer3
+  , wrapFR, wrapFR2
+  , BwdPass(..), BwdRewrite(..),  BwdTransfer(..), mkBTransfer, mkBTransfer3, getBTransfer3
+  , wrapBR, wrapBR2
+  , mkBRewrite,  getBRewrite3
+  )
+
+-- import Debug.Trace
+
+noRewrite :: a -> b -> FuelUniqSM (Maybe c)
+noRewrite _ _ = return Nothing
+
+noFwdRewrite :: FwdRewrite FuelUniqSM n f
+noFwdRewrite = FwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+-- | Functions passed to 'mkFRewrite3' should not be aware of the fuel supply.
+-- The result returned by 'mkFRewrite3' respects fuel.
+mkFRewrite3 :: forall n f.
+               (n C O -> f -> FuelUniqSM (Maybe (Graph n C O)))
+            -> (n O O -> f -> FuelUniqSM (Maybe (Graph n O O)))
+            -> (n O C -> f -> FuelUniqSM (Maybe (Graph n O C)))
+            -> FwdRewrite FuelUniqSM n f
+mkFRewrite3 f m l = FwdRewrite3 (lift f, lift m, lift l)
+  where lift :: forall t t1 a. (t -> t1 -> FuelUniqSM (Maybe a))
+                             -> t -> t1 -> FuelUniqSM (Maybe (a, FwdRewrite FuelUniqSM n f))
+        {-# INLINE lift #-}
+        lift rw node fact = do
+             a <- rw node fact
+             case a of
+               Nothing -> return Nothing
+               Just a  -> do f <- getFuel
+                             if f == 0
+                                then return Nothing
+                                else setFuel (f-1) >> return (Just (a,noFwdRewrite))
+
+noBwdRewrite :: BwdRewrite FuelUniqSM n f
+noBwdRewrite = BwdRewrite3 (noRewrite, noRewrite, noRewrite)
+
+mkBRewrite3 :: forall n f.
+               (n C O -> f          -> FuelUniqSM (Maybe (Graph n C O)))
+            -> (n O O -> f          -> FuelUniqSM (Maybe (Graph n O O)))
+            -> (n O C -> FactBase f -> FuelUniqSM (Maybe (Graph n O C)))
+            -> BwdRewrite FuelUniqSM n f
+mkBRewrite3 f m l = BwdRewrite3 (lift f, lift m, lift l)
+  where lift :: forall t t1 a. (t -> t1 -> FuelUniqSM (Maybe a))
+                             -> t -> t1 -> FuelUniqSM (Maybe (a, BwdRewrite FuelUniqSM n f))
+        {-# INLINE lift #-}
+        lift rw node fact = do
+             a <- rw node fact
+             case a of
+               Nothing -> return Nothing
+               Just a  -> do f <- getFuel
+                             if f == 0
+                                then return Nothing
+                                else setFuel (f-1) >> return (Just (a,noBwdRewrite))
+
+-----------------------------------------------------------------------------
+--              Analyze and rewrite forward: the interface
+-----------------------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeAndRewriteFwd
+   :: forall n f e x .  NonLocal n =>
+      FwdPass FuelUniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n  e x -> Fact e f
+   -> FuelUniqSM (Graph n e x, FactBase f, MaybeO x f)
+analyzeAndRewriteFwd pass entries g f =
+  do (rg, fout) <- arfGraph pass (fmap targetLabels entries) g f
+     let (g', fb) = normalizeGraph rg
+     return (g', fb, distinguishedExitFact g' fout)
+
+distinguishedExitFact :: forall n e x f . Graph n e x -> Fact x f -> MaybeO x f
+distinguishedExitFact g f = maybe g
+    where maybe :: Graph n e x -> MaybeO x f
+          maybe GNil       = JustO f
+          maybe (GUnit {}) = JustO f
+          maybe (GMany _ _ x) = case x of NothingO -> NothingO
+                                          JustO _  -> JustO f
+
+----------------------------------------------------------------
+--       Forward Implementation
+----------------------------------------------------------------
+
+type Entries e = MaybeC e [Label]
+
+arfGraph :: forall n f e x .  NonLocal n =>
+            FwdPass FuelUniqSM n f ->
+            Entries e -> Graph n e x -> Fact e f -> FuelUniqSM (DG f n e x, Fact x f)
+arfGraph pass@FwdPass { fp_lattice = lattice,
+                        fp_transfer = transfer,
+                        fp_rewrite  = rewrite } entries g in_fact = graph g in_fact
+  where
+    {- nested type synonyms would be so lovely here 
+    type ARF  thing = forall e x . thing e x -> f        -> m (DG f n e x, Fact x f)
+    type ARFX thing = forall e x . thing e x -> Fact e f -> m (DG f n e x, Fact x f)
+    -}
+    graph ::              Graph n e x -> Fact e f -> FuelUniqSM (DG f n e x, Fact x f)
+    block :: forall e x .
+             Block n e x -> f -> FuelUniqSM (DG f n e x, Fact x f)
+
+    body  :: [Label] -> LabelMap (Block n C C)
+          -> Fact C f -> FuelUniqSM (DG f n C C, Fact C f)
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+
+    cat :: forall e a x f1 f2 f3.
+           (f1 -> FuelUniqSM (DG f n e a, f2))
+        -> (f2 -> FuelUniqSM (DG f n a x, f3))
+        -> (f1 -> FuelUniqSM (DG f n e x, f3))
+
+    graph GNil            f = return (dgnil, f)
+    graph (GUnit blk)     f = block blk f
+    graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+     where
+      ebcat :: MaybeO e (Block n O C) -> Body n -> Fact e f -> FuelUniqSM (DG f n e C, Fact C f)
+      exit  :: MaybeO x (Block n C O)           -> Fact C f -> FuelUniqSM (DG f n C x, Fact x f)
+      exit (JustO blk) f = arfx block blk f
+      exit NothingO    f = return (dgnilC, f)
+      ebcat entry bdy f = c entries entry f
+       where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+                -> Fact e f -> FuelUniqSM (DG f n e C, Fact C f)
+             c NothingC (JustO entry)   f = (block entry `cat` body (successors entry) bdy) f
+             c (JustC entries) NothingO f = body entries bdy f
+             c _ _ _ = error "bogus GADT pattern match failure"
+
+    -- Lift from nodes to blocks
+    block BNil            f = return (dgnil, f)
+    block (BlockCO n b)   f = (node n `cat` block b) f
+    block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+    block (BlockOC   b n) f =              (block b `cat` node n) f
+
+    block (BMiddle n)     f = node n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BHead h n)     f = (block h  `cat` node n) f
+    block (BTail n t)     f = (node  n  `cat` block t) f
+
+    {-# INLINE node #-}
+    node :: forall e x . (ShapeLifter e x)
+         => n e x -> f -> FuelUniqSM (DG f n e x, Fact x f)
+    node n f
+     = do { grw <- frewrite rewrite n f
+          ; case grw of
+              Nothing -> return ( singletonDG f n
+                                , ftransfer transfer n f )
+              Just (g, rw) ->
+                  let pass' = pass { fp_rewrite = rw }
+                      f'    = fwdEntryFact n f
+                  in  arfGraph pass' (fwdEntryLabel n) g f' }
+
+    -- | Compose fact transformers and concatenate the resulting
+    -- rewritten graphs.
+    {-# INLINE cat #-} 
+    cat ft1 ft2 f = do { (g1,f1) <- ft1 f
+                       ; (g2,f2) <- ft2 f1
+                       ; let !g = g1 `dgSplice` g2
+                       ; return (g, f2) }
+
+    arfx :: forall x .
+            (Block n C x ->        f -> FuelUniqSM (DG f n C x, Fact x f))
+         -> (Block n C x -> Fact C f -> FuelUniqSM (DG f n C x, Fact x f))
+    arfx arf thing fb = 
+      arf thing $ fromJust $ lookupFact (entryLabel thing) $ joinInFacts lattice fb
+     -- joinInFacts adds debugging information
+
+
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+    body entries blockmap init_fbase
+      = fixpoint Fwd lattice do_block entries blockmap init_fbase
+      where
+        lattice = fp_lattice pass
+        do_block :: forall x . Block n C x -> FactBase f
+                 -> FuelUniqSM (DG f n C x, Fact x f)
+        do_block b fb = block b entryFact
+          where entryFact = getFact lattice (entryLabel b) fb
+
+
+-- Join all the incoming facts with bottom.
+-- We know the results _shouldn't change_, but the transfer
+-- functions might, for example, generate some debugging traces.
+joinInFacts :: DataflowLattice f -> FactBase f -> FactBase f
+joinInFacts (lattice @ DataflowLattice {fact_bot = bot, fact_join = fj}) fb =
+  mkFactBase lattice $ map botJoin $ mapToList fb
+    where botJoin (l, f) = (l, snd $ fj l (OldFact bot) (NewFact f))
+
+forwardBlockList :: (NonLocal n)
+                 => [Label] -> Body n -> [Block n C C]
+-- This produces a list of blocks in order suitable for forward analysis,
+-- along with the list of Labels it may depend on for facts.
+forwardBlockList entries blks = postorder_dfs_from blks entries
+
+----------------------------------------------------------------
+--       Forward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeFwd
+   :: forall n f e .  NonLocal n =>
+      FwdPass FuelUniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact e f
+   -> FactBase f
+analyzeFwd FwdPass { fp_lattice = lattice,
+                     fp_transfer = FwdTransfer3 (ftr, mtr, ltr) }
+  entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact e f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> block entry `cat` body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Fwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+           do_block b fb = block b entryFact
+             where entryFact = getFact lattice (entryLabel b) fb
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> f -> Fact x f
+    block BNil            f = f
+    block (BlockCO n b)   f = (ftr n `cat`  block b) f
+    block (BlockCC l b n) f = (ftr l `cat` (block b `cat` ltr n)) f
+    block (BlockOC   b n) f =              (block b `cat` ltr n) f
+
+    block (BMiddle n)     f = mtr n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BHead h n)     f = (block h  `cat` mtr n) f
+    block (BTail n t)     f = (mtr  n   `cat` block t) f
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeFwdBlocks
+   :: forall n f e .  NonLocal n =>
+      FwdPass FuelUniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact e f
+   -> FactBase f
+analyzeFwdBlocks FwdPass { fp_lattice = lattice,
+                           fp_transfer = FwdTransfer3 (ftr, _, ltr) }
+  entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact e f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> block entry `cat` body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Fwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> FactBase f -> Fact x f
+           do_block b fb = block b entryFact
+             where entryFact = getFact lattice (entryLabel b) fb
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> f -> Fact x f
+    block BNil            f = f
+    block (BlockCO n _)   f = ftr n f
+    block (BlockCC l _ n) f = (ftr l `cat` ltr n) f
+    block (BlockOC   _ n) f = ltr n f
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f1 -> f2) -> (f2 -> f3) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft2 $! ft1 f
+
+----------------------------------------------------------------
+--       Backward Analysis only
+----------------------------------------------------------------
+
+-- | if the graph being analyzed is open at the entry, there must
+--   be no other entry point, or all goes horribly wrong...
+analyzeBwd
+   :: forall n f e .  NonLocal n =>
+      BwdPass FuelUniqSM n f
+   -> MaybeC e [Label]
+   -> Graph n e C -> Fact C f
+   -> FactBase f
+analyzeBwd BwdPass { bp_lattice = lattice,
+                     bp_transfer = BwdTransfer3 (ftr, mtr, ltr) }
+   entries g in_fact = graph g in_fact
+  where
+    graph :: Graph n e C -> Fact C f -> FactBase f
+    graph (GMany entry blockmap NothingO)
+      = case (entries, entry) of
+         (NothingC, JustO entry)   -> body (successors entry)
+         (JustC entries, NothingO) -> body entries
+         _ -> error "bogus GADT pattern match failure"
+     where
+       body  :: [Label] -> Fact C f -> Fact C f
+       body entries f
+         = fixpointAnal Bwd lattice do_block entries blockmap f
+         where
+           do_block :: forall x . Block n C x -> Fact x f -> FactBase f
+           do_block b fb = mapSingleton (entryLabel b) (block b fb)
+
+    -- NB. eta-expand block, GHC can't do this by itself.  See #5809.
+    block :: forall e x . Block n e x -> Fact x f -> f
+    block BNil            f = f
+    block (BlockCO n b)   f = (ftr n `cat`  block b) f
+    block (BlockCC l b n) f = ((ftr l `cat` block b) `cat` ltr n) f
+    block (BlockOC   b n) f =              (block b `cat` ltr n) f
+
+    block (BMiddle n)     f = mtr n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BHead h n)     f = (block h  `cat` mtr n) f
+    block (BTail n t)     f = (mtr  n   `cat` block t) f
+
+    {-# INLINE cat #-}
+    cat :: forall f1 f2 f3 . (f2 -> f3) -> (f1 -> f2) -> (f1 -> f3)
+    cat ft1 ft2 = \f -> ft1 $! ft2 f
+
+-----------------------------------------------------------------------------
+--              Backward analysis and rewriting: the interface
+-----------------------------------------------------------------------------
+
+
+-- | if the graph being analyzed is open at the exit, I don't
+--   quite understand the implications of possible other exits
+analyzeAndRewriteBwd
+   :: NonLocal n
+   => BwdPass FuelUniqSM n f
+   -> MaybeC e [Label] -> Graph n e x -> Fact x f
+   -> FuelUniqSM (Graph n e x, FactBase f, MaybeO e f)
+analyzeAndRewriteBwd pass entries g f =
+  do (rg, fout) <- arbGraph pass (fmap targetLabels entries) g f
+     let (g', fb) = normalizeGraph rg
+     return (g', fb, distinguishedEntryFact g' fout)
+
+distinguishedEntryFact :: forall n e x f . Graph n e x -> Fact e f -> MaybeO e f
+distinguishedEntryFact g f = maybe g
+    where maybe :: Graph n e x -> MaybeO e f
+          maybe GNil       = JustO f
+          maybe (GUnit {}) = JustO f
+          maybe (GMany e _ _) = case e of NothingO -> NothingO
+                                          JustO _  -> JustO f
+
+
+-----------------------------------------------------------------------------
+--              Backward implementation
+-----------------------------------------------------------------------------
+
+arbGraph :: forall n f e x .
+            NonLocal n =>
+            BwdPass FuelUniqSM n f ->
+            Entries e -> Graph n e x -> Fact x f -> FuelUniqSM (DG f n e x, Fact e f)
+arbGraph pass@BwdPass { bp_lattice  = lattice,
+                        bp_transfer = transfer,
+                        bp_rewrite  = rewrite } entries g in_fact = graph g in_fact
+  where
+    {- nested type synonyms would be so lovely here 
+    type ARB  thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, f)
+    type ARBX thing = forall e x . thing e x -> Fact x f -> m (DG f n e x, Fact e f)
+    -}
+    graph ::              Graph n e x -> Fact x f -> FuelUniqSM (DG f n e x, Fact e f)
+    block :: forall e x . Block n e x -> Fact x f -> FuelUniqSM (DG f n e x, f)
+    body  :: [Label] -> Body n -> Fact C f -> FuelUniqSM (DG f n C C, Fact C f)
+    node  :: forall e x . (ShapeLifter e x) 
+             => n e x       -> Fact x f -> FuelUniqSM (DG f n e x, f)
+    cat :: forall e a x info info' info''.
+           (info' -> FuelUniqSM (DG f n e a, info''))
+        -> (info  -> FuelUniqSM (DG f n a x, info'))
+        -> (info  -> FuelUniqSM (DG f n e x, info''))
+
+    graph GNil            f = return (dgnil, f)
+    graph (GUnit blk)     f = block blk f
+    graph (GMany e bdy x) f = ((e `ebcat` bdy) `cat` exit x) f
+     where
+      ebcat :: MaybeO e (Block n O C) -> Body n -> Fact C f -> FuelUniqSM (DG f n e C, Fact e f)
+      exit  :: MaybeO x (Block n C O)           -> Fact x f -> FuelUniqSM (DG f n C x, Fact C f)
+      exit (JustO blk) f = arbx block blk f
+      exit NothingO    f = return (dgnilC, f)
+      ebcat entry bdy f = c entries entry f
+       where c :: MaybeC e [Label] -> MaybeO e (Block n O C)
+                -> Fact C f -> FuelUniqSM (DG f n e C, Fact e f)
+             c NothingC (JustO entry)   f = (block entry `cat` body (successors entry) bdy) f
+             c (JustC entries) NothingO f = body entries bdy f
+             c _ _ _ = error "bogus GADT pattern match failure"
+
+    -- Lift from nodes to blocks
+    block BNil            f = return (dgnil, f)
+    block (BlockCO n b)   f = (node n `cat` block b) f
+    block (BlockCC l b n) f = (node l `cat` block b `cat` node n) f
+    block (BlockOC   b n) f =              (block b `cat` node n) f
+
+    block (BMiddle n)     f = node n f
+    block (BCat b1 b2)    f = (block b1 `cat` block b2) f
+    block (BHead h n)     f = (block h  `cat` node n) f
+    block (BTail n t)     f = (node  n  `cat` block t) f
+
+    {-# INLINE node #-}
+    node n f
+      = do { bwdres <- brewrite rewrite n f
+           ; case bwdres of
+               Nothing -> return (singletonDG entry_f n, entry_f)
+                            where entry_f = btransfer transfer n f
+               Just (g, rw) ->
+                          do { let pass' = pass { bp_rewrite = rw }
+                             ; (g, f) <- arbGraph pass' (fwdEntryLabel n) g f
+                             ; return (g, bwdEntryFact lattice n f)} }
+
+    -- | Compose fact transformers and concatenate the resulting
+    -- rewritten graphs.
+    {-# INLINE cat #-} 
+    cat ft1 ft2 f = do { (g2,f2) <- ft2 f
+                       ; (g1,f1) <- ft1 f2
+                       ; let !g = g1 `dgSplice` g2
+                       ; return (g, f1) }
+
+    arbx :: forall x .
+            (Block n C x -> Fact x f -> FuelUniqSM (DG f n C x, f))
+         -> (Block n C x -> Fact x f -> FuelUniqSM (DG f n C x, Fact C f))
+
+    arbx arb thing f = do { (rg, f) <- arb thing f
+                          ; let fb = joinInFacts (bp_lattice pass) $
+                                     mapSingleton (entryLabel thing) f
+                          ; return (rg, fb) }
+     -- joinInFacts adds debugging information
+
+                    -- Outgoing factbase is restricted to Labels *not* in
+                    -- in the Body; the facts for Labels *in*
+                    -- the Body are in the 'DG f n C C'
+    body entries blockmap init_fbase
+      = fixpoint Bwd (bp_lattice pass) do_block entries blockmap init_fbase
+      where
+        do_block :: forall x. Block n C x -> Fact x f -> FuelUniqSM (DG f n C x, LabelMap f)
+        do_block b f = do (g, f) <- block b f
+                          return (g, mapSingleton (entryLabel b) f)
+
+
+{-
+
+The forward and backward cases are not dual.  In the forward case, the
+entry points are known, and one simply traverses the body blocks from
+those points.  In the backward case, something is known about the exit
+points, but this information is essentially useless, because we don't
+actually have a dual graph (that is, one with edges reversed) to
+compute with.  (Even if we did have a dual graph, it would not avail
+us---a backward analysis must include reachable blocks that don't
+reach the exit, as in a procedure that loops forever and has side
+effects.)
+
+-}
+
+-----------------------------------------------------------------------------
+--      fixpoint
+-----------------------------------------------------------------------------
+
+data Direction = Fwd | Bwd
+
+-- | fixpointing for analysis-only
+--
+fixpointAnal :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> Fact C f)
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> Fact C f -> FactBase f
+
+fixpointAnal direction DataflowLattice{ fact_bot = bot, fact_join = join }
+              do_block entries blockmap init_fbase
+  = loop start init_fbase
+  where
+    blocks     = sortBlocks direction entries blockmap
+    n          = length blocks
+    block_arr  = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+    start      = {-# SCC "start" #-} [0..n-1]
+    dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+    loop
+       :: IntHeap      -- blocks still to analyse
+       -> FactBase f  -- current factbase (increases monotonically)
+       -> FactBase f
+
+    loop []        fbase = fbase
+    loop (ix:todo) fbase =
+           let
+               blk = block_arr ! ix
+
+               out_facts = {-# SCC "do_block" #-} do_block blk fbase
+
+               !(todo', fbase') = {-# SCC "mapFoldWithKey" #-}
+                     mapFoldWithKey (updateFact join dep_blocks)
+                                    (todo,fbase) out_facts
+           in
+           -- trace ("analysing: " ++ show (entryLabel blk)) $
+           -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+           -- trace ("changed: " ++ show changed) $ return ()
+           -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+           loop todo' fbase'
+
+
+-- | fixpointing for combined analysis/rewriting
+--
+fixpoint :: forall n f. NonLocal n
+ => Direction
+ -> DataflowLattice f
+ -> (Block n C C -> Fact C f -> FuelUniqSM (DG f n C C, Fact C f))
+ -> [Label]
+ -> LabelMap (Block n C C)
+ -> (Fact C f -> FuelUniqSM (DG f n C C, Fact C f))
+
+fixpoint direction DataflowLattice{ fact_bot = bot, fact_join = join }
+         do_block entries blockmap init_fbase
+  = do
+        -- trace ("fixpoint: " ++ show (case direction of Fwd -> True; Bwd -> False) ++ " " ++ show (mapKeys blockmap) ++ show entries ++ " " ++ show (mapKeys init_fbase)) $ return()
+        (fbase, newblocks) <- loop start init_fbase mapEmpty
+        -- trace ("fixpoint DONE: " ++ show (mapKeys fbase) ++ show (mapKeys newblocks)) $ return()
+        return (GMany NothingO newblocks NothingO,
+                mapDeleteList (mapKeys blockmap) fbase)
+    -- The successors of the Graph are the the Labels
+    -- for which we have facts and which are *not* in
+    -- the blocks of the graph
+  where
+    blocks     = sortBlocks direction entries blockmap
+    n          = length blocks
+    block_arr  = {-# SCC "block_arr" #-} listArray (0,n-1) blocks
+    start      = {-# SCC "start" #-} [0..n-1]
+    dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+
+    loop
+       :: IntHeap
+       -> FactBase f  -- current factbase (increases monotonically)
+       -> LabelMap (DBlock f n C C)  -- transformed graph
+       -> FuelUniqSM (FactBase f, LabelMap (DBlock f n C C))
+
+    loop [] fbase newblocks = return (fbase, newblocks)
+    loop (ix:todo) fbase !newblocks = do
+           let blk = block_arr ! ix
+
+           -- trace ("analysing: " ++ show (entryLabel blk)) $ return ()
+           (rg, out_facts) <- do_block blk fbase
+           let !(todo', fbase') =
+                  mapFoldWithKey (updateFact join dep_blocks)
+                                 (todo,fbase) out_facts
+
+           -- trace ("fbase': " ++ show (mapKeys fbase')) $ return ()
+           -- trace ("changed: " ++ show changed) $ return ()
+           -- trace ("to analyse: " ++ show to_analyse) $ return ()
+
+           let newblocks' = case rg of
+                              GMany _ blks _ -> mapUnion blks newblocks
+     
+           loop todo' fbase' newblocks'
+
+
+{-  Note [TxFactBase invariants]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The TxFactBase is used only during a fixpoint iteration (or "sweep"),
+and accumulates facts (and the transformed code) during the fixpoint
+iteration.
+
+* tfb_fbase increases monotonically, across all sweeps
+
+* At the beginning of each sweep
+      tfb_cha  = NoChange
+      tfb_lbls = {}
+
+* During each sweep we process each block in turn.  Processing a block
+  is done thus:
+    1.  Read from tfb_fbase the facts for its entry label (forward)
+        or successors labels (backward)
+    2.  Transform those facts into new facts for its successors (forward)
+        or entry label (backward)
+    3.  Augment tfb_fbase with that info
+  We call the labels read in step (1) the "in-labels" of the sweep
+
+* The field tfb_lbls is the set of in-labels of all blocks that have
+  been processed so far this sweep, including the block that is
+  currently being processed.  tfb_lbls is initialised to {}.  It is a
+  subset of the Labels of the *original* (not transformed) blocks.
+
+* The tfb_cha field is set to SomeChange iff we decide we need to
+  perform another iteration of the fixpoint loop. It is initialsed to NoChange.
+
+  Specifically, we set tfb_cha to SomeChange in step (3) iff
+    (a) The fact in tfb_fbase for a block L changes
+    (b) L is in tfb_lbls
+  Reason: until a label enters the in-labels its accumuated fact in tfb_fbase
+  has not been read, hence cannot affect the outcome
+
+Note [Unreachable blocks]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+A block that is not in the domain of tfb_fbase is "currently unreachable".
+A currently-unreachable block is not even analyzed.  Reason: consider 
+constant prop and this graph, with entry point L1:
+  L1: x:=3; goto L4
+  L2: x:=4; goto L4
+  L4: if x>3 goto L2 else goto L5
+Here L2 is actually unreachable, but if we process it with bottom input fact,
+we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4.
+
+* If a currently-unreachable block is not analyzed, then its rewritten
+  graph will not be accumulated in tfb_rg.  And that is good:
+  unreachable blocks simply do not appear in the output.
+
+* Note that clients must be careful to provide a fact (even if bottom)
+  for each entry point. Otherwise useful blocks may be garbage collected.
+
+* Note that updateFact must set the change-flag if a label goes from
+  not-in-fbase to in-fbase, even if its fact is bottom.  In effect the
+  real fact lattice is
+       UNR
+       bottom
+       the points above bottom
+
+* Even if the fact is going from UNR to bottom, we still call the
+  client's fact_join function because it might give the client
+  some useful debugging information.
+
+* All of this only applies for *forward* ixpoints.  For the backward
+  case we must treat every block as reachable; it might finish with a
+  'return', and therefore have no successors, for example.
+-}
+
+
+-----------------------------------------------------------------------------
+--  Pieces that are shared by fixpoint and fixpoint_anal
+-----------------------------------------------------------------------------
+
+-- | Sort the blocks into the right order for analysis.
+sortBlocks :: NonLocal n => Direction -> [Label] -> LabelMap (Block n C C)
+           -> [Block n C C]
+sortBlocks direction entries blockmap
+   = case direction of Fwd -> fwd
+                       Bwd -> reverse fwd
+  where fwd = forwardBlockList entries blockmap
+
+-- | construct a mapping from L -> block indices.  If the fact for L
+-- changes, re-analyse the given blocks.
+mkDepBlocks :: NonLocal n => Direction -> [Block n C C] -> LabelMap [Int]
+mkDepBlocks Fwd blocks = go blocks 0 mapEmpty
+  where go []     !_  m = m
+        go (b:bs) !n m = go bs (n+1) $! mapInsert (entryLabel b) [n] m
+mkDepBlocks Bwd blocks = go blocks 0 mapEmpty
+  where go []     !_ m = m
+        go (b:bs) !n m = go bs (n+1) $! go' (successors b) m
+            where go' [] m = m
+                  go' (l:ls) m = go' ls (mapInsertWith (++) l [n] m)
+
+
+-- | After some new facts have been generated by analysing a block, we
+-- fold this function over them to generate (a) a list of block
+-- indices to (re-)analyse, and (b) the new FactBase.
+--
+updateFact :: JoinFun f -> LabelMap [Int]
+           -> Label -> f       -- out fact
+           -> (IntHeap, FactBase f)
+           -> (IntHeap, FactBase f)
+
+updateFact fact_join dep_blocks lbl new_fact (todo, fbase)
+  = case lookupFact lbl fbase of
+      Nothing       -> let !z = mapInsert lbl new_fact fbase in (changed, z)
+                           -- Note [no old fact]
+      Just old_fact ->
+        case fact_join lbl (OldFact old_fact) (NewFact new_fact) of
+          (NoChange, _) -> (todo, fbase)
+          (_,        f) -> let !z = mapInsert lbl f fbase in (changed, z)
+  where
+     changed = foldr insertIntHeap todo $
+                 mapFindWithDefault [] lbl dep_blocks
+
+{-
+Note [no old fact]
+
+We know that the new_fact is >= _|_, so we don't need to join.  However,
+if the new fact is also _|_, and we have already analysed its block,
+we don't need to record a change.  So there's a tradeoff here.  It turns
+out that always recording a change is faster.
+-}
+
+-----------------------------------------------------------------------------
+--      DG: an internal data type for 'decorated graphs'
+--          TOTALLY internal to Hoopl; each block is decorated with a fact
+-----------------------------------------------------------------------------
+
+type Graph = Graph' Block
+type DG f  = Graph' (DBlock f)
+data DBlock f n e x = DBlock f (Block n e x) -- ^ block decorated with fact
+
+instance NonLocal n => NonLocal (DBlock f n) where
+  entryLabel (DBlock _ b) = entryLabel b
+  successors (DBlock _ b) = successors b
+
+--- constructors
+
+dgnil  :: DG f n O O
+dgnilC :: DG f n C C
+dgSplice  :: NonLocal n => DG f n e a -> DG f n a x -> DG f n e x
+
+---- observers
+
+normalizeGraph :: forall n f e x .
+                  NonLocal n => DG f n e x
+               -> (Graph n e x, FactBase f)
+                 -- A Graph together with the facts for that graph
+                 -- The domains of the two maps should be identical
+
+normalizeGraph g = (graphMapBlocks dropFact g, facts g)
+    where dropFact :: DBlock t t1 t2 t3 -> Block t1 t2 t3
+          dropFact (DBlock _ b) = b
+          facts :: DG f n e x -> FactBase f
+          facts GNil = noFacts
+          facts (GUnit _) = noFacts
+          facts (GMany _ body exit) = bodyFacts body `mapUnion` exitFacts exit
+          exitFacts :: MaybeO x (DBlock f n C O) -> FactBase f
+          exitFacts NothingO = noFacts
+          exitFacts (JustO (DBlock f b)) = mapSingleton (entryLabel b) f
+          bodyFacts :: LabelMap (DBlock f n C C) -> FactBase f
+          bodyFacts body = mapFoldWithKey f noFacts body
+            where f :: forall t a x. (NonLocal t) => Label -> DBlock a t C x -> LabelMap a -> LabelMap a
+                  f lbl (DBlock f _) fb = mapInsert lbl f fb
+
+--- implementation of the constructors (boring)
+
+dgnil  = GNil
+dgnilC = GMany NothingO emptyBody NothingO
+
+dgSplice = U.splice fzCat
+  where fzCat :: DBlock f n e O -> DBlock t n O x -> DBlock f n e x
+        fzCat (DBlock f b1) (DBlock _ b2) = DBlock f $! b1 `U.cat` b2
+        -- NB. strictness, this function is hammered.
+
+----------------------------------------------------------------
+--       Utilities
+----------------------------------------------------------------
+
+-- Lifting based on shape:
+--  - from nodes to blocks
+--  - from facts to fact-like things
+-- Lowering back:
+--  - from fact-like things to facts
+-- Note that the latter two functions depend only on the entry shape.
+class ShapeLifter e x where
+ singletonDG   :: f -> n e x -> DG f n e x
+ fwdEntryFact  :: NonLocal n => n e x -> f -> Fact e f
+ fwdEntryLabel :: NonLocal n => n e x -> MaybeC e [Label]
+ ftransfer :: FwdTransfer n f -> n e x -> f -> Fact x f
+ frewrite  :: FwdRewrite m n f -> n e x
+           -> f -> m (Maybe (Graph n e x, FwdRewrite m n f))
+-- @ end node.tex
+ bwdEntryFact :: NonLocal n => DataflowLattice f -> n e x -> Fact e f -> f
+ btransfer    :: BwdTransfer n f -> n e x -> Fact x f -> f
+ brewrite     :: BwdRewrite m n f -> n e x
+              -> Fact x f -> m (Maybe (Graph n e x, BwdRewrite m n f))
+
+instance ShapeLifter C O where
+  singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+  fwdEntryFact     n f  = mapSingleton (entryLabel n) f
+  bwdEntryFact lat n fb = getFact lat (entryLabel n) fb
+  ftransfer (FwdTransfer3 (ft, _, _)) n f = ft n f
+  btransfer (BwdTransfer3 (bt, _, _)) n f = bt n f
+  frewrite  (FwdRewrite3  (fr, _, _)) n f = fr n f
+  brewrite  (BwdRewrite3  (br, _, _)) n f = br n f
+  fwdEntryLabel n = JustC [entryLabel n]
+
+instance ShapeLifter O O where
+  singletonDG f = gUnitOO . DBlock f . BMiddle
+  fwdEntryFact   _ f = f
+  bwdEntryFact _ _ f = f
+  ftransfer (FwdTransfer3 (_, ft, _)) n f = ft n f
+  btransfer (BwdTransfer3 (_, bt, _)) n f = bt n f
+  frewrite  (FwdRewrite3  (_, fr, _)) n f = fr n f
+  brewrite  (BwdRewrite3  (_, br, _)) n f = br n f
+  fwdEntryLabel _ = NothingC
+
+instance ShapeLifter O C where
+  singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+  fwdEntryFact   _ f = f
+  bwdEntryFact _ _ f = f
+  ftransfer (FwdTransfer3 (_, _, ft)) n f = ft n f
+  btransfer (BwdTransfer3 (_, _, bt)) n f = bt n f
+  frewrite  (FwdRewrite3  (_, _, fr)) n f = fr n f
+  brewrite  (BwdRewrite3  (_, _, br)) n f = br n f
+  fwdEntryLabel _ = NothingC
+
+{-
+class ShapeLifter e x where
+  singletonDG   :: f -> n e x -> DG f n e x
+
+instance ShapeLifter C O where
+  singletonDG f n = gUnitCO (DBlock f (BlockCO n BNil))
+
+instance ShapeLifter O O where
+  singletonDG f = gUnitOO . DBlock f . BMiddle
+
+instance ShapeLifter O C where
+  singletonDG f n = gUnitOC (DBlock f (BlockOC BNil n))
+-}
+
+-- Fact lookup: the fact `orelse` bottom
+getFact  :: DataflowLattice f -> Label -> FactBase f -> f
+getFact lat l fb = case lookupFact l fb of Just  f -> f
+                                           Nothing -> fact_bot lat
+
+
+
+{-  Note [Respects fuel]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-}
+-- $fuel
+-- A value of type 'FwdRewrite' or 'BwdRewrite' /respects fuel/ if 
+-- any function contained within the value satisfies the following properties:
+--
+--   * When fuel is exhausted, it always returns 'Nothing'.
+--
+--   * When it returns @Just g rw@, it consumes /exactly/ one unit
+--     of fuel, and new rewrite 'rw' also respects fuel.
+--
+-- Provided that functions passed to 'mkFRewrite', 'mkFRewrite3', 
+-- 'mkBRewrite', and 'mkBRewrite3' are not aware of the fuel supply,
+-- the results respect fuel.
+--
+-- It is an /unchecked/ run-time error for the argument passed to 'wrapFR',
+-- 'wrapFR2', 'wrapBR', or 'warpBR2' to return a function that does not respect fuel.
+
+-- -----------------------------------------------------------------------------
+-- a Heap of Int
+
+-- We should really use a proper Heap here, but my attempts to make
+-- one have not succeeded in beating the simple ordered list.  Another
+-- alternative is IntSet (using deleteFindMin), but that was also
+-- slower than the ordered list in my experiments --SDM 25/1/2012
+
+type IntHeap = [Int] -- ordered
+
+insertIntHeap :: Int -> [Int] -> [Int]
+insertIntHeap x [] = [x]
+insertIntHeap x (y:ys)
+  | x < y     = x : y : ys
+  | x == y    = x : ys
+  | otherwise = y : insertIntHeap x ys
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 04586b1029..797b785de2 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -1,29 +1,19 @@
 {-# LANGUAGE GADTs #-}
 
--- ToDo: remove -fno-warn-warnings-deprecations
-{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}
--- ToDo: remove -fno-warn-incomplete-patterns
-{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
-
--- Module for building CmmAGraphs.
-
--- As the CmmAGraph is a wrapper over Graph CmmNode O x, it is different
--- from Hoopl's AGraph. The current clients expect functions with the
--- same names Hoopl uses, so this module cannot be in the same namespace
--- as Compiler.Hoopl.
-
 module MkGraph
-  ( CmmAGraph
-  , emptyAGraph, (<*>), catAGraphs, outOfLine
-  , mkLabel, mkMiddle, mkLast
-  , withFreshLabel, withUnique, lgraphOfAGraph, labelAGraph
+  ( CmmAGraph, CgStmt(..)
+  , (<*>), catAGraphs
+  , mkLabel, mkMiddle, mkLast, outOfLine
+  , lgraphOfAGraph, labelAGraph
 
   , stackStubExpr
-  , mkNop, mkAssign, mkStore, mkCall, mkCmmCall, mkSafeCall, mkUnsafeCall, mkFinalCall
-         , mkJump, mkDirectJump, mkForeignJump, mkJumpGC, mkCbranch, mkSwitch
-         , mkReturn, mkReturnSimple, mkComment, mkCallEntry
-         , mkBranch, mkCmmIfThenElse, mkCmmIfThen, mkCmmWhileDo
-         , copyInOflow, copyInSlot, copyOutOflow, copyOutSlot
+  , mkNop, mkAssign, mkStore, mkUnsafeCall, mkFinalCall, mkCallReturnsTo
+  , mkJump, mkDirectJump, mkForeignJump, mkForeignJumpExtra, mkJumpGC
+  , mkCbranch, mkSwitch
+  , mkReturn, mkReturnSimple, mkComment, mkCallEntry, mkBranch
+  , copyInOflow, copyOutOflow
+  , noExtraStack
+  , toCall, Transfer(..)
   )
 where
 
@@ -31,250 +21,232 @@ import BlockId
 import Cmm
 import CmmCallConv (assignArgumentsPos, ParamLocation(..))
 
+
 import Compiler.Hoopl hiding (Unique, (<*>), mkFirst, mkMiddle, mkLast, mkLabel, mkBranch, Shape(..))
-import qualified Compiler.Hoopl as H
-import Compiler.Hoopl.GHC (uniqueToLbl)
 import FastString
 import ForeignCall
 import Outputable
 import Prelude hiding (succ)
 import SMRep (ByteOff)
-import StaticFlags
-import Unique
 import UniqSupply
-import Util
+import OrdList
 
 #include "HsVersions.h"
 
-{-
-A 'CmmAGraph' is an abstract version of a 'Graph CmmNode O x' from module
-'Cmm'.  The difference is that the 'CmmAGraph' can be eigher open of closed at
-exit and it can supply fresh Labels and Uniques.
-
-It also supports a splicing operation <*>, which is different from the Hoopl's
-<*>, because it splices two CmmAGraphs. Specifically, it can splice Graph
-O C and Graph O x. In this case, the open beginning of the second graph is
-thrown away.  In the debug mode this sequence is checked to be empty or
-containing a branch (see note [Branch follows branch]).
-
-When an CmmAGraph open at exit is being converted to a CmmGraph, the output
-exit sequence is considered unreachable. If the graph consist of one block
-only, if it not the case and we crash. Otherwise we just throw the exit
-sequence away (and in debug mode we test that it really was unreachable).
--}
-
-{-
-Node [Branch follows branch]
-============================
-Why do we say it's ok for a Branch to follow a Branch?
-Because the standard constructor mkLabel has fall-through
-semantics. So if you do a mkLabel, you finish the current block,
-giving it a label, and start a new one that branches to that label.
-Emitting a Branch at this point is fine:
-       goto L1; L2: ...stuff...
--}
-
-data CmmGraphOC = Opened (Graph CmmNode O O)
-                | Closed (Graph CmmNode O C)
-type CmmAGraph = UniqSM CmmGraphOC     -- Graph open at entry
-
-{-
-MS: I began with
-  newtype CmmAGraph = forall x. AG (UniqSM (Graph CmmNode O x))
-but that does not work well, because we cannot take the graph
-out of the monad -- we do not know the type of what we would take
-out and pattern matching does not help, as we cannot pattern match
-on a graph inside the monad.
--}
 
-data Transfer = Call | Jump | Ret deriving Eq
+-----------------------------------------------------------------------------
+-- Building Graphs
+
+
+-- | CmmAGraph is a chunk of code consisting of:
+--
+--   * ordinary statements (assignments, stores etc.)
+--   * jumps
+--   * labels
+--   * out-of-line labelled blocks
+--
+-- The semantics is that control falls through labels and out-of-line
+-- blocks.  Everything after a jump up to the next label is by
+-- definition unreachable code, and will be discarded.
+--
+-- Two CmmAGraphs can be stuck together with <*>, with the meaning that
+-- control flows from the first to the second.
+--
+-- A 'CmmAGraph' can be turned into a 'CmmGraph' (closed at both ends)
+-- by providing a label for the entry point; see 'labelAGraph'.
+--
+type CmmAGraph = OrdList CgStmt
+
+data CgStmt
+  = CgLabel BlockId
+  | CgStmt  (CmmNode O O)
+  | CgLast  (CmmNode O C)
+  | CgFork  BlockId CmmAGraph
+
+flattenCmmAGraph :: BlockId -> CmmAGraph -> CmmGraph
+flattenCmmAGraph id stmts =
+    CmmGraph { g_entry = id,
+               g_graph = GMany NothingO body NothingO }
+  where
+  (block, blocks) = flatten (fromOL stmts)
+  entry = blockJoinHead (CmmEntry id) block
+  body = foldr addBlock emptyBody (entry:blocks)
+
+  flatten :: [CgStmt] -> (Block CmmNode O C, [Block CmmNode C C])
+  flatten [] = panic "flatten []"
+
+  -- A label at the end of a function or fork: this label must not be reachable,
+  -- but it might be referred to from another BB that also isn't reachable.
+  -- Eliminating these has to be done with a dead-code analysis.  For now,
+  -- we just make it into a well-formed block by adding a recursive jump.
+  flatten [CgLabel id]
+    = (goto_id, [blockJoinHead (CmmEntry id) goto_id] )
+    where goto_id = blockJoinTail emptyBlock (CmmBranch id)
+
+  -- A jump/branch: throw away all the code up to the next label, because
+  -- it is unreachable.  Be careful to keep forks that we find on the way.
+  flatten (CgLast stmt : stmts)
+    = case dropWhile isOrdinaryStmt stmts of
+        [] ->
+            ( sing, [] )
+        [CgLabel id] ->
+            ( sing, [blockJoin (CmmEntry id) emptyBlock (CmmBranch id)] )
+        (CgLabel id : stmts) ->
+            ( sing, blockJoinHead (CmmEntry id) block : blocks )
+            where (block,blocks) = flatten stmts
+        (CgFork fork_id stmts : ss) -> 
+            flatten (CgFork fork_id stmts : CgLast stmt : ss)
+        _ -> panic "MkGraph.flatten"
+    where
+      sing = blockJoinTail emptyBlock stmt
+
+  flatten (s:ss) = 
+        case s of
+          CgStmt stmt -> (blockCons stmt block, blocks)
+          CgLabel id  -> (blockJoinTail emptyBlock (CmmBranch id),
+                          blockJoinHead (CmmEntry id) block : blocks)
+          CgFork fork_id stmts -> 
+                (block, blockJoinHead (CmmEntry fork_id) fork_block : fork_blocks ++ blocks)
+                where (fork_block, fork_blocks) = flatten (fromOL stmts)
+          _ -> panic "MkGraph.flatten"
+    where (block,blocks) = flatten ss
+
+isOrdinaryStmt :: CgStmt -> Bool
+isOrdinaryStmt (CgStmt _) = True
+isOrdinaryStmt (CgLast _) = True
+isOrdinaryStmt _          = False
+
+
 
 ---------- AGraph manipulation
 
-emptyAGraph    :: CmmAGraph
 (<*>)          :: CmmAGraph -> CmmAGraph -> CmmAGraph
+(<*>)           = appOL
+
 catAGraphs     :: [CmmAGraph] -> CmmAGraph
+catAGraphs      = concatOL
+
+-- | created a sequence "goto id; id:" as an AGraph
+mkLabel        :: BlockId -> CmmAGraph
+mkLabel bid     = unitOL (CgLabel bid)
+
+-- | creates an open AGraph from a given node
+mkMiddle        :: CmmNode O O -> CmmAGraph
+mkMiddle middle = unitOL (CgStmt middle)
 
-mkLabel        :: BlockId     -> CmmAGraph  -- created a sequence "goto id; id:" as an AGraph
-mkMiddle       :: CmmNode O O -> CmmAGraph  -- creates an open AGraph from a given node
-mkLast         :: CmmNode O C -> CmmAGraph  -- created a closed AGraph from a given node
+-- | created a closed AGraph from a given node
+mkLast         :: CmmNode O C -> CmmAGraph
+mkLast last     = unitOL (CgLast last)
 
-withFreshLabel :: String -> (BlockId -> CmmAGraph) -> CmmAGraph
-withUnique     :: (Unique -> CmmAGraph) -> CmmAGraph
+-- | A labelled code block; should end in a last node
+outOfLine      :: BlockId -> CmmAGraph -> CmmAGraph
+outOfLine l g   = unitOL (CgFork l g)
 
+-- | allocate a fresh label for the entry point
 lgraphOfAGraph :: CmmAGraph -> UniqSM CmmGraph
-  -- ^ allocate a fresh label for the entry point
+lgraphOfAGraph g = do u <- getUniqueM
+                      return (flattenCmmAGraph (mkBlockId u) g)
+
+-- | use the given BlockId as the label of the entry point
 labelAGraph    :: BlockId -> CmmAGraph -> UniqSM CmmGraph
-  -- ^ use the given BlockId as the label of the entry point
+labelAGraph lbl ag = return (flattenCmmAGraph lbl ag)
 
 ---------- No-ops
 mkNop        :: CmmAGraph
+mkNop         = nilOL
+
 mkComment    :: FastString -> CmmAGraph
+#ifdef DEBUG
+-- SDM: generating all those comments takes time, this saved about 4% for me
+mkComment fs  = mkMiddle $ CmmComment fs
+#else
+mkComment _   = nilOL
+#endif
 
 ---------- Assignment and store
 mkAssign     :: CmmReg  -> CmmExpr -> CmmAGraph
-mkStore      :: CmmExpr -> CmmExpr -> CmmAGraph
+mkAssign l r  = mkMiddle $ CmmAssign l r
 
----------- Calls
-mkCall       :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual] ->
-                  UpdFrameOffset -> CmmAGraph
-mkCmmCall    :: CmmExpr ->              [CmmFormal] -> [CmmActual] ->
-                  UpdFrameOffset -> CmmAGraph
-  -- Native C-- calling convention
-mkSafeCall    :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset -> Bool -> CmmAGraph
-mkUnsafeCall  :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
-mkFinalCall   :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-  -- Never returns; like exit() or barf()
+mkStore      :: CmmExpr -> CmmExpr -> CmmAGraph
+mkStore  l r  = mkMiddle $ CmmStore  l r
 
 ---------- Control transfer
-mkJump          ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkDirectJump    ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkJumpGC        ::               CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkForeignJump   :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
-mkCbranch       :: CmmExpr -> BlockId -> BlockId          -> CmmAGraph
-mkSwitch        :: CmmExpr -> [Maybe BlockId]             -> CmmAGraph
+mkJump          :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJump e actuals updfr_off =
+  lastWithArgs Jump Old NativeNodeCall actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkDirectJump    :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkDirectJump e actuals updfr_off =
+  lastWithArgs Jump Old NativeDirectCall actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkJumpGC        :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkJumpGC e actuals updfr_off =
+  lastWithArgs Jump Old GC actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
+mkForeignJump   :: Convention -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+                -> CmmAGraph
+mkForeignJump conv e actuals updfr_off =
+  mkForeignJumpExtra conv e actuals updfr_off noExtraStack
+
+mkForeignJumpExtra :: Convention -> CmmExpr -> [CmmActual]
+                -> UpdFrameOffset -> (ByteOff, [(CmmExpr, ByteOff)])
+                -> CmmAGraph
+mkForeignJumpExtra conv e actuals updfr_off extra_stack =
+  lastWithArgsAndExtraStack Jump Old conv actuals updfr_off extra_stack $
+    toCall e Nothing updfr_off 0
+
+mkCbranch       :: CmmExpr -> BlockId -> BlockId -> CmmAGraph
+mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
+
+mkSwitch        :: CmmExpr -> [Maybe BlockId] -> CmmAGraph
+mkSwitch e tbl   = mkLast $ CmmSwitch e tbl
+
 mkReturn        :: CmmExpr -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturn e actuals updfr_off =
+  lastWithArgs Ret  Old NativeReturn actuals updfr_off $
+    toCall e Nothing updfr_off 0
+
 mkReturnSimple  :: [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturnSimple actuals updfr_off =
+  mkReturn e actuals updfr_off
+  where e = CmmLoad (CmmStackSlot Old updfr_off) gcWord
 
 mkBranch        :: BlockId -> CmmAGraph
-mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> CmmAGraph
-mkCmmIfThen     :: CmmExpr -> CmmAGraph -> CmmAGraph
-mkCmmWhileDo    :: CmmExpr -> CmmAGraph -> CmmAGraph
-
-outOfLine       :: CmmAGraph -> CmmAGraph
--- ^ The argument is an CmmAGraph that must have an
--- empty entry sequence and be closed at the end.
--- The result is a new CmmAGraph that is open at the
--- end and goes directly from entry to exit, with the
--- original graph sitting to the side out-of-line.
---
--- Example:  mkMiddle (x = 3)
---           <*> outOfLine (mkLabel L <*> ...stuff...)
---           <*> mkMiddle (y = x)
--- Control will flow directly from x=3 to y=x;
--- the block starting with L is "on the side".
---
--- N.B. algebraically forall g g' : g <*> outOfLine g' == outOfLine g' <*> g
+mkBranch bid     = mkLast (CmmBranch bid)
+
+mkFinalCall   :: CmmExpr -> CCallConv -> [CmmActual] -> UpdFrameOffset
+              -> CmmAGraph
+mkFinalCall f _ actuals updfr_off =
+  lastWithArgs Call Old NativeDirectCall actuals updfr_off $
+    toCall f Nothing updfr_off 0
+
+mkCallReturnsTo :: CmmExpr -> Convention -> [CmmActual]
+                -> BlockId
+                -> ByteOff
+                -> UpdFrameOffset
+                -> (ByteOff, [(CmmExpr,ByteOff)])
+                -> CmmAGraph
+mkCallReturnsTo f callConv actuals ret_lbl ret_off updfr_off extra_stack = do
+  lastWithArgsAndExtraStack Call (Young ret_lbl) callConv actuals
+     updfr_off extra_stack $
+       toCall f (Just ret_lbl) updfr_off ret_off
+
+mkUnsafeCall  :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph
+mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
+
 
 --------------------------------------------------------------------------
 
--- ================ IMPLEMENTATION ================--
-
---------------------------------------------------
--- Raw CmmAGraph handling
-
-emptyAGraph = return $ Opened emptyGraph
-ag <*> ah = do g <- ag
-               h <- ah
-               return (case (g, h) of
-                 (Opened g, Opened h) -> Opened $ g H.<*> h
-                 (Opened g, Closed h) -> Closed $ g H.<*> h
-                 (Closed g, Opened GNil) -> Closed g
-                 (Closed g, Opened (GUnit e)) -> note_unreachable e $ Closed g
-                 (Closed g, Opened (GMany (JustO e) b x)) -> note_unreachable e $ Opened $ g H.|*><*| GMany NothingO b x
-                 (Closed g, Closed (GMany (JustO e) b x)) -> note_unreachable e $ Closed $ g H.|*><*| GMany NothingO b x
-                 :: CmmGraphOC)
-catAGraphs = foldl (<*>) emptyAGraph
-
-outOfLine ag = withFreshLabel "outOfLine" $ \l ->
-               do g <- ag
-                  return (case g of
-                    Closed (GMany (JustO e) b _) -> note_unreachable e $ Opened $
-                                                      GMany (JustO $ BLast $ CmmBranch l) b (JustO $ BFirst $ CmmEntry l)
-                    _                            -> panic "outOfLine"
-                    :: CmmGraphOC)
-
-note_unreachable :: Block CmmNode O x -> a -> a
-note_unreachable block graph =
-  ASSERT (block_is_empty_or_label)  -- Note [Branch follows branch]
-  graph
-  where block_is_empty_or_label :: Bool
-        block_is_empty_or_label = case blockToNodeList block of
-                                    (NothingC, [], NothingC)            -> True
-                                    (NothingC, [], JustC (CmmBranch _)) -> True
-                                    _                                   -> False
-
-mkLabel bid = return $ Opened $ H.mkLast (CmmBranch bid) |*><*| H.mkFirst (CmmEntry bid)
-mkMiddle middle = return $ Opened $ H.mkMiddle middle
-mkLast last = return $ Closed $ H.mkLast last
-
-withUnique f = getUniqueM >>= f
-withFreshLabel _name f = getUniqueM >>= f . uniqueToLbl . intToUnique . getKey
 
-lgraphOfAGraph g = do u <- getUniqueM
-                      labelAGraph (mkBlockId u) g
-
-labelAGraph lbl ag = do g <- ag
-                        return $ CmmGraph {g_entry=lbl, g_graph=H.mkFirst (CmmEntry lbl) H.<*> closed g}
-  where closed :: CmmGraphOC -> Graph CmmNode O C
-        closed (Closed g) = g
-        closed (Opened g@(GMany entry body (JustO exit))) =
-          ASSERT (entryLabel exit `notElem` map entryLabel (postorder_dfs g))
-          GMany entry body NothingO
-        closed (Opened _) = panic "labelAGraph"
-
---------------------------------------------------
--- CmmAGraph constructions
-
-mkNop                     = emptyAGraph
-mkComment fs              = mkMiddle $ CmmComment fs
-mkStore  l r              = mkMiddle $ CmmStore  l r
-
--- NEED A COMPILER-DEBUGGING FLAG HERE
--- Sanity check: any value assigned to a pointer must be non-zero.
--- If it's 0, cause a crash immediately.
-mkAssign l r = if opt_StubDeadValues then assign l r <*> check l else assign l r
-  where assign l r = mkMiddle (CmmAssign l r)
-        check (CmmGlobal _) = mkNop
-        check l@(CmmLocal reg) = -- if a ptr arg is NULL, cause a crash!
-          if isGcPtrType ty then
-            mkCmmIfThen (CmmMachOp (MO_Eq w) [r, stackStubExpr w])
-                        (assign l (CmmLoad (CmmLit (CmmInt 0 w)) ty))
-          else mkNop
-            where ty = localRegType reg
-                  w  = typeWidth ty
-                  r  = CmmReg l
 
 
 -- Why are we inserting extra blocks that simply branch to the successors?
 -- Because in addition to the branch instruction, @mkBranch@ will insert
 -- a necessary adjustment to the stack pointer.
-mkCbranch pred ifso ifnot = mkLast (CmmCondBranch pred ifso ifnot)
-mkSwitch e tbl            = mkLast $ CmmSwitch e tbl
-
-mkSafeCall   t fs as upd i = withFreshLabel "safe call" $ body
-  where
-    body k =
-     (    mkStore (CmmStackSlot (CallArea (Young k)) (widthInBytes wordWidth))
-                  (CmmLit (CmmBlock k))
-      <*> mkLast (CmmForeignCall {tgt=t, res=fs, args=as, succ=k, updfr=upd, intrbl=i})
-      <*> mkLabel k)
-mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as
 
-mkBranch bid = mkLast (CmmBranch bid)
-
-mkCmmIfThenElse e tbranch fbranch =
-  withFreshLabel "end of if"     $ \endif ->
-  withFreshLabel "start of then" $ \tid ->
-  withFreshLabel "start of else" $ \fid ->
-    mkCbranch e tid fid <*>
-    mkLabel tid <*> tbranch <*> mkBranch endif <*>
-    mkLabel fid <*> fbranch <*> mkLabel endif
-
-mkCmmIfThen e tbranch
-  = withFreshLabel "end of if"     $ \endif ->
-    withFreshLabel "start of then" $ \tid ->
-      mkCbranch e tid endif <*>
-      mkLabel tid <*> tbranch <*> mkLabel endif
-
-mkCmmWhileDo e body =
-  withFreshLabel "loop test" $ \test ->
-  withFreshLabel "loop head" $ \head ->
-  withFreshLabel "end while" $ \endwhile ->
-    -- Forrest Baskett's while-loop layout
-    mkBranch test <*> mkLabel head <*> body
-                  <*> mkLabel test <*> mkCbranch e head endwhile
-                  <*> mkLabel endwhile
 
 -- For debugging purposes, we can stub out dead stack slots:
 stackStubExpr :: Width -> CmmExpr
@@ -286,12 +258,9 @@ stackStubExpr w = CmmLit (CmmInt 0 w)
 -- Therefore, for copying arguments and results, we provide different
 -- functions to pass the arguments in an overflow area and to pass them in spill slots.
 copyInOflow  :: Convention -> Area -> [CmmFormal] -> (Int, CmmAGraph)
-copyInSlot   :: Convention -> [CmmFormal] -> [CmmNode O O]
-copyOutSlot  :: Convention -> [LocalReg] -> [CmmNode O O]
 
 copyInOflow conv area formals = (offset, catAGraphs $ map mkMiddle nodes)
   where (offset, nodes) = copyIn oneCopyOflowI conv area formals
-copyInSlot c f = snd $ copyIn oneCopySlotI c (panic "no area for copying to slots") f
 
 type SlotCopier = Area -> (LocalReg, ByteOff) -> (ByteOff, [CmmNode O O]) ->
                           (ByteOff, [CmmNode O O])
@@ -312,104 +281,86 @@ copyIn oflow conv area formals =
                 adjust rst x@(_, RegisterParam _) = x : rst
 
 -- Copy-in one arg, using overflow space if needed.
-oneCopyOflowI, oneCopySlotI :: SlotCopier
+oneCopyOflowI :: SlotCopier
 oneCopyOflowI area (reg, off) (n, ms) =
   (max n off, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) : ms)
   where ty = localRegType reg
 
--- Copy-in one arg, using spill slots if needed -- used for calling conventions at
--- a procpoint that is not a return point. The offset is irrelevant here...
-oneCopySlotI _ (reg, _) (n, ms) =
-  (n, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot (RegSlot reg) w) ty) : ms)
-  where ty = localRegType reg
-        w  = widthInBytes (typeWidth ty)
-
-
 -- Factoring out the common parts of the copyout functions yielded something
 -- more complicated:
 
-copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual] -> UpdFrameOffset ->
-                              (Int, CmmAGraph)
+data Transfer = Call | Jump | Ret deriving Eq
+
+copyOutOflow :: Convention -> Transfer -> Area -> [CmmActual]
+             -> UpdFrameOffset
+             -> (ByteOff, [(CmmExpr,ByteOff)]) -- extra stack stuff
+             -> (Int, CmmAGraph)
+
 -- Generate code to move the actual parameters into the locations
--- required by the calling convention.  This includes a store for the return address.
+-- required by the calling convention.  This includes a store for the
+-- return address.
 --
--- The argument layout function ignores the pointer to the info table, so we slot that
--- in here. When copying-out to a young area, we set the info table for return
--- and adjust the offsets of the other parameters.
--- If this is a call instruction, we adjust the offsets of the other parameters.
-copyOutOflow conv transfer area@(CallArea a) actuals updfr_off
-  = foldr co (init_offset, emptyAGraph) args'
+-- The argument layout function ignores the pointer to the info table,
+-- so we slot that in here. When copying-out to a young area, we set
+-- the info table for return and adjust the offsets of the other
+-- parameters.  If this is a call instruction, we adjust the offsets
+-- of the other parameters.
+copyOutOflow conv transfer area actuals updfr_off
+  (extra_stack_off, extra_stack_stuff)
+  = foldr co (init_offset, mkNop) (args' ++ stack_params)
   where 
     co (v, RegisterParam r) (n, ms) = (n, mkAssign (CmmGlobal r) v <*> ms)
     co (v, StackParam off)  (n, ms) = (max n off, mkStore (CmmStackSlot area off) v <*> ms)
 
+    stack_params = [ (e, StackParam (off + init_offset))
+                   | (e,off) <- extra_stack_stuff ]
+
     (setRA, init_offset) =
-      case a of Young id -> id `seq` -- Generate a store instruction for
-                                     -- the return address if making a call
+      case area of
+            Young id -> id `seq` -- Generate a store instruction for
+           	    	     	     -- the return address if making a call
                   if transfer == Call then
                     ([(CmmLit (CmmBlock id), StackParam init_offset)],
                      widthInBytes wordWidth)
                   else ([], 0)
-                Old -> ([], updfr_off)
+            Old -> ([], updfr_off)
+
+    arg_offset = init_offset + extra_stack_off
 
     args :: [(CmmExpr, ParamLocation)]   -- The argument and where to put it
     args = assignArgumentsPos conv cmmExprType actuals
 
     args' = foldl adjust setRA args
-      where adjust rst   (v, StackParam off)  = (v, StackParam (off + init_offset)) : rst
+      where adjust rst   (v, StackParam off)  = (v, StackParam (off + arg_offset)) : rst
             adjust rst x@(_, RegisterParam _) = x : rst
 
-copyOutOflow _ _ (RegSlot _) _ _ = panic "cannot copy arguments into a register slot"
 
--- Args passed only in registers and stack slots; no overflow space.
--- No return address may apply!
-copyOutSlot conv actuals = foldr co [] args
-  where co (v, RegisterParam r) ms = CmmAssign (CmmGlobal r) (toExp v) : ms
-        co (v, StackParam off)  ms = CmmStore  (CmmStackSlot (RegSlot v) off) (toExp v) : ms
-        toExp r = CmmReg (CmmLocal r)
-        args = assignArgumentsPos conv localRegType actuals
 
 mkCallEntry :: Convention -> [CmmFormal] -> (Int, CmmAGraph)
-mkCallEntry conv formals = copyInOflow conv (CallArea Old) formals
+mkCallEntry conv formals = copyInOflow conv Old formals
 
-lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual] -> UpdFrameOffset ->
-                (ByteOff -> CmmAGraph) -> CmmAGraph
+lastWithArgs :: Transfer -> Area -> Convention -> [CmmActual]
+             -> UpdFrameOffset
+             -> (ByteOff -> CmmAGraph)
+             -> CmmAGraph
 lastWithArgs transfer area conv actuals updfr_off last =
-  let (outArgs, copies) = copyOutOflow conv transfer area actuals updfr_off in
+  lastWithArgsAndExtraStack transfer area conv actuals
+                            updfr_off noExtraStack last
+
+lastWithArgsAndExtraStack :: Transfer -> Area -> Convention -> [CmmActual]
+             -> UpdFrameOffset -> (ByteOff, [(CmmExpr,ByteOff)])
+             -> (ByteOff -> CmmAGraph)
+             -> CmmAGraph
+lastWithArgsAndExtraStack transfer area conv actuals updfr_off
+                          extra_stack last =
+  let (outArgs, copies) = copyOutOflow conv transfer area actuals
+                             updfr_off extra_stack in
   copies <*> last outArgs
 
--- The area created for the jump and return arguments is the same area as the
--- procedure entry.
-old :: Area
-old = CallArea Old
-toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff -> CmmAGraph
+noExtraStack :: (ByteOff, [(CmmExpr,ByteOff)])
+noExtraStack = (0,[])
+
+toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff -> ByteOff
+       -> CmmAGraph
 toCall e cont updfr_off res_space arg_space =
   mkLast $ CmmCall e cont arg_space res_space updfr_off
-mkJump e actuals updfr_off =
-  lastWithArgs Jump old NativeNodeCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkDirectJump e actuals updfr_off =
-  lastWithArgs Jump old NativeDirectCall actuals updfr_off $ toCall e Nothing updfr_off 0
-mkJumpGC e actuals updfr_off =
-  lastWithArgs Jump old GC actuals updfr_off $ toCall e Nothing updfr_off 0
-mkForeignJump conv e actuals updfr_off =
-  lastWithArgs Jump old conv actuals updfr_off $ toCall e Nothing updfr_off 0
-mkReturn e actuals updfr_off =
-  lastWithArgs Ret  old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
-    -- where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-mkReturnSimple actuals updfr_off =
-  lastWithArgs Ret  old NativeReturn actuals updfr_off $ toCall e Nothing updfr_off 0
-    where e = CmmLoad (CmmStackSlot (CallArea Old) updfr_off) gcWord
-
-mkFinalCall f _ actuals updfr_off =
-  lastWithArgs Call old NativeDirectCall actuals updfr_off $ toCall f Nothing updfr_off 0
-
-mkCmmCall f results actuals = mkCall f (NativeDirectCall, NativeReturn) results actuals
-
--- I'm dropping the SRT, but that should be okay: we plan to reconstruct it later.
-mkCall f (callConv, retConv) results actuals updfr_off =
-  withFreshLabel "call successor" $ \k ->
-    let area = CallArea $ Young k
-        (off, copyin) = copyInOflow retConv area results
-        copyout = lastWithArgs Call area callConv actuals updfr_off 
-                               (toCall f (Just k) updfr_off off)
-    in (copyout <*> mkLabel k <*> copyin)
diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs
index fc4706c8c4..00bbe6d2ee 100644
--- a/compiler/cmm/OldCmm.hs
+++ b/compiler/cmm/OldCmm.hs
@@ -9,9 +9,7 @@
 module OldCmm (
         CmmGroup, GenCmmGroup, RawCmmGroup, CmmDecl, RawCmmDecl,
         ListGraph(..),
-
-        CmmInfo(..), CmmInfoTable(..), ClosureTypeInfo(..), UpdateFrame(..),
-
+        UpdateFrame(..), CmmInfoTable(..), ClosureTypeInfo(..),
         CmmStatic(..), CmmStatics(..), CmmFormal, CmmActual,
 
         cmmMapGraph, cmmTopMapGraph,
@@ -52,13 +50,6 @@ import ForeignCall
 --     Info Tables
 -----------------------------------------------------------------------------
 
-data CmmInfo
-  = CmmInfo
-        (Maybe BlockId)     -- GC target. Nothing <=> CPS won't do stack check
-                            -- JD: NOT USED BY NEW CODE GEN
-        (Maybe UpdateFrame) -- Update frame
-        CmmInfoTable        -- Info table
-
 -- | A frame that is to be pushed before entry to the function.
 -- Used to handle 'update' frames.
 data UpdateFrame
@@ -85,8 +76,8 @@ data UpdateFrame
 newtype ListGraph i = ListGraph [GenBasicBlock i]
 
 -- | Cmm with the info table as a data type
-type CmmGroup = GenCmmGroup CmmStatics CmmInfo (ListGraph CmmStmt)
-type CmmDecl = GenCmmDecl CmmStatics CmmInfo (ListGraph CmmStmt)
+type CmmGroup = GenCmmGroup CmmStatics CmmInfoTable (ListGraph CmmStmt)
+type CmmDecl = GenCmmDecl CmmStatics CmmInfoTable (ListGraph CmmStmt)
 
 -- | Cmm with the info tables converted to a list of 'CmmStatic' along with the info
 -- table label. If we are building without tables-next-to-code there will be no statics
@@ -225,16 +216,9 @@ instance UserOfLocalRegs CmmCallTarget where
     foldRegsUsed f set (CmmCallee e _)    = foldRegsUsed f set e
     foldRegsUsed f set (CmmPrim _ mStmts) = foldRegsUsed f set mStmts
 
-instance UserOfSlots CmmCallTarget where
-    foldSlotsUsed f set (CmmCallee e _) = foldSlotsUsed f set e
-    foldSlotsUsed _ set (CmmPrim {})    = set
-
 instance UserOfLocalRegs a => UserOfLocalRegs (CmmHinted a) where
     foldRegsUsed f set a = foldRegsUsed f set (hintlessCmm a)
 
-instance UserOfSlots a => UserOfSlots (CmmHinted a) where
-    foldSlotsUsed f set a = foldSlotsUsed f set (hintlessCmm a)
-
 instance DefinerOfLocalRegs a => DefinerOfLocalRegs (CmmHinted a) where
     foldRegsDefd f set a = foldRegsDefd f set (hintlessCmm a)
 
diff --git a/compiler/cmm/OldCmmLint.hs b/compiler/cmm/OldCmmLint.hs
new file mode 100644
index 0000000000..72e40ce4f8
--- /dev/null
+++ b/compiler/cmm/OldCmmLint.hs
@@ -0,0 +1,209 @@
+-----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-- CmmLint: checking the correctness of Cmm statements and expressions
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS -fno-warn-tabs #-}
+-- The above warning supression flag is a temporary kludge.
+-- While working on this module you are encouraged to remove it and
+-- detab the module (please do the detabbing in a separate patch). See
+--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
+-- for details
+
+module OldCmmLint (
+  cmmLint, cmmLintTop
+  ) where
+
+import BlockId
+import OldCmm
+import CLabel
+import Outputable
+import OldPprCmm()
+import Constants
+import FastString
+import Platform
+
+import Data.Maybe
+
+-- -----------------------------------------------------------------------------
+-- Exported entry points:
+
+cmmLint :: (Outputable d, Outputable h)
+        => Platform -> GenCmmGroup d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLint platform tops = runCmmLint platform (mapM_ (lintCmmDecl platform)) tops
+
+cmmLintTop :: (Outputable d, Outputable h)
+           => Platform -> GenCmmDecl d h (ListGraph CmmStmt) -> Maybe SDoc
+cmmLintTop platform top = runCmmLint platform (lintCmmDecl platform) top
+
+runCmmLint :: Outputable a
+           => Platform -> (a -> CmmLint b) -> a -> Maybe SDoc
+runCmmLint _ l p =
+   case unCL (l p) of
+   Left err -> Just (vcat [ptext $ sLit ("Cmm lint error:"),
+                           nest 2 err,
+                           ptext $ sLit ("Program was:"),
+                           nest 2 (ppr p)])
+   Right _  -> Nothing
+
+lintCmmDecl :: Platform -> (GenCmmDecl h i (ListGraph CmmStmt)) -> CmmLint ()
+lintCmmDecl platform (CmmProc _ lbl (ListGraph blocks))
+  = addLintInfo (text "in proc " <> pprCLabel platform lbl) $
+        let labels = foldl (\s b -> setInsert (blockId b) s) setEmpty blocks
+	in  mapM_ (lintCmmBlock platform labels) blocks
+
+lintCmmDecl _ (CmmData {})
+  = return ()
+
+lintCmmBlock :: Platform -> BlockSet -> GenBasicBlock CmmStmt -> CmmLint ()
+lintCmmBlock platform labels (BasicBlock id stmts)
+  = addLintInfo (text "in basic block " <> ppr id) $
+	mapM_ (lintCmmStmt platform labels) stmts
+
+-- -----------------------------------------------------------------------------
+-- lintCmmExpr
+
+-- Checks whether a CmmExpr is "type-correct", and check for obvious-looking
+-- byte/word mismatches.
+
+lintCmmExpr :: Platform -> CmmExpr -> CmmLint CmmType
+lintCmmExpr platform (CmmLoad expr rep) = do
+  _ <- lintCmmExpr platform expr
+  -- Disabled, if we have the inlining phase before the lint phase,
+  -- we can have funny offsets due to pointer tagging. -- EZY
+  -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $
+  --   cmmCheckWordAddress expr
+  return rep
+lintCmmExpr platform expr@(CmmMachOp op args) = do
+  tys <- mapM (lintCmmExpr platform) args
+  if map (typeWidth . cmmExprType) args == machOpArgReps op
+  	then cmmCheckMachOp op args tys
+	else cmmLintMachOpErr expr (map cmmExprType args) (machOpArgReps op)
+lintCmmExpr platform (CmmRegOff reg offset)
+  = lintCmmExpr platform (CmmMachOp (MO_Add rep)
+		[CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)])
+  where rep = typeWidth (cmmRegType reg)
+lintCmmExpr _ expr =
+  return (cmmExprType expr)
+
+-- Check for some common byte/word mismatches (eg. Sp + 1)
+cmmCheckMachOp   :: MachOp -> [CmmExpr] -> [CmmType] -> CmmLint CmmType
+cmmCheckMachOp op [lit@(CmmLit (CmmInt { })), reg@(CmmReg _)] tys
+  = cmmCheckMachOp op [reg, lit] tys
+cmmCheckMachOp op _ tys
+  = return (machOpResultType op tys)
+
+isOffsetOp :: MachOp -> Bool
+isOffsetOp (MO_Add _) = True
+isOffsetOp (MO_Sub _) = True
+isOffsetOp _ = False
+
+-- This expression should be an address from which a word can be loaded:
+-- check for funny-looking sub-word offsets.
+_cmmCheckWordAddress :: CmmExpr -> CmmLint ()
+_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)])
+  | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+  = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg])
+  | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral wORD_SIZE /= 0
+  = cmmLintDubiousWordOffset e
+_cmmCheckWordAddress _
+  = return ()
+
+-- No warnings for unaligned arithmetic with the node register,
+-- which is used to extract fields from tagged constructor closures.
+notNodeReg :: CmmExpr -> Bool
+notNodeReg (CmmReg reg) | reg == nodeReg = False
+notNodeReg _                             = True
+
+lintCmmStmt :: Platform -> BlockSet -> CmmStmt -> CmmLint ()
+lintCmmStmt platform labels = lint
+    where lint (CmmNop) = return ()
+          lint (CmmComment {}) = return ()
+          lint stmt@(CmmAssign reg expr) = do
+            erep <- lintCmmExpr platform expr
+	    let reg_ty = cmmRegType reg
+            if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
+                then return ()
+                else cmmLintAssignErr stmt erep reg_ty
+          lint (CmmStore l r) = do
+            _ <- lintCmmExpr platform l
+            _ <- lintCmmExpr platform r
+            return ()
+          lint (CmmCall target _res args _) =
+              do lintTarget platform labels target
+                 mapM_ (lintCmmExpr platform . hintlessCmm) args
+          lint (CmmCondBranch e id) = checkTarget id >> lintCmmExpr platform e >> checkCond e
+          lint (CmmSwitch e branches) = do
+            mapM_ checkTarget $ catMaybes branches
+            erep <- lintCmmExpr platform e
+            if (erep `cmmEqType_ignoring_ptrhood` bWord)
+              then return ()
+              else cmmLintErr (text "switch scrutinee is not a word: " <> ppr e <>
+                               text " :: " <> ppr erep)
+          lint (CmmJump e _) = lintCmmExpr platform e >> return ()
+          lint (CmmReturn) = return ()
+          lint (CmmBranch id)    = checkTarget id
+          checkTarget id = if setMember id labels then return ()
+                           else cmmLintErr (text "Branch to nonexistent id" <+> ppr id)
+
+lintTarget :: Platform -> BlockSet -> CmmCallTarget -> CmmLint ()
+lintTarget platform _      (CmmCallee e _) = do _ <- lintCmmExpr platform e
+                                                return ()
+lintTarget _        _      (CmmPrim _ Nothing) = return ()
+lintTarget platform labels (CmmPrim _ (Just stmts))
+    = mapM_ (lintCmmStmt platform labels) stmts
+
+
+checkCond :: CmmExpr -> CmmLint ()
+checkCond (CmmMachOp mop _) | isComparisonMachOp mop = return ()
+checkCond (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth = return () -- constant values
+checkCond expr
+    = cmmLintErr (hang (text "expression is not a conditional:") 2
+                       (ppr expr))
+
+-- -----------------------------------------------------------------------------
+-- CmmLint monad
+
+-- just a basic error monad:
+
+newtype CmmLint a = CmmLint { unCL :: Either SDoc a }
+
+instance Monad CmmLint where
+  CmmLint m >>= k = CmmLint $ case m of 
+				Left e -> Left e
+				Right a -> unCL (k a)
+  return a = CmmLint (Right a)
+
+cmmLintErr :: SDoc -> CmmLint a
+cmmLintErr msg = CmmLint (Left msg)
+
+addLintInfo :: SDoc -> CmmLint a -> CmmLint a
+addLintInfo info thing = CmmLint $ 
+   case unCL thing of
+	Left err -> Left (hang info 2 err)
+	Right a  -> Right a
+
+cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a
+cmmLintMachOpErr expr argsRep opExpectsRep
+     = cmmLintErr (text "in MachOp application: " $$ 
+					nest 2 (ppr expr) $$
+				        (text "op is expecting: " <+> ppr opExpectsRep) $$
+					(text "arguments provide: " <+> ppr argsRep))
+
+cmmLintAssignErr :: CmmStmt -> CmmType -> CmmType -> CmmLint a
+cmmLintAssignErr stmt e_ty r_ty
+  = cmmLintErr (text "in assignment: " $$ 
+		nest 2 (vcat [ppr stmt, 
+			      text "Reg ty:" <+> ppr r_ty,
+			      text "Rhs ty:" <+> ppr e_ty]))
+			 
+					
+
+cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a
+cmmLintDubiousWordOffset expr
+   = cmmLintErr (text "offset is not a multiple of words: " $$
+			nest 2 (ppr expr))
diff --git a/compiler/cmm/OldPprCmm.hs b/compiler/cmm/OldPprCmm.hs
index d6a12221fb..a30be9c6c7 100644
--- a/compiler/cmm/OldPprCmm.hs
+++ b/compiler/cmm/OldPprCmm.hs
@@ -63,10 +63,6 @@ instance Outputable instr => Outputable (GenBasicBlock instr) where
 instance Outputable CmmStmt where
     ppr s = pprStmt s
 
-instance Outputable CmmInfo where
-    ppr i = pprInfo i
-
-
 -- --------------------------------------------------------------------------
 instance Outputable CmmSafety where
   ppr CmmUnsafe = ptext (sLit "_unsafe_call_")
@@ -74,21 +70,6 @@ instance Outputable CmmSafety where
   ppr (CmmSafe srt) = ppr srt
 
 -- --------------------------------------------------------------------------
--- Info tables. The current pretty printer needs refinement
--- but will work for now.
---
--- For ideas on how to refine it, they used to be printed in the
--- style of C--'s 'stackdata' declaration, just inside the proc body,
--- and were labelled with the procedure name ++ "_info".
-pprInfo :: CmmInfo -> SDoc
-pprInfo (CmmInfo _gc_target update_frame info_table) =
-    vcat [{-ptext (sLit "gc_target: ") <>
-                maybe (ptext (sLit "<none>")) ppr gc_target,-}
-          ptext (sLit "update_frame: ") <>
-                maybe (ptext (sLit "<none>")) pprUpdateFrame update_frame,
-          ppr info_table]
-
--- --------------------------------------------------------------------------
 -- Basic blocks look like assembly blocks.
 --      lbl: stmt ; stmt ; ..
 pprBBlock :: Outputable stmt => GenBasicBlock stmt -> SDoc
diff --git a/compiler/cmm/OptimizationFuel.hs b/compiler/cmm/OptimizationFuel.hs
index a85b11bcc6..6e968c0b1d 100644
--- a/compiler/cmm/OptimizationFuel.hs
+++ b/compiler/cmm/OptimizationFuel.hs
@@ -8,10 +8,8 @@
 module OptimizationFuel
     ( OptimizationFuel, amountOfFuel, tankFilledTo, unlimitedFuel, anyFuelLeft, oneLessFuel
     , OptFuelState, initOptFuelState
-    , FuelConsumer, FuelUsingMonad, FuelState
-    , fuelGet, fuelSet, lastFuelPass, setFuelPass
-    , fuelExhausted, fuelDec1, tryWithFuel
-    , runFuelIO, runInfiniteFuelIO, fuelConsumingPass
+    , FuelConsumer, FuelState
+    , runFuelIO, runInfiniteFuelIO
     , FuelUniqSM
     , liftUniq
     )
@@ -62,25 +60,20 @@ anyFuelLeft (OptimizationFuel f) = f > 0
 oneLessFuel (OptimizationFuel f) = ASSERT (f > 0) (OptimizationFuel (f - 1))
 unlimitedFuel = OptimizationFuel infiniteFuel
 
-data FuelState = FuelState { fs_fuel :: OptimizationFuel, fs_lastpass :: String }
-newtype FuelUniqSM a = FUSM { unFUSM :: FuelState -> UniqSM (a, FuelState) }
-
-fuelConsumingPass :: String -> FuelConsumer a -> FuelUniqSM a
-fuelConsumingPass name f = do setFuelPass name
-                              fuel <- fuelGet
-                              let (a, fuel') = f fuel
-                              fuelSet fuel'
-                              return a
+data FuelState = FuelState { fs_fuel :: {-# UNPACK #-} !OptimizationFuel,
+                             fs_lastpass :: String }
+newtype FuelUniqSM a = FUSM { unFUSM :: UniqSupply -> FuelState -> (# a, UniqSupply, FuelState #) }
 
 runFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
 runFuelIO fs (FUSM f) =
     do pass <- readIORef (pass_ref fs)
        fuel <- readIORef (fuel_ref fs)
        u    <- mkSplitUniqSupply 'u'
-       let (a, FuelState fuel' pass') = initUs_ u $ f (FuelState fuel pass)
-       writeIORef (pass_ref fs) pass'
-       writeIORef (fuel_ref fs) fuel'
-       return a
+       case f u (FuelState fuel pass) of
+          (# a, _, FuelState fuel' pass' #) -> do
+            writeIORef (pass_ref fs) pass'
+            writeIORef (fuel_ref fs) fuel'
+            return a
 
 -- ToDo: Do we need the pass_ref when we are doing infinite fueld
 -- transformations?
@@ -88,21 +81,32 @@ runInfiniteFuelIO :: OptFuelState -> FuelUniqSM a -> IO a
 runInfiniteFuelIO fs (FUSM f) =
     do pass <- readIORef (pass_ref fs)
        u <- mkSplitUniqSupply 'u'
-       let (a, FuelState _ pass') = initUs_ u $ f (FuelState unlimitedFuel pass)
-       writeIORef (pass_ref fs) pass'
-       return a
+       case f u (FuelState unlimitedFuel pass) of
+          (# a, _, FuelState _fuel pass' #) -> do
+            writeIORef (pass_ref fs) pass'
+            return a
 
 instance Monad FuelUniqSM where
-  FUSM f >>= k = FUSM (\s -> f s >>= \(a, s') -> unFUSM (k a) s')
-  return a     = FUSM (\s -> return (a, s))
+  FUSM f >>= k = FUSM (\u s -> case f u s of (# a, u', s' #) ->
+                                                unFUSM (k a) u' s')
+  return a     = FUSM (\u s -> (# a, u, s #))
 
 instance MonadUnique FuelUniqSM where
-    getUniqueSupplyM = liftUniq getUniqueSupplyM
-    getUniqueM       = liftUniq getUniqueM
-    getUniquesM      = liftUniq getUniquesM
+    getUniqueSupplyM =
+       FUSM $ \us f -> case splitUniqSupply us of
+                         (us1,us2) -> (# us1, us2, f #)
+
+    getUniqueM =
+       FUSM $ \us f -> case splitUniqSupply us of
+                         (us1,us2) -> (# uniqFromSupply us1, us2, f #)
+
+    getUniquesM =
+       FUSM $ \us f -> case splitUniqSupply us of
+                         (us1,us2) -> (# uniqsFromSupply us1, us2, f #)
+
 
 liftUniq :: UniqSM x -> FuelUniqSM x
-liftUniq x = FUSM (\s -> x >>= (\u -> return (u, s)))
+liftUniq x = FUSM (\u s -> case initUs u x of (a,u') -> (# a, u', s #))
 
 class Monad m => FuelUsingMonad m where
   fuelGet      :: m OptimizationFuel
@@ -110,25 +114,14 @@ class Monad m => FuelUsingMonad m where
   lastFuelPass :: m String
   setFuelPass  :: String -> m ()
 
-fuelExhausted :: FuelUsingMonad m => m Bool
-fuelExhausted = fuelGet >>= return . anyFuelLeft
-
-fuelDec1 :: FuelUsingMonad m => m ()
-fuelDec1 = fuelGet >>= fuelSet . oneLessFuel
-
-tryWithFuel :: FuelUsingMonad m => a -> m (Maybe a)
-tryWithFuel r = do f <- fuelGet
-                   if anyFuelLeft f then fuelSet (oneLessFuel f) >> return (Just r)
-                                    else return Nothing
-
 instance FuelUsingMonad FuelUniqSM where
   fuelGet          = extract fs_fuel
   lastFuelPass     = extract fs_lastpass
-  fuelSet fuel     = FUSM (\s -> return ((), s { fs_fuel     = fuel }))
-  setFuelPass pass = FUSM (\s -> return ((), s { fs_lastpass = pass }))
+  fuelSet fuel     = FUSM (\u s -> (# (), u, s { fs_fuel     = fuel } #))
+  setFuelPass pass = FUSM (\u s -> (# (), u, s { fs_lastpass = pass } #))
 
 extract :: (FuelState -> a) -> FuelUniqSM a
-extract f = FUSM (\s -> return (f s, s))
+extract f = FUSM (\u s -> (# f s, u, s #))
 
 instance FuelMonad FuelUniqSM where
   getFuel = liftM amountOfFuel fuelGet
@@ -137,6 +130,6 @@ instance FuelMonad FuelUniqSM where
 -- Don't bother to checkpoint the unique supply; it doesn't matter
 instance CheckpointMonad FuelUniqSM where
     type Checkpoint FuelUniqSM = FuelState
-    checkpoint = FUSM $ \fuel -> return (fuel, fuel) 
-    restart fuel = FUSM $ \_ -> return ((), fuel)
+    checkpoint = FUSM $ \u fuel -> (# fuel, u, fuel #)
+    restart fuel = FUSM $ \u _ -> (# (), u, fuel #)
 
diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs
index 183708c08e..dee6ee881e 100644
--- a/compiler/cmm/PprCmm.hs
+++ b/compiler/cmm/PprCmm.hs
@@ -146,8 +146,6 @@ pprConvention  Slow                 = text "<slow-convention>"
 pprConvention  GC                   = text "<gc-convention>"
 pprConvention  PrimOpCall           = text "<primop-call-convention>"
 pprConvention  PrimOpReturn         = text "<primop-ret-convention>"
-pprConvention (Foreign c)           = ppr c
-pprConvention (Private {})          = text "<private-convention>"
 
 pprForeignConvention :: ForeignConvention -> SDoc
 pprForeignConvention (ForeignConvention c as rs) = ppr c <> ppr as <> ppr rs
diff --git a/compiler/cmm/PprCmmExpr.hs b/compiler/cmm/PprCmmExpr.hs
index 7503127555..119f2b7239 100644
--- a/compiler/cmm/PprCmmExpr.hs
+++ b/compiler/cmm/PprCmmExpr.hs
@@ -237,12 +237,8 @@ pprLocalReg (LocalReg uniq rep)
 
 -- Stack areas
 pprArea :: Area -> SDoc
-pprArea (RegSlot r)   = hcat [ text "slot<", ppr r, text ">" ]
-pprArea (CallArea id) = pprAreaId id
-
-pprAreaId :: AreaId -> SDoc
-pprAreaId Old        = text "old"
-pprAreaId (Young id) = hcat [ text "young<", ppr id, text ">" ]
+pprArea Old        = text "old"
+pprArea (Young id) = hcat [ text "young<", ppr id, text ">" ]
 
 -- needs to be kept in syn with CmmExpr.hs.GlobalReg
 --
diff --git a/compiler/cmm/SMRep.lhs b/compiler/cmm/SMRep.lhs
index ce30799bf6..8b3308ef97 100644
--- a/compiler/cmm/SMRep.lhs
+++ b/compiler/cmm/SMRep.lhs
@@ -21,6 +21,7 @@ module SMRep (
 	StgWord, StgHalfWord, 
 	hALF_WORD_SIZE, hALF_WORD_SIZE_IN_BITS,
 	WordOff, ByteOff,
+        roundUpToWords,
 
         -- * Closure repesentation
         SMRep(..),	-- CmmInfo sees the rep; no one else does
@@ -57,6 +58,7 @@ import FastString
 
 import Data.Char( ord )
 import Data.Word
+import Data.Bits
 \end{code}
 
 
@@ -69,6 +71,9 @@ import Data.Word
 \begin{code}
 type WordOff = Int	-- Word offset, or word count
 type ByteOff = Int	-- Byte offset, or byte count
+
+roundUpToWords :: ByteOff -> ByteOff
+roundUpToWords n = (n + (wORD_SIZE - 1)) .&. (complement (wORD_SIZE - 1))
 \end{code}
 
 StgWord is a type representing an StgWord on the target platform.
@@ -93,6 +98,7 @@ hALF_WORD_SIZE_IN_BITS = 32
 #endif
 \end{code}
 
+
 %************************************************************************
 %*									*
 \subsubsection[SMRep-datatype]{@SMRep@---storage manager representation}
diff --git a/compiler/cmm/cmm-notes b/compiler/cmm/cmm-notes
index 81882c8c0e..0e6a2341f2 100644
--- a/compiler/cmm/cmm-notes
+++ b/compiler/cmm/cmm-notes
@@ -24,27 +24,10 @@ More notes (June 11)
 * Check in ClosureInfo:
      -- NB: Results here should line up with the results of SMRep.rtsClosureType
 
-* Possible refactoring: Nuke AGraph in favour of 
-      mkIfThenElse :: Expr -> Graph -> Graph -> FCode Graph
-  or even
-      mkIfThenElse :: HasUniques m => Expr -> Graph -> Graph -> m Graph
-  (Remmber that the .cmm file parser must use this function)
-
-  or parameterise FCode over its envt; the CgState part seem useful for both
-
 * "Remove redundant reloads" in CmmSpillReload should be redundant; since
   insertLateReloads is now gone, every reload is reloading a live variable.
   Test and nuke.
 
-* Stack layout is very like register assignment: find non-conflicting assigments.
-  In particular we can use colouring or linear scan (etc).
-
-  We'd fine-grain interference (on a word by word basis) to get maximum overlap.
-  But that may make very big interference graphs.  So linear scan might be
-  more attactive.
-
-  NB: linear scan does on-the-fly live range splitting.
-
 * When stubbing dead slots be careful not to write into an area that
   overlaps with an area that's in use.  So stubbing needs to *follow* 
   stack layout.
@@ -81,10 +64,6 @@ Things to do:
    Old.Cmm. We should abstract it to work on both representations, it needs only to
    convert a CmmInfoTable to [CmmStatic].
 
- - The MkGraph currenty uses a different semantics for <*> than Hoopl. Maybe
-   we could convert codeGen/StgCmm* clients to the Hoopl's semantics?
-   It's all deeply unsatisfactory.
-
  - Improve performance of Hoopl.
 
    A nofib comparison of -fasm vs -fnewcodegen nofib compilation parameters
@@ -101,18 +80,12 @@ Things to do:
    When compiling nofib, ghc-head + libraries compiled with -fnew-codegen
    is 31.4% slower (http://fox.auryn.cz/msrc/0517_hoopl/32bit.oldghcoldgen.zipghcoldgen.txt).
 
-   So we generate a bit better code, but it takes us longer!
-
    EZY: Also importantly, Hoopl uses dramatically more memory than the
    old code generator.
 
  - Are all blockToNodeList and blockOfNodeList really needed? Maybe we could
    splice blocks instead?
 
-   In the CmmContFlowOpt.blockConcat, using Dataflow seems too clumsy. Still,
-   a block catenation function would be probably nicer than blockToNodeList
-   / blockOfNodeList combo.
-
  - lowerSafeForeignCall seems too lowlevel. Just use Dataflow. After that
    delete splitEntrySeq from HooplUtils.
 
@@ -129,10 +102,6 @@ Things to do:
  - NB that CmmProcPoint line 283 has a hack that works around a GADT-related
    bug in 6.10.
 
- - SDM (2010-02-26) can we remove the Foreign constructor from Convention?
-   Reason: we never generate code for a function with the Foreign
-   calling convention, and the code for calling foreign calls is generated
-
  - AsmCodeGen has a generic Cmm optimiser; move this into new pipeline
    EZY (2011-04-16): The mini-inliner has been generalized and ported,
    but the constant folding and other optimizations need to still be
@@ -161,16 +130,6 @@ Things to do:
 
  - Top-level SRT threading is a bit ugly
 
- - Add type/newtype for CmmModule = [CmmGroup]    -- A module
-                        CmmGroup  = [CmmTop]      -- A .o file
-                        CmmTop    = Proc | Data   -- A procedure or data
-
- - This is a *change*: currently a CmmGroup is one function's-worth of code
-   regardless of SplitObjs.   Question: can we *always* generate M.o if there
-   is just one element in the list (rather than M/M1.o, M/M2.o etc)
-
-   One SRT per group.
-
  - See "CAFs" below; we want to totally refactor the way SRTs are calculated
 
  - Pull out Areas into its own module