85 files changed, 3306 insertions, 3613 deletions
diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs
index a5d559e9ff..04312321cc 100644
--- a/compiler/cmm/CLabel.hs
+++ b/compiler/cmm/CLabel.hs
@@ -72,7 +72,7 @@ module CLabel (
         mkCmmRetLabel,
         mkCmmCodeLabel,
         mkCmmDataLabel,
-        mkCmmGcPtrLabel,
+        mkCmmClosureLabel,
 
         mkRtsApFastLabel,
 
@@ -331,7 +331,7 @@ data CmmLabelInfo
   | CmmRet                      -- ^ misc rts return points,    suffix _ret
   | CmmData                     -- ^ misc rts data bits, eg CHARLIKE_closure
   | CmmCode                     -- ^ misc rts code
-  | CmmGcPtr                    -- ^ GcPtrs eg CHARLIKE_closure
+  | CmmClosure                  -- ^ closures eg CHARLIKE_closure
   | CmmPrimCall                 -- ^ a prim call to some hand written Cmm code
   deriving (Eq, Ord)
 
@@ -418,7 +418,7 @@ mkCAFBlackHoleEntryLabel        = CmmLabel rtsPackageId (fsLit "stg_CAF_BLACKHOL
 
 -----
 mkCmmInfoLabel,   mkCmmEntryLabel, mkCmmRetInfoLabel, mkCmmRetLabel,
-  mkCmmCodeLabel, mkCmmDataLabel,  mkCmmGcPtrLabel
+  mkCmmCodeLabel, mkCmmDataLabel,  mkCmmClosureLabel
         :: PackageId -> FastString -> CLabel
 
 mkCmmInfoLabel      pkg str     = CmmLabel pkg str CmmInfo
@@ -427,7 +427,7 @@ mkCmmRetInfoLabel   pkg str     = CmmLabel pkg str CmmRetInfo
 mkCmmRetLabel       pkg str     = CmmLabel pkg str CmmRet
 mkCmmCodeLabel      pkg str     = CmmLabel pkg str CmmCode
 mkCmmDataLabel      pkg str     = CmmLabel pkg str CmmData
-mkCmmGcPtrLabel     pkg str     = CmmLabel pkg str CmmGcPtr
+mkCmmClosureLabel   pkg str     = CmmLabel pkg str CmmClosure
 
 
 -- Constructing RtsLabels
@@ -543,6 +543,7 @@ mkPlainModuleInitLabel mod      = PlainModuleInitLabel mod
 
 toClosureLbl :: CLabel -> CLabel
 toClosureLbl (IdLabel n c _) = IdLabel n c Closure
+toClosureLbl (CmmLabel m str _) = CmmLabel m str CmmClosure
 toClosureLbl l = pprPanic "toClosureLbl" (ppr l)
 
 toSlowEntryLbl :: CLabel -> CLabel
@@ -774,7 +775,7 @@ isGcPtrLabel lbl = case labelType lbl of
 --    whether it be code, data, or static GC object.
 labelType :: CLabel -> CLabelType
 labelType (CmmLabel _ _ CmmData)                = DataLabel
-labelType (CmmLabel _ _ CmmGcPtr)               = GcPtrLabel
+labelType (CmmLabel _ _ CmmClosure)             = GcPtrLabel
 labelType (CmmLabel _ _ CmmCode)                = CodeLabel
 labelType (CmmLabel _ _ CmmInfo)                = DataLabel
 labelType (CmmLabel _ _ CmmEntry)               = CodeLabel
@@ -1001,7 +1002,6 @@ pprCLbl (LargeBitmapLabel u)  = text "b" <> pprUnique u <> pp_cSEP <> ptext (sLi
 
 pprCLbl (CmmLabel _ str CmmCode)        = ftext str
 pprCLbl (CmmLabel _ str CmmData)        = ftext str
-pprCLbl (CmmLabel _ str CmmGcPtr)       = ftext str
 pprCLbl (CmmLabel _ str CmmPrimCall)    = ftext str
 
 pprCLbl (RtsLabel (RtsApFast str))   = ftext str <> ptext (sLit "_fast")
@@ -1046,6 +1046,9 @@ pprCLbl (CmmLabel _ fs CmmRetInfo)
 pprCLbl (CmmLabel _ fs CmmRet)
   = ftext fs <> ptext (sLit "_ret")
 
+pprCLbl (CmmLabel _ fs CmmClosure)
+  = ftext fs <> ptext (sLit "_closure")
+
 pprCLbl (RtsLabel (RtsPrimOp primop))
   = ptext (sLit "stg_") <> ppr primop
 
diff --git a/compiler/cmm/Cmm.hs b/compiler/cmm/Cmm.hs
index 2dedee0d52..8409f0dbeb 100644
--- a/compiler/cmm/Cmm.hs
+++ b/compiler/cmm/Cmm.hs
@@ -109,9 +109,14 @@ data CmmStackInfo
                -- number of bytes of arguments on the stack on entry to the
                -- the proc.  This is filled in by StgCmm.codeGen, and used
                -- by the stack allocator later.
-       updfr_space :: Maybe ByteOff
+       updfr_space :: Maybe ByteOff,
                -- XXX: this never contains anything useful, but it should.
                -- See comment in CmmLayoutStack.
+       do_layout :: Bool
+               -- Do automatic stack layout for this proc.  This is
+               -- True for all code generated by the code generator,
+               -- but is occasionally False for hand-written Cmm where
+               -- we want to do the stack manipulation manually.
   }
 
 -- | Info table as a haskell data type
diff --git a/compiler/cmm/CmmBuildInfoTables.hs b/compiler/cmm/CmmBuildInfoTables.hs
index ecaab57d76..304f4c2170 100644
--- a/compiler/cmm/CmmBuildInfoTables.hs
+++ b/compiler/cmm/CmmBuildInfoTables.hs
@@ -235,8 +235,8 @@ to_SRT dflags top_srt off len bmp
            tbl = CmmData RelocatableReadOnlyData $
                    Statics srt_desc_lbl $ map CmmStaticLit
                      ( cmmLabelOffW dflags top_srt off
-                     : mkWordCLit dflags (toStgWord dflags (fromIntegral len))
-                     : map (mkWordCLit dflags) bmp)
+                     : mkWordCLit dflags (fromIntegral len)
+                     : map (mkStgWordCLit dflags) bmp)
        return (Just tbl, C_SRT srt_desc_lbl 0 (srtEscape dflags))
   | otherwise
   = return (Nothing, C_SRT top_srt off (toStgHalfWord dflags (fromStgWord (head bmp))))
@@ -252,7 +252,8 @@ localCAFInfo :: CAFEnv -> CmmDecl -> (CAFSet, Maybe CLabel)
 localCAFInfo _      (CmmData _ _) = (Set.empty, Nothing)
 localCAFInfo cafEnv proc@(CmmProc _ top_l (CmmGraph {g_entry=entry})) =
   case topInfoTable proc of
-    Just (CmmInfoTable { cit_rep = rep }) | not (isStaticRep rep)
+    Just (CmmInfoTable { cit_rep = rep })
+      | not (isStaticRep rep) && not (isStackRep rep)
       -> (cafs, Just (toClosureLbl top_l))
     _other -> (cafs, Nothing)
   where
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index 235fe7f911..180b2d7eab 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -8,7 +8,8 @@
 module CmmCallConv (
   ParamLocation(..),
   assignArgumentsPos,
-  globalArgRegs
+  assignStack,
+  globalArgRegs, realArgRegs
 ) where
 
 #include "HsVersions.h"
@@ -18,7 +19,6 @@ import SMRep
 import Cmm (Convention(..))
 import PprCmm ()
 
-import qualified Data.List as L
 import DynFlags
 import Outputable
 
@@ -33,15 +33,22 @@ instance Outputable ParamLocation where
   ppr (RegisterParam g) = ppr g
   ppr (StackParam p)    = ppr p
 
--- | JD: For the new stack story, I want arguments passed on the stack to manifest as
--- positive offsets in a CallArea, not negative offsets from the stack pointer.
--- Also, I want byte offsets, not word offsets.
-assignArgumentsPos :: DynFlags -> Convention -> (a -> CmmType) -> [a] ->
-                      [(a, ParamLocation)]
+-- |
 -- Given a list of arguments, and a function that tells their types,
 -- return a list showing where each argument is passed
-assignArgumentsPos dflags conv arg_ty reps = assignments
-    where -- The calling conventions (CgCallConv.hs) are complicated, to say the least
+--
+assignArgumentsPos :: DynFlags
+                   -> ByteOff           -- stack offset to start with
+                   -> Convention
+                   -> (a -> CmmType)    -- how to get a type from an arg
+                   -> [a]               -- args
+                   -> (
+                        ByteOff              -- bytes of stack args
+                      , [(a, ParamLocation)] -- args and locations
+                      )
+
+assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
+    where
       regs = case (reps, conv) of
                (_,   NativeNodeCall)   -> getRegsWithNode dflags
                (_,   NativeDirectCall) -> getRegsWithoutNode dflags
@@ -49,23 +56,14 @@ assignArgumentsPos dflags conv arg_ty reps = assignments
                (_,   NativeReturn)     -> getRegsWithNode dflags
                -- GC calling convention *must* put values in registers
                (_,   GC)               -> allRegs dflags
-               (_,   PrimOpCall)       -> allRegs dflags
-               ([_], PrimOpReturn)     -> allRegs dflags
-               (_,   PrimOpReturn)     -> getRegsWithNode dflags
                (_,   Slow)             -> noRegs
       -- The calling conventions first assign arguments to registers,
       -- then switch to the stack when we first run out of registers
-      -- (even if there are still available registers for args of a different type).
-      -- When returning an unboxed tuple, we also separate the stack
-      -- arguments by pointerhood.
-      (reg_assts, stk_args) = assign_regs [] reps regs
-      stk_args' = case conv of NativeReturn -> part
-                               PrimOpReturn -> part
-                               GC | length stk_args /= 0 -> panic "Failed to allocate registers for GC call"
-                               _            -> stk_args
-                  where part = uncurry (++)
-                                       (L.partition (not . isGcPtrType . arg_ty) stk_args)
-      stk_assts = assign_stk 0 [] (reverse stk_args')
+      -- (even if there are still available registers for args of a
+      -- different type).  When returning an unboxed tuple, we also
+      -- separate the stack arguments by pointerhood.
+      (reg_assts, stk_args)  = assign_regs [] reps regs
+      (stk_off,   stk_assts) = assignStack dflags off arg_ty stk_args
       assignments = reg_assts ++ stk_assts
 
       assign_regs assts []     _    = (assts, [])
@@ -88,11 +86,21 @@ assignArgumentsPos dflags conv arg_ty reps = assignments
               gcp | isGcPtrType ty = VGcPtr
                   | otherwise  	   = VNonGcPtr
 
-      assign_stk _      assts [] = assts
-      assign_stk offset assts (r:rs) = assign_stk off' ((r, StackParam off') : assts) rs
+
+assignStack :: DynFlags -> ByteOff -> (a -> CmmType) -> [a]
+            -> (
+                 ByteOff              -- bytes of stack args
+               , [(a, ParamLocation)] -- args and locations
+               )
+assignStack dflags offset arg_ty args = assign_stk offset [] (reverse args)
+ where
+      assign_stk offset assts [] = (offset, assts)
+      assign_stk offset assts (r:rs)
+        = assign_stk off' ((r, StackParam off') : assts) rs
         where w    = typeWidth (arg_ty r)
-              size = (((widthInBytes w - 1) `div` wORD_SIZE dflags) + 1) * wORD_SIZE dflags
+              size = (((widthInBytes w - 1) `div` word_size) + 1) * word_size
               off' = offset + size
+              word_size = wORD_SIZE dflags
 
 -----------------------------------------------------------------------------
 -- Local information about the registers available
@@ -158,3 +166,9 @@ globalArgRegs dflags = map ($ VGcPtr) (allVanillaRegs dflags) ++
                        allFloatRegs dflags ++
                        allDoubleRegs dflags ++
                        allLongRegs dflags
+
+realArgRegs :: DynFlags -> [GlobalReg]
+realArgRegs dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
+                realFloatRegs dflags ++
+                realDoubleRegs dflags ++
+                realLongRegs dflags
diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index aa2925fe53..4028efddf6 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -97,15 +97,17 @@ cmmCfgOptsProc _ top = top
 
 blockConcat :: Bool -> CmmGraph -> (CmmGraph, BlockEnv BlockId)
 blockConcat splitting_procs g@CmmGraph { g_entry = entry_id }
-  = (replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks, shortcut_map)
+  = (replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks, shortcut_map')
   where
-     -- we might be able to shortcut the entry BlockId itself
-     new_entry
+     -- we might be able to shortcut the entry BlockId itself.
+     -- remember to update the shortcut_map', since we also have to
+     -- update the info_tbls mapping now.
+     (new_entry, shortcut_map')
        | Just entry_blk <- mapLookup entry_id new_blocks
        , Just dest      <- canShortcut entry_blk
-       = dest
+       = (dest, mapInsert entry_id dest shortcut_map)
        | otherwise
-       = entry_id
+       = (entry_id, shortcut_map)
 
      blocks = postorderDfs g
      blockmap = foldr addBlock emptyBody blocks
diff --git a/compiler/cmm/CmmCvt.hs b/compiler/cmm/CmmCvt.hs
index cd838821b3..017d120d84 100644
--- a/compiler/cmm/CmmCvt.hs
+++ b/compiler/cmm/CmmCvt.hs
@@ -22,19 +22,23 @@ cmmOfZgraph tops = map mapTop tops
   where mapTop (CmmProc h l g) = CmmProc (info_tbls h) l (ofZgraph g)
         mapTop (CmmData s ds) = CmmData s ds
 
-data ValueDirection = Arguments | Results
+add_hints :: [a] -> [ForeignHint] -> [Old.CmmHinted a]
+add_hints args hints = zipWith Old.CmmHinted args hints
 
-add_hints :: ForeignTarget -> ValueDirection -> [a] -> [Old.CmmHinted a]
-add_hints conv vd args = zipWith Old.CmmHinted args (get_hints conv vd)
-
-get_hints :: ForeignTarget -> ValueDirection -> [ForeignHint]
-get_hints (ForeignTarget _ (ForeignConvention _ hints _)) Arguments = hints
-get_hints (ForeignTarget _ (ForeignConvention _ _ hints)) Results   = hints
-get_hints (PrimTarget _) _vd = repeat NoHint
+get_hints :: ForeignTarget -> ([ForeignHint], [ForeignHint])
+get_hints (PrimTarget op) = (res_hints ++ repeat NoHint,
+                             arg_hints ++ repeat NoHint)
+  where (res_hints, arg_hints) = callishMachOpHints op
+get_hints (ForeignTarget _ (ForeignConvention _ arg_hints res_hints _))
+  = (res_hints, arg_hints)
 
 cmm_target :: ForeignTarget -> Old.CmmCallTarget
 cmm_target (PrimTarget op) = Old.CmmPrim op Nothing
-cmm_target (ForeignTarget e (ForeignConvention cc _ _)) = Old.CmmCallee e cc
+cmm_target (ForeignTarget e (ForeignConvention cc _ _ _)) = Old.CmmCallee e cc
+
+get_ret :: ForeignTarget -> CmmReturnInfo
+get_ret (PrimTarget _) = CmmMayReturn
+get_ret (ForeignTarget _ (ForeignConvention _ _ _ ret)) = ret
 
 ofZgraph :: CmmGraph -> Old.ListGraph Old.CmmStmt
 ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
@@ -83,11 +87,14 @@ ofZgraph g = Old.ListGraph $ mapMaybe convert_block $ postorderDfs g
                             CmmAssign l r                                  -> Old.CmmAssign l r
                             CmmStore  l r                                  -> Old.CmmStore  l r
                             CmmUnsafeForeignCall (PrimTarget MO_Touch) _ _ -> Old.CmmNop
-                            CmmUnsafeForeignCall target ress args          -> 
+                            CmmUnsafeForeignCall target ress args          ->
                               Old.CmmCall (cmm_target target)
-                                          (add_hints target Results   ress)
-                                          (add_hints target Arguments args)
-                                          Old.CmmMayReturn
+                                          (add_hints ress res_hints)
+                                          (add_hints args arg_hints)
+                                          (get_ret target)
+                                  where
+                                     (res_hints, arg_hints) = get_hints target
+
 
                   last :: CmmNode O C -> () -> [Old.CmmStmt]
                   last node _ = stmts
diff --git a/compiler/cmm/CmmInfo.hs b/compiler/cmm/CmmInfo.hs
index 6aa4d6cbfa..dec6b5d09d 100644
--- a/compiler/cmm/CmmInfo.hs
+++ b/compiler/cmm/CmmInfo.hs
@@ -155,7 +155,7 @@ type InfoTableContents = ( [CmmLit]	     -- The standard part
 
 mkInfoTableContents :: DynFlags
                     -> CmmInfoTable
-                    -> Maybe StgHalfWord    -- Override default RTS type tag?
+                    -> Maybe Int               -- Override default RTS type tag?
                     -> UniqSM ([RawCmmDecl],             -- Auxiliary top decls
                                InfoTableContents)	-- Info tbl + extra bits
 
@@ -178,22 +178,19 @@ mkInfoTableContents dflags
        ; let
              std_info = mkStdInfoTable dflags prof_lits rts_tag srt_bitmap liveness_lit
              rts_tag | Just tag <- mb_rts_tag = tag
-                     | null liveness_data     = rET_SMALL dflags -- Fits in extra_bits
-                     | otherwise              = rET_BIG   dflags -- Does not; extra_bits is
-                                                                 -- a label
+                     | null liveness_data     = rET_SMALL -- Fits in extra_bits
+                     | otherwise              = rET_BIG   -- Does not; extra_bits is
+                                                          -- a label
        ; return (prof_data ++ liveness_data, (std_info, srt_label)) }
 
   | HeapRep _ ptrs nonptrs closure_type <- smrep
-  = do { let layout  = packHalfWordsCLit
-                           dflags
-                           (toStgHalfWord dflags (toInteger ptrs))
-                           (toStgHalfWord dflags (toInteger nonptrs))
+  = do { let layout  = packIntsCLit dflags ptrs nonptrs
        ; (prof_lits, prof_data) <- mkProfLits dflags prof
        ; let (srt_label, srt_bitmap) = mkSRTLit dflags srt
        ; (mb_srt_field, mb_layout, extra_bits, ct_data)
                                 <- mk_pieces closure_type srt_label
        ; let std_info = mkStdInfoTable dflags prof_lits
-                                       (mb_rts_tag   `orElse` rtsClosureType dflags smrep)
+                                       (mb_rts_tag   `orElse` rtsClosureType smrep)
                                        (mb_srt_field `orElse` srt_bitmap)
                                        (mb_layout    `orElse` layout)
        ; return (prof_data ++ ct_data, (std_info, extra_bits)) }
@@ -205,24 +202,25 @@ mkInfoTableContents dflags
                  	, [RawCmmDecl])	     -- Auxiliary data decls 
     mk_pieces (Constr con_tag con_descr) _no_srt    -- A data constructor
       = do { (descr_lit, decl) <- newStringLit con_descr
-      	   ; return (Just con_tag, Nothing, [descr_lit], [decl]) }
+           ; return ( Just (toStgHalfWord dflags (fromIntegral con_tag))
+                    , Nothing, [descr_lit], [decl]) }
 
     mk_pieces Thunk srt_label
       = return (Nothing, Nothing, srt_label, [])
 
     mk_pieces (ThunkSelector offset) _no_srt
-      = return (Just (toStgHalfWord dflags 0), Just (mkWordCLit dflags offset), [], [])
+      = return (Just (toStgHalfWord dflags 0), Just (mkWordCLit dflags (fromIntegral offset)), [], [])
          -- Layout known (one free var); we use the layout field for offset
 
     mk_pieces (Fun arity (ArgSpec fun_type)) srt_label 
-      = do { let extra_bits = packHalfWordsCLit dflags fun_type arity : srt_label
+      = do { let extra_bits = packIntsCLit dflags fun_type arity : srt_label
            ; return (Nothing, Nothing,  extra_bits, []) }
 
     mk_pieces (Fun arity (ArgGen arg_bits)) srt_label
       = do { (liveness_lit, liveness_data) <- mkLivenessBits dflags arg_bits
-           ; let fun_type | null liveness_data = aRG_GEN     dflags
-                          | otherwise          = aRG_GEN_BIG dflags
-                 extra_bits = [ packHalfWordsCLit dflags fun_type arity
+           ; let fun_type | null liveness_data = aRG_GEN
+                          | otherwise          = aRG_GEN_BIG
+                 extra_bits = [ packIntsCLit dflags fun_type arity
                               , srt_lit, liveness_lit, slow_entry ]
            ; return (Nothing, Nothing, extra_bits, liveness_data) }
       where
@@ -233,9 +231,14 @@ mkInfoTableContents dflags
 
     mk_pieces BlackHole _ = panic "mk_pieces: BlackHole"
 
-
 mkInfoTableContents _ _ _ = panic "mkInfoTableContents"   -- NonInfoTable dealt with earlier
 
+packIntsCLit :: DynFlags -> Int -> Int -> CmmLit
+packIntsCLit dflags a b = packHalfWordsCLit dflags
+                           (toStgHalfWord dflags (fromIntegral a))
+                           (toStgHalfWord dflags (fromIntegral b))
+
+
 mkSRTLit :: DynFlags
          -> C_SRT
          -> ([CmmLit],    -- srt_label, if any
@@ -314,7 +317,7 @@ mkLivenessBits dflags liveness
                  [mkRODataLits bitmap_lbl lits]) }
 
   | otherwise -- Fits in one word
-  = return (mkWordCLit dflags bitmap_word, [])
+  = return (mkStgWordCLit dflags bitmap_word, [])
   where
     n_bits = length liveness
 
@@ -328,7 +331,8 @@ mkLivenessBits dflags liveness
     bitmap_word = toStgWord dflags (fromIntegral n_bits)
               .|. (small_bitmap `shiftL` bITMAP_BITS_SHIFT dflags)
 
-    lits = mkWordCLit dflags (toStgWord dflags (fromIntegral n_bits)) : map (mkWordCLit dflags) bitmap
+    lits = mkWordCLit dflags (fromIntegral n_bits)
+         : map (mkStgWordCLit dflags) bitmap
       -- The first word is the size.  The structure must match
       -- StgLargeBitmap in includes/rts/storage/InfoTable.h
 
@@ -348,8 +352,8 @@ mkLivenessBits dflags liveness
 mkStdInfoTable
    :: DynFlags
    -> (CmmLit,CmmLit)	-- Closure type descr and closure descr  (profiling)
-   -> StgHalfWord	-- Closure RTS tag 
-   -> StgHalfWord	-- SRT length
+   -> Int               -- Closure RTS tag
+   -> StgHalfWord       -- SRT length
    -> CmmLit		-- layout field
    -> [CmmLit]
 
@@ -365,7 +369,7 @@ mkStdInfoTable dflags (type_descr, closure_descr) cl_type srt_len layout_lit
 	| dopt Opt_SccProfilingOn dflags = [type_descr, closure_descr]
 	| otherwise = []
 
-    type_lit = packHalfWordsCLit dflags cl_type srt_len
+    type_lit = packHalfWordsCLit dflags (toStgHalfWord dflags (fromIntegral cl_type)) srt_len
 
 -------------------------------------------------------------------------
 --
diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
index 6f75f5451c..5c4045778a 100644
--- a/compiler/cmm/CmmLayoutStack.hs
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -933,7 +933,7 @@ lowerSafeForeignCall dflags block
 
         (ret_args, regs, copyout) = copyOutOflow dflags NativeReturn Jump (Young succ)
                                            (map (CmmReg . CmmLocal) res)
-                                           updfr (0, [])
+                                           updfr []
 
         -- NB. after resumeThread returns, the top-of-stack probably contains
         -- the stack frame for succ, but it might not: if the current thread
@@ -973,14 +973,14 @@ callSuspendThread :: DynFlags -> LocalReg -> Bool -> CmmNode O O
 callSuspendThread dflags id intrbl =
   CmmUnsafeForeignCall
        (ForeignTarget (foreignLbl (fsLit "suspendThread"))
-             (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint]))
+        (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint] CmmMayReturn))
        [id] [CmmReg (CmmGlobal BaseReg), mkIntExpr dflags (fromEnum intrbl)]
 
 callResumeThread :: LocalReg -> LocalReg -> CmmNode O O
 callResumeThread new_base id =
   CmmUnsafeForeignCall
        (ForeignTarget (foreignLbl (fsLit "resumeThread"))
-            (ForeignConvention CCallConv [AddrHint] [AddrHint]))
+            (ForeignConvention CCallConv [AddrHint] [AddrHint] CmmMayReturn))
        [new_base] [CmmReg (CmmLocal id)]
 
 -- -----------------------------------------------------------------------------
diff --git a/compiler/cmm/CmmLex.x b/compiler/cmm/CmmLex.x
index ddd681d25e..718eb27c82 100644
--- a/compiler/cmm/CmmLex.x
+++ b/compiler/cmm/CmmLex.x
@@ -23,9 +23,9 @@ module CmmLex (
    CmmToken(..), cmmlex,
   ) where
 
-import OldCmm
-import Lexer
+import CmmExpr
 
+import Lexer
 import SrcLoc
 import UniqFM
 import StringBuffer
@@ -147,6 +147,7 @@ data CmmToken
   | CmmT_align
   | CmmT_goto
   | CmmT_if
+  | CmmT_call
   | CmmT_jump
   | CmmT_foreign
   | CmmT_never
@@ -157,6 +158,7 @@ data CmmToken
   | CmmT_switch
   | CmmT_case
   | CmmT_default
+  | CmmT_push
   | CmmT_bits8
   | CmmT_bits16
   | CmmT_bits32
@@ -224,8 +226,9 @@ reservedWordsFM = listToUFM $
 	( "align",		CmmT_align ),
 	( "goto",		CmmT_goto ),
 	( "if",			CmmT_if ),
-	( "jump",		CmmT_jump ),
-	( "foreign",		CmmT_foreign ),
+        ( "call",               CmmT_call ),
+        ( "jump",               CmmT_jump ),
+        ( "foreign",            CmmT_foreign ),
 	( "never",		CmmT_never ),
 	( "prim",		CmmT_prim ),
 	( "return",		CmmT_return ),
@@ -233,8 +236,9 @@ reservedWordsFM = listToUFM $
 	( "import",		CmmT_import ),
 	( "switch",		CmmT_switch ),
 	( "case",		CmmT_case ),
-	( "default",		CmmT_default ),
-	( "bits8",		CmmT_bits8 ),
+        ( "default",            CmmT_default ),
+        ( "push",               CmmT_push ),
+        ( "bits8",              CmmT_bits8 ),
 	( "bits16",		CmmT_bits16 ),
 	( "bits32",		CmmT_bits32 ),
 	( "bits64",		CmmT_bits64 ),
diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 520c7e7a7d..c00cdb5b5a 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -16,7 +16,7 @@ module CmmMachOp
     , mo_32To8, mo_32To16, mo_WordTo8, mo_WordTo16, mo_WordTo32, mo_WordTo64
 
     -- CallishMachOp
-    , CallishMachOp(..)
+    , CallishMachOp(..), callishMachOpHints
     , pprCallishMachOp
    )
 where
@@ -463,3 +463,10 @@ data CallishMachOp
 pprCallishMachOp :: CallishMachOp -> SDoc
 pprCallishMachOp mo = text (show mo)
 
+callishMachOpHints :: CallishMachOp -> ([ForeignHint], [ForeignHint])
+callishMachOpHints op = case op of
+  MO_Memcpy  -> ([], [AddrHint,AddrHint,NoHint,NoHint])
+  MO_Memset  -> ([], [AddrHint,NoHint,NoHint,NoHint])
+  MO_Memmove -> ([], [AddrHint,AddrHint,NoHint,NoHint])
+  _          -> ([],[])
+  -- empty lists indicate NoHint
diff --git a/compiler/cmm/CmmNode.hs b/compiler/cmm/CmmNode.hs
index ae7ac091de..b7bb270bd6 100644
--- a/compiler/cmm/CmmNode.hs
+++ b/compiler/cmm/CmmNode.hs
@@ -9,8 +9,9 @@
 -- for details
 
 module CmmNode (
-     CmmNode(..), ForeignHint(..), CmmFormal, CmmActual,
+     CmmNode(..), CmmFormal, CmmActual,
      UpdFrameOffset, Convention(..), ForeignConvention(..), ForeignTarget(..),
+     CmmReturnInfo(..),
      mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf,
      mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors
   ) where
@@ -228,14 +229,31 @@ type CmmFormal = LocalReg
 
 type UpdFrameOffset = ByteOff
 
+-- | A convention maps a list of values (function arguments or return
+-- values) to registers or stack locations.
 data Convention
-  = NativeDirectCall -- Native C-- call skipping the node (closure) argument
-  | NativeNodeCall   -- Native C-- call including the node argument
-  | NativeReturn     -- Native C-- return
-  | Slow             -- Slow entry points: all args pushed on the stack
-  | GC               -- Entry to the garbage collector: uses the node reg!
-  | PrimOpCall       -- Calling prim ops
-  | PrimOpReturn     -- Returning from prim ops
+  = NativeDirectCall
+       -- ^ top-level Haskell functions use @NativeDirectCall@, which
+       -- maps arguments to registers starting with R2, according to
+       -- how many registers are available on the platform.  This
+       -- convention ignores R1, because for a top-level function call
+       -- the function closure is implicit, and doesn't need to be passed.
+  | NativeNodeCall
+       -- ^ non-top-level Haskell functions, which pass the address of
+       -- the function closure in R1 (regardless of whether R1 is a
+       -- real register or not), and the rest of the arguments in
+       -- registers or on the stack.
+  | NativeReturn
+       -- ^ a native return.  The convention for returns depends on
+       -- how many values are returned: for just one value returned,
+       -- the appropriate register is used (R1, F1, etc.). regardless
+       -- of whether it is a real register or not.  For multiple
+       -- values returned, they are mapped to registers or the stack.
+  | Slow
+       -- ^ Slow entry points: all args pushed on the stack
+  | GC
+       -- ^ Entry to the garbage collector: uses the node reg!
+       -- (TODO: I don't think we need this --SDM)
   deriving( Eq )
 
 data ForeignConvention
@@ -243,8 +261,14 @@ data ForeignConvention
         CCallConv               -- Which foreign-call convention
         [ForeignHint]           -- Extra info about the args
         [ForeignHint]           -- Extra info about the result
+        CmmReturnInfo
   deriving Eq
 
+data CmmReturnInfo
+  = CmmMayReturn
+  | CmmNeverReturns
+  deriving ( Eq )
+
 data ForeignTarget        -- The target of a foreign call
   = ForeignTarget                -- A foreign procedure
         CmmExpr                  -- Its address
@@ -253,12 +277,6 @@ data ForeignTarget        -- The target of a foreign call
         CallishMachOp            -- Which one
   deriving Eq
 
-data ForeignHint
-  = NoHint | AddrHint | SignedHint
-  deriving( Eq )
-        -- Used to give extra per-argument or per-result
-        -- information needed by foreign calling conventions
-
 --------------------------------------------------
 -- Instances of register and slot users / definers
 
diff --git a/compiler/cmm/CmmOpt.hs b/compiler/cmm/CmmOpt.hs
index 32afa1d078..843626303a 100644
--- a/compiler/cmm/CmmOpt.hs
+++ b/compiler/cmm/CmmOpt.hs
@@ -14,6 +14,7 @@ module CmmOpt (
 
 #include "HsVersions.h"
 
+import CmmUtils
 import OldCmm
 import DynFlags
 import CLabel
@@ -184,22 +185,22 @@ cmmMachOpFoldM dflags mop1@(MO_Add{}) [CmmMachOp mop2@(MO_Sub{}) [arg1,arg2], ar
 
 -- Make a RegOff if we can
 cmmMachOpFoldM _ (MO_Add _) [CmmReg reg, CmmLit (CmmInt n rep)]
-  = Just $ CmmRegOff reg (fromIntegral (narrowS rep n))
+  = Just $ cmmRegOff reg (fromIntegral (narrowS rep n))
 cmmMachOpFoldM _ (MO_Add _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
-  = Just $ CmmRegOff reg (off + fromIntegral (narrowS rep n))
+  = Just $ cmmRegOff reg (off + fromIntegral (narrowS rep n))
 cmmMachOpFoldM _ (MO_Sub _) [CmmReg reg, CmmLit (CmmInt n rep)]
-  = Just $ CmmRegOff reg (- fromIntegral (narrowS rep n))
+  = Just $ cmmRegOff reg (- fromIntegral (narrowS rep n))
 cmmMachOpFoldM _ (MO_Sub _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
-  = Just $ CmmRegOff reg (off - fromIntegral (narrowS rep n))
+  = Just $ cmmRegOff reg (off - fromIntegral (narrowS rep n))
 
 -- Fold label(+/-)offset into a CmmLit where possible
 
-cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmLabel lbl), CmmLit (CmmInt i rep)]
-  = Just $ CmmLit (CmmLabelOff lbl (fromIntegral (narrowU rep i)))
-cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmInt i rep), CmmLit (CmmLabel lbl)]
-  = Just $ CmmLit (CmmLabelOff lbl (fromIntegral (narrowU rep i)))
-cmmMachOpFoldM _ (MO_Sub _) [CmmLit (CmmLabel lbl), CmmLit (CmmInt i rep)]
-  = Just $ CmmLit (CmmLabelOff lbl (fromIntegral (negate (narrowU rep i))))
+cmmMachOpFoldM _ (MO_Add _) [CmmLit lit, CmmLit (CmmInt i rep)]
+  = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
+cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmInt i rep), CmmLit lit]
+  = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
+cmmMachOpFoldM _ (MO_Sub _) [CmmLit lit, CmmLit (CmmInt i rep)]
+  = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (negate (narrowU rep i))))
 
 
 -- Comparison of literal with widened operand: perform the comparison
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index 8c3559b774..22e28a8a9d 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -1,14 +1,160 @@
 -----------------------------------------------------------------------------
 --
--- (c) The University of Glasgow, 2004-2006
+-- (c) The University of Glasgow, 2004-2012
 --
 -- Parser for concrete Cmm.
--- This doesn't just parse the Cmm file, we also do some code generation
--- along the way for switches and foreign calls etc.
 --
 -----------------------------------------------------------------------------
 
--- TODO: Add support for interruptible/uninterruptible foreign call specification
+{- -----------------------------------------------------------------------------
+Note [Syntax of .cmm files]
+
+NOTE: You are very much on your own in .cmm.  There is very little
+error checking at all:
+
+  * Type errors are detected by the (optional) -dcmm-lint pass, if you
+    don't turn this on then a type error will likely result in a panic
+    from the native code generator.
+
+  * Passing the wrong number of arguments or arguments of the wrong
+    type is not detected.
+
+There are two ways to write .cmm code:
+
+ (1) High-level Cmm code delegates the stack handling to GHC, and
+     never explicitly mentions Sp or registers.
+
+ (2) Low-level Cmm manages the stack itself, and must know about
+     calling conventions.
+
+Whether you want high-level or low-level Cmm is indicated by the
+presence of an argument list on a procedure.  For example:
+
+foo ( gcptr a, bits32 b )
+{
+  // this is high-level cmm code
+
+  if (b > 0) {
+     // we can make tail calls passing arguments:
+     jump stg_ap_0_fast(a);
+  }
+
+  push (stg_upd_frame_info, a) {
+    // stack frames can be explicitly pushed
+
+    (x,y) = call wibble(a,b,3,4);
+      // calls pass arguments and return results using the native
+      // Haskell calling convention.  The code generator will automatically
+      // construct a stack frame and an info table for the continuation.
+
+    return (x,y);
+      // we can return multiple values from the current proc
+  }
+}
+
+bar
+{
+  // this is low-level cmm code, indicated by the fact that we did not
+  // put an argument list on bar.
+
+  x = R1;  // the calling convention is explicit: better be careful
+           // that this works on all platforms!
+
+  jump %ENTRY_CODE(Sp(0))
+}
+
+Here is a list of rules for high-level and low-level code.  If you
+break the rules, you get a panic (for using a high-level construct in
+a low-level proc), or wrong code (when using low-level code in a
+high-level proc).  This stuff isn't checked! (TODO!)
+
+High-level only:
+
+  - tail-calls with arguments, e.g.
+    jump stg_fun (arg1, arg2);
+
+  - function calls:
+    (ret1,ret2) = call stg_fun (arg1, arg2);
+
+    This makes a call with the NativeNodeCall convention, and the
+    values are returned to the following code using the NativeReturn
+    convention.
+
+  - returning:
+    return (ret1, ret2)
+
+    These use the NativeReturn convention to return zero or more
+    results to the caller.
+
+  - pushing stack frames:
+    push (info_ptr, field1, ..., fieldN) { ... statements ... }
+
+Low-level only:
+
+  - References to Sp, R1-R8, F1-F4 etc.
+
+    NB. foreign calls may clobber the argument registers R1-R8, F1-F4
+    etc., so ensure they are saved into variables around foreign
+    calls.
+
+  - SAVE_THREAD_STATE() and LOAD_THREAD_STATE(), which modify Sp
+    directly.
+
+Both high-level and low-level code can use a raw tail-call:
+
+    jump stg_fun [R1,R2]
+
+This always transfers control to a low-level Cmm function, but the
+call can be made from high-level code.  Arguments must be passed
+explicitly in R/F/D/L registers.
+
+NB. you *must* specify the list of GlobalRegs that are passed via a
+jump, otherwise the register allocator will assume that all the
+GlobalRegs are dead at the jump.
+
+
+A stack frame is written like this:
+
+INFO_TABLE_RET ( label, FRAME_TYPE, info_ptr, field1, ..., fieldN )
+               return ( arg1, ..., argM )
+{
+  ... code ...
+}
+
+where field1 ... fieldN are the fields of the stack frame (with types)
+arg1...argN are the values returned to the stack frame (with types).
+The return values are assumed to be passed according to the
+NativeReturn convention.
+
+On entry to the code, the stack frame looks like:
+
+   |----------|
+   | fieldN   |
+   |   ...    |
+   | field1   |
+   |----------|
+   | info_ptr |
+   |----------|
+   |  argN    |
+   |   ...    | <- Sp
+
+and some of the args may be in registers.
+
+We prepend the code by a copyIn of the args, and assign all the stack
+frame fields to their formals.  The initial "arg offset" for stack
+layout purposes consists of the whole stack frame plus any args that
+might be on the stack.
+
+A tail-call may pass a stack frame to the callee using the following
+syntax:
+
+jump f (info_ptr, field1,..,fieldN) (arg1,..,argN)
+
+where info_ptr and field1..fieldN describe the stack frame, and
+arg1..argN are the arguments passed to f using the NativeNodeCall
+convention.
+
+----------------------------------------------------------------------------- -}
 
 {
 {-# LANGUAGE BangPatterns #-} -- required for versions of Happy before 1.18.6
@@ -21,30 +167,32 @@
 
 module CmmParse ( parseCmmFile ) where
 
-import CgMonad
-import CgExtCode
-import CgHeapery
-import CgUtils
-import CgProf
-import CgTicky
-import CgInfoTbls
-import CgForeignCall
-import CgTailCall
-import CgStackery
-import ClosureInfo
-import CgCallConv
-import CgClosure
-import CostCentre
-
-import BlockId
-import OldCmm
-import OldPprCmm()
+import StgCmmExtCode
+import CmmCallConv
+import StgCmmProf
+import StgCmmHeap
+import StgCmmMonad hiding ( getCode, getCodeR, emitLabel, emit, emitStore
+                          , emitAssign, emitOutOfLine, withUpdFrameOff
+                          , getUpdFrameOff )
+import qualified StgCmmMonad as F
+import StgCmmUtils
+import StgCmmForeign
+import StgCmmExpr
+import StgCmmClosure
+import StgCmmLayout
+import StgCmmTicky
+import StgCmmBind       ( emitBlackHoleCode, emitUpdateFrame )
+
+import MkGraph
+import Cmm
 import CmmUtils
+import BlockId
 import CmmLex
 import CLabel
 import SMRep
 import Lexer
 
+import CostCentre
 import ForeignCall
 import Module
 import Platform
@@ -68,6 +216,7 @@ import Control.Monad
 import Data.Array
 import Data.Char        ( ord )
 import System.Exit
+import Data.Maybe
 
 #include "HsVersions.h"
 }
@@ -110,41 +259,43 @@ import System.Exit
         '&&'    { L _ (CmmT_BoolAnd) }
         '||'    { L _ (CmmT_BoolOr) }
 
-        'CLOSURE'               { L _ (CmmT_CLOSURE) }
-        'INFO_TABLE'            { L _ (CmmT_INFO_TABLE) }
-        'INFO_TABLE_RET'        { L _ (CmmT_INFO_TABLE_RET) }
-        'INFO_TABLE_FUN'        { L _ (CmmT_INFO_TABLE_FUN) }
-        'INFO_TABLE_CONSTR'     { L _ (CmmT_INFO_TABLE_CONSTR) }
-        'INFO_TABLE_SELECTOR'   { L _ (CmmT_INFO_TABLE_SELECTOR) }
-        'else'                  { L _ (CmmT_else) }
-        'export'                { L _ (CmmT_export) }
-        'section'               { L _ (CmmT_section) }
-        'align'                 { L _ (CmmT_align) }
-        'goto'                  { L _ (CmmT_goto) }
-        'if'                    { L _ (CmmT_if) }
-        'jump'                  { L _ (CmmT_jump) }
-        'foreign'               { L _ (CmmT_foreign) }
-        'never'                 { L _ (CmmT_never) }
-        'prim'                  { L _ (CmmT_prim) }
-        'return'                { L _ (CmmT_return) }
-        'returns'               { L _ (CmmT_returns) }
-        'import'                { L _ (CmmT_import) }
-        'switch'                { L _ (CmmT_switch) }
-        'case'                  { L _ (CmmT_case) }
-        'default'               { L _ (CmmT_default) }
-        'bits8'                 { L _ (CmmT_bits8) }
-        'bits16'                { L _ (CmmT_bits16) }
-        'bits32'                { L _ (CmmT_bits32) }
-        'bits64'                { L _ (CmmT_bits64) }
-        'float32'               { L _ (CmmT_float32) }
-        'float64'               { L _ (CmmT_float64) }
-        'gcptr'                 { L _ (CmmT_gcptr) }
-
-        GLOBALREG               { L _ (CmmT_GlobalReg   $$) }
-        NAME                    { L _ (CmmT_Name        $$) }
-        STRING                  { L _ (CmmT_String      $$) }
-        INT                     { L _ (CmmT_Int         $$) }
-        FLOAT                   { L _ (CmmT_Float       $$) }
+        'CLOSURE'       { L _ (CmmT_CLOSURE) }
+	'INFO_TABLE'	{ L _ (CmmT_INFO_TABLE) }
+	'INFO_TABLE_RET'{ L _ (CmmT_INFO_TABLE_RET) }
+	'INFO_TABLE_FUN'{ L _ (CmmT_INFO_TABLE_FUN) }
+	'INFO_TABLE_CONSTR'{ L _ (CmmT_INFO_TABLE_CONSTR) }
+	'INFO_TABLE_SELECTOR'{ L _ (CmmT_INFO_TABLE_SELECTOR) }
+	'else'		{ L _ (CmmT_else) }
+	'export'	{ L _ (CmmT_export) }
+	'section'	{ L _ (CmmT_section) }
+	'align'		{ L _ (CmmT_align) }
+	'goto'		{ L _ (CmmT_goto) }
+	'if'		{ L _ (CmmT_if) }
+        'call'          { L _ (CmmT_call) }
+        'jump'          { L _ (CmmT_jump) }
+        'foreign'       { L _ (CmmT_foreign) }
+	'never'		{ L _ (CmmT_never) }
+	'prim'		{ L _ (CmmT_prim) }
+	'return'	{ L _ (CmmT_return) }
+	'returns'	{ L _ (CmmT_returns) }
+	'import'	{ L _ (CmmT_import) }
+	'switch'	{ L _ (CmmT_switch) }
+	'case'		{ L _ (CmmT_case) }
+        'default'       { L _ (CmmT_default) }
+        'push'          { L _ (CmmT_push) }
+        'bits8'         { L _ (CmmT_bits8) }
+	'bits16'	{ L _ (CmmT_bits16) }
+	'bits32'	{ L _ (CmmT_bits32) }
+	'bits64'	{ L _ (CmmT_bits64) }
+	'float32'	{ L _ (CmmT_float32) }
+	'float64'	{ L _ (CmmT_float64) }
+	'gcptr'	        { L _ (CmmT_gcptr) }
+
+	GLOBALREG	{ L _ (CmmT_GlobalReg   $$) }
+  	NAME		{ L _ (CmmT_Name	$$) }
+	STRING		{ L _ (CmmT_String	$$) }
+	INT		{ L _ (CmmT_Int		$$) }
+	FLOAT		{ L _ (CmmT_Float	$$) }
 
 %monad { P } { >>= } { return }
 %lexer { cmmlex } { L _ CmmT_EOF }
@@ -166,18 +317,18 @@ import System.Exit
 
 %%
 
-cmm     :: { ExtCode }
-        : {- empty -}                   { return () }
-        | cmmtop cmm                    { do $1; $2 }
+cmm     :: { CmmParse () }
+	: {- empty -}			{ return () }
+	| cmmtop cmm			{ do $1; $2 }
 
-cmmtop  :: { ExtCode }
-        : cmmproc                       { $1 }
-        | cmmdata                       { $1 }
-        | decl                          { $1 }
-        | 'CLOSURE' '(' NAME ',' NAME lits ')' ';'
-                {% withThisPackage $ \pkg ->
-                   do lits <- sequence $6;
-                      staticClosure pkg $3 $5 (map getLit lits) }
+cmmtop  :: { CmmParse () }
+	: cmmproc			{ $1 }
+	| cmmdata			{ $1 }
+	| decl				{ $1 } 
+	| 'CLOSURE' '(' NAME ',' NAME lits ')' ';'  
+		{% withThisPackage $ \pkg -> 
+		   do lits <- sequence $6;
+		      staticClosure pkg $3 $5 (map getLit lits) }
 
 -- The only static closures in the RTS are dummy closures like
 -- stg_END_TSO_QUEUE_closure and stg_dummy_ret.  We don't need
@@ -188,37 +339,37 @@ cmmtop  :: { ExtCode }
 --      * payload is always empty
 --      * we can derive closure and info table labels from a single NAME
 
-cmmdata :: { ExtCode }
-        : 'section' STRING '{' data_label statics '}'
-                { do lbl <- $4;
-                     ss <- sequence $5;
-                     code (emitDecl (CmmData (section $2) (Statics lbl $ concat ss))) }
-
-data_label :: { ExtFCode CLabel }
-    : NAME ':'
-                {% withThisPackage $ \pkg ->
-                   return (mkCmmDataLabel pkg $1) }
-
-statics :: { [ExtFCode [CmmStatic]] }
-        : {- empty -}                   { [] }
-        | static statics                { $1 : $2 }
-
+cmmdata :: { CmmParse () }
+	: 'section' STRING '{' data_label statics '}' 
+		{ do lbl <- $4;
+		     ss <- sequence $5;
+		     code (emitDecl (CmmData (section $2) (Statics lbl $ concat ss))) }
+
+data_label :: { CmmParse CLabel }
+    : NAME ':'	
+		{% withThisPackage $ \pkg -> 
+		   return (mkCmmDataLabel pkg $1) }
+
+statics	:: { [CmmParse [CmmStatic]] }
+	: {- empty -}			{ [] }
+	| static statics		{ $1 : $2 }
+    
 -- Strings aren't used much in the RTS HC code, so it doesn't seem
 -- worth allowing inline strings.  C-- doesn't allow them anyway.
-static  :: { ExtFCode [CmmStatic] }
-        : type expr ';' { do e <- $2;
-                             return [CmmStaticLit (getLit e)] }
-        | type ';'                      { return [CmmUninitialised
-                                                        (widthInBytes (typeWidth $1))] }
-        | 'bits8' '[' ']' STRING ';'    { return [mkString $4] }
-        | 'bits8' '[' INT ']' ';'       { return [CmmUninitialised
-                                                        (fromIntegral $3)] }
-        | typenot8 '[' INT ']' ';'      { return [CmmUninitialised
-                                                (widthInBytes (typeWidth $1) *
-                                                        fromIntegral $3)] }
-        | 'CLOSURE' '(' NAME lits ')'
-                { do { lits <- sequence $4
-             ; dflags <- getDynFlags
+static  :: { CmmParse [CmmStatic] }
+	: type expr ';'	{ do e <- $2;
+			     return [CmmStaticLit (getLit e)] }
+	| type ';'			{ return [CmmUninitialised
+							(widthInBytes (typeWidth $1))] }
+        | 'bits8' '[' ']' STRING ';'	{ return [mkString $4] }
+        | 'bits8' '[' INT ']' ';'	{ return [CmmUninitialised 
+							(fromIntegral $3)] }
+        | typenot8 '[' INT ']' ';'	{ return [CmmUninitialised 
+						(widthInBytes (typeWidth $1) * 
+							fromIntegral $3)] }
+	| 'CLOSURE' '(' NAME lits ')'
+		{ do { lits <- sequence $4
+                ; dflags <- getDynFlags
                      ; return $ map CmmStaticLit $
                         mkStaticClosure dflags (mkForeignLabel $3 Nothing ForeignLabelInExternalPackage IsData)
                          -- mkForeignLabel because these are only used
@@ -226,140 +377,140 @@ static  :: { ExtFCode [CmmStatic] }
                         dontCareCCS (map getLit lits) [] [] [] } }
         -- arrays of closures required for the CHARLIKE & INTLIKE arrays
 
-lits    :: { [ExtFCode CmmExpr] }
-        : {- empty -}           { [] }
-        | ',' expr lits         { $2 : $3 }
-
-cmmproc :: { ExtCode }
--- TODO: add real SRT/info tables to parsed Cmm
-        : info maybe_formals_without_hints '{' body '}'
-                { do ((entry_ret_label, info, live, formals), stmts) <-
-                       getCgStmtsEC' $ loopDecls $ do {
-                         (entry_ret_label, info, live) <- $1;
-                         formals <- sequence $2;
+lits    :: { [CmmParse CmmExpr] }
+	: {- empty -}		{ [] }
+	| ',' expr lits		{ $2 : $3 }
+
+cmmproc :: { CmmParse () }
+        : info maybe_conv maybe_formals maybe_body
+                { do ((entry_ret_label, info, stk_formals, formals), agraph) <-
+                       getCodeR $ loopDecls $ do {
+                         (entry_ret_label, info, stk_formals) <- $1;
+                         formals <- sequence (fromMaybe [] $3);
                          $4;
-                         return (entry_ret_label, info, live, formals) }
-                     blks <- code (cgStmtsToBlocks stmts)
-                     code (emitInfoTableAndCode entry_ret_label info formals blks) }
+                         return (entry_ret_label, info, stk_formals, formals) }
+                     let do_layout = isJust $3
+                     code (emitProcWithStackFrame $2 info
+                                entry_ret_label stk_formals formals agraph
+                                do_layout ) }
 
-        | info maybe_formals_without_hints ';'
-                { do (entry_ret_label, info, live) <- $1;
-                     formals <- sequence $2;
-                     code (emitInfoTableAndCode entry_ret_label info formals []) }
+maybe_conv :: { Convention }
+           : {- empty -}        { NativeNodeCall }
+           | 'return'           { NativeReturn }
 
-        | NAME maybe_formals_without_hints '{' body '}'
-                {% withThisPackage $ \pkg ->
-                   do   newFunctionName $1 pkg
-                        (formals, stmts) <-
-                                getCgStmtsEC' $ loopDecls $ do {
-                                        formals <- sequence $2;
-                                        $4;
-                                        return formals }
-                        blks <- code (cgStmtsToBlocks stmts)
-                        code (emitProc Nothing (mkCmmCodeLabel pkg $1) formals blks) }
-
-info    :: { ExtFCode (CLabel, CmmInfoTable, [Maybe LocalReg]) }
-        : 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' stgHalfWord ',' STRING ',' STRING ')'
-                -- ptrs, nptrs, closure type, description, type
-                {% withThisPackage $ \pkg ->
+maybe_body :: { CmmParse () }
+           : ';'                { return () }
+           | '{' body '}'       { $2 }
+
+info    :: { CmmParse (CLabel, Maybe CmmInfoTable, [LocalReg]) }
+        : NAME
+		{% withThisPackage $ \pkg ->
+		   do	newFunctionName $1 pkg
+                        return (mkCmmCodeLabel pkg $1, Nothing, []) }
+
+
+        | 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')'
+		-- ptrs, nptrs, closure type, description, type
+		{% withThisPackage $ \pkg ->
                    do dflags <- getDynFlags
                       let prof = profilingInfo dflags $11 $13
-                          rep  = mkRTSRep $9 $
+                          rep  = mkRTSRep (fromIntegral $9) $
                                    mkHeapRep dflags False (fromIntegral $5)
                                                    (fromIntegral $7) Thunk
                               -- not really Thunk, but that makes the info table
                               -- we want.
                       return (mkCmmEntryLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
-
-        | 'INFO_TABLE_FUN' '(' NAME ',' INT ',' INT ',' stgHalfWord ',' STRING ',' STRING ',' stgHalfWord ')'
-                -- ptrs, nptrs, closure type, description, type, fun type
-                {% withThisPackage $ \pkg ->
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+			      []) }
+	
+        | 'INFO_TABLE_FUN' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ',' INT ')'
+		-- ptrs, nptrs, closure type, description, type, fun type
+		{% withThisPackage $ \pkg -> 
                    do dflags <- getDynFlags
                       let prof = profilingInfo dflags $11 $13
-                          ty   = Fun (toStgHalfWord dflags 0) (ArgSpec $15)
+                          ty   = Fun 0 (ArgSpec (fromIntegral $15))
                                 -- Arity zero, arg_type $15
-                          rep = mkRTSRep $9 $
+                          rep = mkRTSRep (fromIntegral $9) $
                                     mkHeapRep dflags False (fromIntegral $5)
                                                     (fromIntegral $7) ty
                       return (mkCmmEntryLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
-                -- we leave most of the fields zero here.  This is only used
-                -- to generate the BCO info table in the RTS at the moment.
-
-        | 'INFO_TABLE_CONSTR' '(' NAME ',' INT ',' INT ',' stgHalfWord ',' stgHalfWord ',' STRING ',' STRING ')'
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+			      []) }
+		-- we leave most of the fields zero here.  This is only used
+		-- to generate the BCO info table in the RTS at the moment.
+
+        | 'INFO_TABLE_CONSTR' '(' NAME ',' INT ',' INT ',' INT ',' INT ',' STRING ',' STRING ')'
                 -- ptrs, nptrs, tag, closure type, description, type
                 {% withThisPackage $ \pkg ->
                    do dflags <- getDynFlags
                       let prof = profilingInfo dflags $13 $15
-                          ty  = Constr $9  -- Tag
+                          ty  = Constr (fromIntegral $9)  -- Tag
                                        (stringToWord8s $13)
-                          rep = mkRTSRep $11 $
+                          rep = mkRTSRep (fromIntegral $11) $
                                   mkHeapRep dflags False (fromIntegral $5)
                                                   (fromIntegral $7) ty
                       return (mkCmmEntryLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
-
-                     -- If profiling is on, this string gets duplicated,
-                     -- but that's the way the old code did it we can fix it some other time.
-
-        | 'INFO_TABLE_SELECTOR' '(' NAME ',' stgWord ',' stgHalfWord ',' STRING ',' STRING ')'
-                -- selector, closure type, description, type
-                {% withThisPackage $ \pkg ->
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+			      []) }
+
+		     -- If profiling is on, this string gets duplicated,
+		     -- but that's the way the old code did it we can fix it some other time.
+	
+        | 'INFO_TABLE_SELECTOR' '(' NAME ',' INT ',' INT ',' STRING ',' STRING ')'
+		-- selector, closure type, description, type
+		{% withThisPackage $ \pkg ->
                    do dflags <- getDynFlags
                       let prof = profilingInfo dflags $9 $11
-                          ty  = ThunkSelector $5
-                          rep = mkRTSRep $7 $
+                          ty  = ThunkSelector (fromIntegral $5)
+                          rep = mkRTSRep (fromIntegral $7) $
                                    mkHeapRep dflags False 0 0 ty
                       return (mkCmmEntryLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
-
-        | 'INFO_TABLE_RET' '(' NAME ',' stgHalfWord ')'
-                -- closure type (no live regs)
-                {% withThisPackage $ \pkg ->
-                   do let prof = NoProfilingInfo
-                          rep  = mkRTSRep $5 $ mkStackRep []
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+			      []) }
+
+        | 'INFO_TABLE_RET' '(' NAME ',' INT ')'
+		-- closure type (no live regs)
+		{% withThisPackage $ \pkg ->
+		   do let prof = NoProfilingInfo
+                          rep  = mkRTSRep (fromIntegral $5) $ mkStackRep []
                       return (mkCmmRetLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
-
-        | 'INFO_TABLE_RET' '(' NAME ',' stgHalfWord ',' formals_without_hints0 ')'
-                -- closure type, live regs
-                {% withThisPackage $ \pkg ->
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+			      []) }
+
+        | 'INFO_TABLE_RET' '(' NAME ',' INT ',' formals0 ')'
+		-- closure type, live regs
+		{% withThisPackage $ \pkg ->
                    do dflags <- getDynFlags
-                      live <- sequence (map (liftM Just) $7)
-                      let prof = NoProfilingInfo
-                          bitmap = mkLiveness dflags live
-                          rep  = mkRTSRep $5 $ mkStackRep bitmap
+                      live <- sequence $7
+		      let prof = NoProfilingInfo
+                          -- drop one for the info pointer
+                          bitmap = mkLiveness dflags (map Just (drop 1 live))
+                          rep  = mkRTSRep (fromIntegral $5) $ mkStackRep bitmap
                       return (mkCmmRetLabel pkg $3,
-                              CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
-                                           , cit_rep = rep
-                                           , cit_prof = prof, cit_srt = NoC_SRT },
-                              []) }
+                              Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3
+				           , cit_rep = rep
+               				   , cit_prof = prof, cit_srt = NoC_SRT },
+                              live) }
 
-body    :: { ExtCode }
-        : {- empty -}                   { return () }
-        | decl body                     { do $1; $2 }
-        | stmt body                     { do $1; $2 }
+body    :: { CmmParse () }
+	: {- empty -}			{ return () }
+	| decl body			{ do $1; $2 }
+	| stmt body			{ do $1; $2 }
 
-decl    :: { ExtCode }
-        : type names ';'                { mapM_ (newLocal $1) $2 }
-        | 'import' importNames ';'      { mapM_ newImport $2 }
-        | 'export' names ';'            { return () }  -- ignore exports
+decl    :: { CmmParse () }
+	: type names ';'		{ mapM_ (newLocal $1) $2 }
+	| 'import' importNames ';'	{ mapM_ newImport $2 }
+	| 'export' names ';'		{ return () }  -- ignore exports
 
 
 -- an imported function name, with optional packageId
@@ -371,84 +522,96 @@ importNames
 importName
         :: { (FastString,  CLabel) }
 
-        -- A label imported without an explicit packageId.
-        --      These are taken to come frome some foreign, unnamed package.
-        : NAME
-        { ($1, mkForeignLabel $1 Nothing ForeignLabelInExternalPackage IsFunction) }
-
-        -- A label imported with an explicit packageId.
-        | STRING NAME
-        { ($2, mkCmmCodeLabel (fsToPackageId (mkFastString $1)) $2) }
-
-
-names   :: { [FastString] }
-        : NAME                          { [$1] }
-        | NAME ',' names                { $1 : $3 }
-
-stmt    :: { ExtCode }
-        : ';'                                   { nopEC }
-
-        | NAME ':'
-                { do l <- newLabel $1; code (labelC l) }
-
-        | lreg '=' expr ';'
-                { do reg <- $1; e <- $3; stmtEC (CmmAssign reg e) }
-        | type '[' expr ']' '=' expr ';'
-                { doStore $1 $3 $6 }
-
-        -- Gah! We really want to say "maybe_results" but that causes
-        -- a shift/reduce conflict with assignment.  We either
-        -- we expand out the no-result and single result cases or
-        -- we tweak the syntax to avoid the conflict.  The later
-        -- option is taken here because the other way would require
-        -- multiple levels of expanding and get unwieldy.
-        | maybe_results 'foreign' STRING expr '(' cmm_hint_exprs0 ')' safety vols opt_never_returns ';'
-                {% foreignCall $3 $1 $4 $6 $9 $8 $10 }
-        | maybe_results 'prim' '%' NAME '(' cmm_hint_exprs0 ')' safety vols ';'
-                {% primCall $1 $4 $6 $9 $8 }
-        -- stmt-level macros, stealing syntax from ordinary C-- function calls.
-        -- Perhaps we ought to use the %%-form?
-        | NAME '(' exprs0 ')' ';'
-                {% stmtMacro $1 $3  }
-        | 'switch' maybe_range expr '{' arms default '}'
-                { do as <- sequence $5; doSwitch $2 $3 as $6 }
-        | 'goto' NAME ';'
-                { do l <- lookupLabel $2; stmtEC (CmmBranch l) }
+	-- A label imported without an explicit packageId.
+	--	These are taken to come frome some foreign, unnamed package.
+	: NAME	
+	{ ($1, mkForeignLabel $1 Nothing ForeignLabelInExternalPackage IsFunction) }
+
+	-- A label imported with an explicit packageId.
+	| STRING NAME
+	{ ($2, mkCmmCodeLabel (fsToPackageId (mkFastString $1)) $2) }
+	
+	
+names 	:: { [FastString] }
+	: NAME				{ [$1] }
+	| NAME ',' names		{ $1 : $3 }
+
+stmt	:: { CmmParse () }
+        : ';'                                   { return () }
+
+	| NAME ':'
+                { do l <- newLabel $1; emitLabel l }
+
+
+
+	| lreg '=' expr ';'
+                { do reg <- $1; e <- $3; emitAssign reg e }
+	| type '[' expr ']' '=' expr ';'
+		{ doStore $1 $3 $6 }
+
+        -- Gah! We really want to say "foreign_results" but that causes
+	-- a shift/reduce conflict with assignment.  We either
+	-- we expand out the no-result and single result cases or
+	-- we tweak the syntax to avoid the conflict.  The later
+	-- option is taken here because the other way would require
+	-- multiple levels of expanding and get unwieldy.
+        | foreign_results 'foreign' STRING expr '(' cmm_hint_exprs0 ')' safety opt_never_returns ';'
+                {% foreignCall $3 $1 $4 $6 $8 $9 }
+        | foreign_results 'prim' '%' NAME '(' exprs0 ')' ';'
+                {% primCall $1 $4 $6 }
+	-- stmt-level macros, stealing syntax from ordinary C-- function calls.
+	-- Perhaps we ought to use the %%-form?
+	| NAME '(' exprs0 ')' ';'
+		{% stmtMacro $1 $3  }
+	| 'switch' maybe_range expr '{' arms default '}'
+		{ do as <- sequence $5; doSwitch $2 $3 as $6 }
+	| 'goto' NAME ';'
+                { do l <- lookupLabel $2; emit (mkBranch l) }
+        | 'return' '(' exprs0 ')' ';'
+                { doReturn $3 }
         | 'jump' expr vols ';'
-                { do e <- $2; stmtEC (CmmJump e $3) }
-        | 'return' ';'
-                { stmtEC CmmReturn }
+                { doRawJump $2 $3 }
+        | 'jump' expr '(' exprs0 ')' ';'
+                { doJumpWithStack $2 [] $4 }
+        | 'jump' expr '(' exprs0 ')' '(' exprs0 ')' ';'
+                { doJumpWithStack $2 $4 $7 }
+        | 'call' expr '(' exprs0 ')' ';'
+                { doCall $2 [] $4 }
+        | '(' formals ')' '=' 'call' expr '(' exprs0 ')' ';'
+                { doCall $6 $2 $8 }
         | 'if' bool_expr 'goto' NAME
-                { do l <- lookupLabel $4; cmmRawIf $2 l }
-        | 'if' bool_expr '{' body '}' else
-                { cmmIfThenElse $2 $4 $6 }
+		{ do l <- lookupLabel $4; cmmRawIf $2 l }
+	| 'if' bool_expr '{' body '}' else 	
+		{ cmmIfThenElse $2 $4 $6 }
+        | 'push' '(' exprs0 ')' maybe_body
+                { pushStackFrame $3 $5 }
 
 opt_never_returns :: { CmmReturnInfo }
         :                               { CmmMayReturn }
         | 'never' 'returns'             { CmmNeverReturns }
 
-bool_expr :: { ExtFCode BoolExpr }
-        : bool_op                       { $1 }
-        | expr                          { do e <- $1; return (BoolTest e) }
-
-bool_op :: { ExtFCode BoolExpr }
-        : bool_expr '&&' bool_expr      { do e1 <- $1; e2 <- $3;
-                                          return (BoolAnd e1 e2) }
-        | bool_expr '||' bool_expr      { do e1 <- $1; e2 <- $3;
-                                          return (BoolOr e1 e2)  }
-        | '!' bool_expr                 { do e <- $2; return (BoolNot e) }
-        | '(' bool_op ')'               { $2 }
-
--- This is not C-- syntax.  What to do?
-safety  :: { CmmSafety }
-        : {- empty -}                   { CmmUnsafe } -- Default may change soon
-        | STRING                        {% parseSafety $1 }
-
--- This is not C-- syntax.  What to do?
-vols    :: { Maybe [GlobalReg] }
-        : {- empty -}                   { Nothing }
-        | '[' ']'                       { Just [] }
-        | '[' globals ']'               { Just $2 }
+bool_expr :: { CmmParse BoolExpr }
+	: bool_op			{ $1 }
+	| expr				{ do e <- $1; return (BoolTest e) }
+
+bool_op :: { CmmParse BoolExpr }
+	: bool_expr '&&' bool_expr 	{ do e1 <- $1; e2 <- $3; 
+					  return (BoolAnd e1 e2) }
+	| bool_expr '||' bool_expr	{ do e1 <- $1; e2 <- $3; 
+					  return (BoolOr e1 e2)  }
+	| '!' bool_expr			{ do e <- $2; return (BoolNot e) }
+	| '(' bool_op ')'		{ $2 }
+
+safety  :: { Safety }
+        : {- empty -}                   { PlayRisky }
+	| STRING			{% parseSafety $1 }
+
+vols    :: { [GlobalReg] }
+        : '[' ']'                       { [] }
+        | '[' '*' ']'                   {% do df <- getDynFlags
+                                         ; return (realArgRegs df) }
+                                           -- all of them
+        | '[' globals ']'               { $2 }
 
 globals :: { [GlobalReg] }
         : GLOBALREG                     { [$1] }
@@ -458,67 +621,67 @@ maybe_range :: { Maybe (Int,Int) }
         : '[' INT '..' INT ']'  { Just (fromIntegral $2, fromIntegral $4) }
         | {- empty -}           { Nothing }
 
-arms    :: { [ExtFCode ([Int],Either BlockId ExtCode)] }
-        : {- empty -}                   { [] }
-        | arm arms                      { $1 : $2 }
+arms    :: { [CmmParse ([Int],Either BlockId (CmmParse ()))] }
+	: {- empty -}			{ [] }
+	| arm arms			{ $1 : $2 }
 
-arm     :: { ExtFCode ([Int],Either BlockId ExtCode) }
-        : 'case' ints ':' arm_body      { do b <- $4; return ($2, b) }
+arm     :: { CmmParse ([Int],Either BlockId (CmmParse ())) }
+	: 'case' ints ':' arm_body	{ do b <- $4; return ($2, b) }
 
-arm_body :: { ExtFCode (Either BlockId ExtCode) }
-        : '{' body '}'                  { return (Right $2) }
-        | 'goto' NAME ';'               { do l <- lookupLabel $2; return (Left l) }
+arm_body :: { CmmParse (Either BlockId (CmmParse ())) }
+	: '{' body '}'			{ return (Right $2) }
+	| 'goto' NAME ';'		{ do l <- lookupLabel $2; return (Left l) }
 
 ints    :: { [Int] }
         : INT                           { [ fromIntegral $1 ] }
         | INT ',' ints                  { fromIntegral $1 : $3 }
 
-default :: { Maybe ExtCode }
-        : 'default' ':' '{' body '}'    { Just $4 }
-        -- taking a few liberties with the C-- syntax here; C-- doesn't have
-        -- 'default' branches
-        | {- empty -}                   { Nothing }
+default :: { Maybe (CmmParse ()) }
+	: 'default' ':' '{' body '}'	{ Just $4 }
+	-- taking a few liberties with the C-- syntax here; C-- doesn't have
+	-- 'default' branches
+	| {- empty -}			{ Nothing }
 
 -- Note: OldCmm doesn't support a first class 'else' statement, though
 -- CmmNode does.
-else    :: { ExtCode }
-        : {- empty -}                   { nopEC }
-        | 'else' '{' body '}'           { $3 }
+else    :: { CmmParse () }
+        : {- empty -}                   { return () }
+	| 'else' '{' body '}'		{ $3 }
 
 -- we have to write this out longhand so that Happy's precedence rules
 -- can kick in.
-expr    :: { ExtFCode CmmExpr }
-        : expr '/' expr                 { mkMachOp MO_U_Quot [$1,$3] }
-        | expr '*' expr                 { mkMachOp MO_Mul [$1,$3] }
-        | expr '%' expr                 { mkMachOp MO_U_Rem [$1,$3] }
-        | expr '-' expr                 { mkMachOp MO_Sub [$1,$3] }
-        | expr '+' expr                 { mkMachOp MO_Add [$1,$3] }
-        | expr '>>' expr                { mkMachOp MO_U_Shr [$1,$3] }
-        | expr '<<' expr                { mkMachOp MO_Shl [$1,$3] }
-        | expr '&' expr                 { mkMachOp MO_And [$1,$3] }
-        | expr '^' expr                 { mkMachOp MO_Xor [$1,$3] }
-        | expr '|' expr                 { mkMachOp MO_Or [$1,$3] }
-        | expr '>=' expr                { mkMachOp MO_U_Ge [$1,$3] }
-        | expr '>' expr                 { mkMachOp MO_U_Gt [$1,$3] }
-        | expr '<=' expr                { mkMachOp MO_U_Le [$1,$3] }
-        | expr '<' expr                 { mkMachOp MO_U_Lt [$1,$3] }
-        | expr '!=' expr                { mkMachOp MO_Ne [$1,$3] }
-        | expr '==' expr                { mkMachOp MO_Eq [$1,$3] }
-        | '~' expr                      { mkMachOp MO_Not [$2] }
-        | '-' expr                      { mkMachOp MO_S_Neg [$2] }
-        | expr0 '`' NAME '`' expr0      {% do { mo <- nameToMachOp $3 ;
-                                                return (mkMachOp mo [$1,$5]) } }
-        | expr0                         { $1 }
-
-expr0   :: { ExtFCode CmmExpr }
-        : INT   maybe_ty         { return (CmmLit (CmmInt $1 (typeWidth $2))) }
-        | FLOAT maybe_ty         { return (CmmLit (CmmFloat $1 (typeWidth $2))) }
-        | STRING                 { do s <- code (newStringCLit $1);
-                                      return (CmmLit s) }
-        | reg                    { $1 }
-        | type '[' expr ']'      { do e <- $3; return (CmmLoad e $1) }
-        | '%' NAME '(' exprs0 ')' {% exprOp $2 $4 }
-        | '(' expr ')'           { $2 }
+expr    :: { CmmParse CmmExpr }
+	: expr '/' expr			{ mkMachOp MO_U_Quot [$1,$3] }
+	| expr '*' expr			{ mkMachOp MO_Mul [$1,$3] }
+	| expr '%' expr			{ mkMachOp MO_U_Rem [$1,$3] }
+	| expr '-' expr			{ mkMachOp MO_Sub [$1,$3] }
+	| expr '+' expr			{ mkMachOp MO_Add [$1,$3] }
+	| expr '>>' expr		{ mkMachOp MO_U_Shr [$1,$3] }
+	| expr '<<' expr		{ mkMachOp MO_Shl [$1,$3] }
+	| expr '&' expr			{ mkMachOp MO_And [$1,$3] }
+	| expr '^' expr			{ mkMachOp MO_Xor [$1,$3] }
+	| expr '|' expr			{ mkMachOp MO_Or [$1,$3] }
+	| expr '>=' expr		{ mkMachOp MO_U_Ge [$1,$3] }
+	| expr '>' expr			{ mkMachOp MO_U_Gt [$1,$3] }
+	| expr '<=' expr		{ mkMachOp MO_U_Le [$1,$3] }
+	| expr '<' expr			{ mkMachOp MO_U_Lt [$1,$3] }
+	| expr '!=' expr		{ mkMachOp MO_Ne [$1,$3] }
+	| expr '==' expr		{ mkMachOp MO_Eq [$1,$3] }
+	| '~' expr			{ mkMachOp MO_Not [$2] }
+	| '-' expr			{ mkMachOp MO_S_Neg [$2] }
+	| expr0 '`' NAME '`' expr0  	{% do { mo <- nameToMachOp $3 ;
+					        return (mkMachOp mo [$1,$5]) } }
+	| expr0				{ $1 }
+
+expr0	:: { CmmParse CmmExpr }
+	: INT   maybe_ty	 { return (CmmLit (CmmInt $1 (typeWidth $2))) }
+	| FLOAT maybe_ty	 { return (CmmLit (CmmFloat $1 (typeWidth $2))) }
+	| STRING		 { do s <- code (newStringCLit $1); 
+				      return (CmmLit s) }
+	| reg			 { $1 }
+	| type '[' expr ']'	 { do e <- $3; return (CmmLoad e $1) }
+	| '%' NAME '(' exprs0 ')' {% exprOp $2 $4 }
+	| '(' expr ')'		 { $2 }
 
 
 -- leaving out the type of a literal gives you the native word size in C--
@@ -526,81 +689,78 @@ maybe_ty :: { CmmType }
         : {- empty -}                   {% do dflags <- getDynFlags; return $ bWord dflags }
         | '::' type                     { $2 }
 
-maybe_actuals :: { [ExtFCode HintedCmmActual] }
-        : {- empty -}                   { [] }
-        | '(' cmm_hint_exprs0 ')'       { $2 }
-
-cmm_hint_exprs0 :: { [ExtFCode HintedCmmActual] }
-        : {- empty -}                   { [] }
+cmm_hint_exprs0 :: { [CmmParse (CmmExpr, ForeignHint)] }
+	: {- empty -}			{ [] }
         | cmm_hint_exprs                { $1 }
 
-cmm_hint_exprs :: { [ExtFCode HintedCmmActual] }
-        : cmm_hint_expr                         { [$1] }
-        | cmm_hint_expr ',' cmm_hint_exprs      { $1 : $3 }
+cmm_hint_exprs :: { [CmmParse (CmmExpr, ForeignHint)] }
+	: cmm_hint_expr			{ [$1] }
+	| cmm_hint_expr ',' cmm_hint_exprs	{ $1 : $3 }
 
-cmm_hint_expr :: { ExtFCode HintedCmmActual }
-        : expr                          { do e <- $1; return (CmmHinted e (inferCmmHint e)) }
-        | expr STRING                   {% do h <- parseCmmHint $2;
-                                              return $ do
-                                                e <- $1; return (CmmHinted e h) }
+cmm_hint_expr :: { CmmParse (CmmExpr, ForeignHint) }
+        : expr                          { do e <- $1;
+                                             return (e, inferCmmHint e) }
+	| expr STRING			{% do h <- parseCmmHint $2;
+					      return $ do
+                                                e <- $1; return (e, h) }
 
-exprs0  :: { [ExtFCode CmmExpr] }
-        : {- empty -}                   { [] }
-        | exprs                         { $1 }
-
-exprs   :: { [ExtFCode CmmExpr] }
-        : expr                          { [ $1 ] }
-        | expr ',' exprs                { $1 : $3 }
-
-reg     :: { ExtFCode CmmExpr }
-        : NAME                  { lookupName $1 }
-        | GLOBALREG             { return (CmmReg (CmmGlobal $1)) }
-
-maybe_results :: { [ExtFCode HintedCmmFormal] }
-        : {- empty -}           { [] }
-        | '(' cmm_formals ')' '='       { $2 }
-
-cmm_formals :: { [ExtFCode HintedCmmFormal] }
-        : cmm_formal                    { [$1] }
-        | cmm_formal ','                { [$1] }
-        | cmm_formal ',' cmm_formals    { $1 : $3 }
-
-cmm_formal :: { ExtFCode HintedCmmFormal }
-        : local_lreg                    { do e <- $1; return (CmmHinted e (inferCmmHint (CmmReg (CmmLocal e)))) }
-        | STRING local_lreg             {% do h <- parseCmmHint $1;
-                                              return $ do
-                                                e <- $2; return (CmmHinted e h) }
-
-local_lreg :: { ExtFCode LocalReg }
-        : NAME                  { do e <- lookupName $1;
-                                     return $
-                                       case e of
-                                        CmmReg (CmmLocal r) -> r
-                                        other -> pprPanic "CmmParse:" (ftext $1 <> text " not a local register") }
-
-lreg    :: { ExtFCode CmmReg }
-        : NAME                  { do e <- lookupName $1;
-                                     return $
-                                       case e of
-                                        CmmReg r -> r
-                                        other -> pprPanic "CmmParse:" (ftext $1 <> text " not a register") }
-        | GLOBALREG             { return (CmmGlobal $1) }
-
-maybe_formals_without_hints :: { [ExtFCode LocalReg] }
-        : {- empty -}                           { [] }
-        | '(' formals_without_hints0 ')'        { $2 }
-
-formals_without_hints0 :: { [ExtFCode LocalReg] }
-        : {- empty -}                   { [] }
-        | formals_without_hints         { $1 }
+exprs0  :: { [CmmParse CmmExpr] }
+	: {- empty -}			{ [] }
+	| exprs				{ $1 }
 
-formals_without_hints :: { [ExtFCode LocalReg] }
-        : formal_without_hint ','                       { [$1] }
-        | formal_without_hint                           { [$1] }
-        | formal_without_hint ',' formals_without_hints { $1 : $3 }
+exprs   :: { [CmmParse CmmExpr] }
+	: expr				{ [ $1 ] }
+	| expr ',' exprs		{ $1 : $3 }
 
-formal_without_hint :: { ExtFCode LocalReg }
-        : type NAME             { newLocal $1 $2 }
+reg     :: { CmmParse CmmExpr }
+	: NAME			{ lookupName $1 }
+	| GLOBALREG		{ return (CmmReg (CmmGlobal $1)) }
+
+foreign_results :: { [CmmParse (LocalReg, ForeignHint)] }
+        : {- empty -}                   { [] }
+        | '(' foreign_formals ')' '='   { $2 }
+
+foreign_formals :: { [CmmParse (LocalReg, ForeignHint)] }
+	: foreign_formal			{ [$1] }
+        | foreign_formal ','                    { [$1] }
+        | foreign_formal ',' foreign_formals    { $1 : $3 }
+
+foreign_formal :: { CmmParse (LocalReg, ForeignHint) }
+        : local_lreg            { do e <- $1; return (e, (inferCmmHint (CmmReg (CmmLocal e)))) }
+        | STRING local_lreg     {% do h <- parseCmmHint $1;
+                                      return $ do
+                                         e <- $2; return (e,h) }
+
+local_lreg :: { CmmParse LocalReg }
+	: NAME			{ do e <- lookupName $1;
+				     return $
+				       case e of 
+					CmmReg (CmmLocal r) -> r
+					other -> pprPanic "CmmParse:" (ftext $1 <> text " not a local register") }
+
+lreg	:: { CmmParse CmmReg }
+	: NAME			{ do e <- lookupName $1;
+				     return $
+				       case e of 
+					CmmReg r -> r
+					other -> pprPanic "CmmParse:" (ftext $1 <> text " not a register") }
+	| GLOBALREG		{ return (CmmGlobal $1) }
+
+maybe_formals :: { Maybe [CmmParse LocalReg] }
+        : {- empty -}           { Nothing }
+        | '(' formals0 ')'      { Just $2 }
+
+formals0 :: { [CmmParse LocalReg] }
+	: {- empty -}		{ [] }
+        | formals               { $1 }
+
+formals :: { [CmmParse LocalReg] }
+        : formal ','            { [$1] }
+        | formal                { [$1] }
+        | formal ',' formals       { $1 : $3 }
+
+formal :: { CmmParse LocalReg }
+	: type NAME		{ newLocal $1 $2 }
 
 type    :: { CmmType }
         : 'bits8'               { b8 }
@@ -614,12 +774,6 @@ typenot8 :: { CmmType }
         | 'float64'             { f64 }
         | 'gcptr'               {% do dflags <- getDynFlags; return $ gcWord dflags }
 
-stgWord :: { StgWord }
-        : INT                   {% do dflags <- getDynFlags; return $ toStgWord dflags $1 }
-
-stgHalfWord :: { StgHalfWord }
-        : INT                   {% do dflags <- getDynFlags; return $ toStgHalfWord dflags $1 }
-
 {
 section :: String -> Section
 section "text"      = Text
@@ -632,11 +786,22 @@ section s           = OtherSection s
 mkString :: String -> CmmStatic
 mkString s = CmmString (map (fromIntegral.ord) s)
 
+-- |
+-- Given an info table, decide what the entry convention for the proc
+-- is.  That is, for an INFO_TABLE_RET we want the return convention,
+-- otherwise it is a NativeNodeCall.
+--
+infoConv :: Maybe CmmInfoTable -> Convention
+infoConv Nothing = NativeNodeCall
+infoConv (Just info)
+  | isStackRep (cit_rep info) = NativeReturn
+  | otherwise                 = NativeNodeCall
+
 -- mkMachOp infers the type of the MachOp from the type of its first
 -- argument.  We assume that this is correct: for MachOps that don't have
 -- symmetrical args (e.g. shift ops), the first arg determines the type of
 -- the op.
-mkMachOp :: (Width -> MachOp) -> [ExtFCode CmmExpr] -> ExtFCode CmmExpr
+mkMachOp :: (Width -> MachOp) -> [CmmParse CmmExpr] -> CmmParse CmmExpr
 mkMachOp fn args = do
   dflags <- getDynFlags
   arg_exprs <- sequence args
@@ -653,7 +818,7 @@ nameToMachOp name =
         Nothing -> fail ("unknown primitive " ++ unpackFS name)
         Just m  -> return m
 
-exprOp :: FastString -> [ExtFCode CmmExpr] -> P (ExtFCode CmmExpr)
+exprOp :: FastString -> [CmmParse CmmExpr] -> P (CmmParse CmmExpr)
 exprOp name args_code = do
   dflags <- getDynFlags
   case lookupUFM (exprMacros dflags) name of
@@ -755,10 +920,10 @@ callishMachOps = listToUFM $
         -- ToDo: the rest, maybe
     ]
 
-parseSafety :: String -> P CmmSafety
-parseSafety "safe"   = return (CmmSafe NoC_SRT)
-parseSafety "unsafe" = return CmmUnsafe
-parseSafety "interruptible" = return CmmInterruptible
+parseSafety :: String -> P Safety
+parseSafety "safe"   = return PlaySafe
+parseSafety "unsafe" = return PlayRisky
+parseSafety "interruptible" = return PlayInterruptible
 parseSafety str      = fail ("unrecognised safety: " ++ str)
 
 parseCmmHint :: String -> P ForeignHint
@@ -788,7 +953,7 @@ happyError = srcParseFail
 -- -----------------------------------------------------------------------------
 -- Statement-level macros
 
-stmtMacro :: FastString -> [ExtFCode CmmExpr] -> P ExtCode
+stmtMacro :: FastString -> [CmmParse CmmExpr] -> P (CmmParse ())
 stmtMacro fun args_code = do
   case lookupUFM stmtMacros fun of
     Nothing -> fail ("unknown macro: " ++ unpackFS fun)
@@ -796,49 +961,61 @@ stmtMacro fun args_code = do
         args <- sequence args_code
         code (fcode args)
 
-stmtMacros :: UniqFM ([CmmExpr] -> Code)
+stmtMacros :: UniqFM ([CmmExpr] -> FCode ())
 stmtMacros = listToUFM [
   ( fsLit "CCS_ALLOC",             \[words,ccs]  -> profAlloc words ccs ),
+  ( fsLit "ENTER_CCS_THUNK",       \[e] -> enterCostCentreThunk e ),
+
   ( fsLit "CLOSE_NURSERY",         \[]  -> emitCloseNursery ),
-  ( fsLit "ENTER_CCS_THUNK",      \[e] -> enterCostCentreThunk e ),
-  ( fsLit "HP_CHK_GEN",           \[words,liveness,reentry] ->
-                                      hpChkGen words liveness reentry ),
-  ( fsLit "HP_CHK_NP_ASSIGN_SP0", \[e,f] -> hpChkNodePointsAssignSp0 e f ),
-  ( fsLit "LOAD_THREAD_STATE",    \[] -> emitLoadThreadState ),
-  ( fsLit "LDV_ENTER",            \[e] -> ldvEnter e ),
-  ( fsLit "LDV_RECORD_CREATE",    \[e] -> ldvRecordCreate e ),
   ( fsLit "OPEN_NURSERY",          \[]  -> emitOpenNursery ),
+
+  -- completely generic heap and stack checks, for use in high-level cmm.
+  ( fsLit "HP_CHK_GEN",            \[bytes] ->
+                                      heapStackCheckGen Nothing (Just bytes) ),
+  ( fsLit "STK_CHK_GEN",           \[] ->
+                                      heapStackCheckGen (Just (CmmLit CmmHighStackMark)) Nothing ),
+
+  -- A stack check for a fixed amount of stack.  Sounds a bit strange, but
+  -- we use the stack for a bit of temporary storage in a couple of primops
+  ( fsLit "STK_CHK_GEN_N",         \[bytes] ->
+                                      heapStackCheckGen (Just bytes) Nothing ),
+
+  -- A stack check on entry to a thunk, where the argument is the thunk pointer.
+  ( fsLit "STK_CHK_NP"   ,         \[node] -> entryHeapCheck' False node 0 [] (return ())),
+
+  ( fsLit "LOAD_THREAD_STATE",     \[] -> emitLoadThreadState ),
+  ( fsLit "SAVE_THREAD_STATE",     \[] -> emitSaveThreadState ),
+
+  ( fsLit "LDV_ENTER",             \[e] -> ldvEnter e ),
+  ( fsLit "LDV_RECORD_CREATE",     \[e] -> ldvRecordCreate e ),
+
   ( fsLit "PUSH_UPD_FRAME",        \[sp,e] -> emitPushUpdateFrame sp e ),
-  ( fsLit "SAVE_THREAD_STATE",    \[] -> emitSaveThreadState ),
   ( fsLit "SET_HDR",               \[ptr,info,ccs] ->
-                                        emitSetDynHdr ptr info ccs ),
-  ( fsLit "STK_CHK_GEN",          \[words,liveness,reentry] ->
-                                      stkChkGen words liveness reentry ),
-  ( fsLit "STK_CHK_NP",    \[e] -> stkChkNodePoints e ),
+					emitSetDynHdr ptr info ccs ),
   ( fsLit "TICK_ALLOC_PRIM",       \[hdr,goods,slop] ->
-                                        tickyAllocPrim hdr goods slop ),
-  ( fsLit "TICK_ALLOC_PAP",       \[goods,slop] ->
-                                        tickyAllocPAP goods slop ),
-  ( fsLit "TICK_ALLOC_UP_THK",    \[goods,slop] ->
-                                        tickyAllocThunk goods slop ),
-  ( fsLit "UPD_BH_UPDATABLE",       \[] -> emitBlackHoleCode False ),
-  ( fsLit "UPD_BH_SINGLE_ENTRY",    \[] -> emitBlackHoleCode True ),
-
-  ( fsLit "RET_P",      \[a] ->       emitRetUT [(PtrArg,a)]),
-  ( fsLit "RET_N",      \[a] ->       emitRetUT [(NonPtrArg,a)]),
-  ( fsLit "RET_PP",     \[a,b] ->     emitRetUT [(PtrArg,a),(PtrArg,b)]),
-  ( fsLit "RET_NN",     \[a,b] ->     emitRetUT [(NonPtrArg,a),(NonPtrArg,b)]),
-  ( fsLit "RET_NP",     \[a,b] ->     emitRetUT [(NonPtrArg,a),(PtrArg,b)]),
-  ( fsLit "RET_PPP",    \[a,b,c] ->   emitRetUT [(PtrArg,a),(PtrArg,b),(PtrArg,c)]),
-  ( fsLit "RET_NPP",    \[a,b,c] ->   emitRetUT [(NonPtrArg,a),(PtrArg,b),(PtrArg,c)]),
-  ( fsLit "RET_NNP",    \[a,b,c] ->   emitRetUT [(NonPtrArg,a),(NonPtrArg,b),(PtrArg,c)]),
-  ( fsLit "RET_NNN",  \[a,b,c] -> emitRetUT [(NonPtrArg,a),(NonPtrArg,b),(NonPtrArg,c)]),
-  ( fsLit "RET_NNNN",  \[a,b,c,d] -> emitRetUT [(NonPtrArg,a),(NonPtrArg,b),(NonPtrArg,c),(NonPtrArg,d)]),
-  ( fsLit "RET_NNNP",   \[a,b,c,d] -> emitRetUT [(NonPtrArg,a),(NonPtrArg,b),(NonPtrArg,c),(PtrArg,d)]),
-  ( fsLit "RET_NPNP",   \[a,b,c,d] -> emitRetUT [(NonPtrArg,a),(PtrArg,b),(NonPtrArg,c),(PtrArg,d)])
-
+					tickyAllocPrim hdr goods slop ),
+  ( fsLit "TICK_ALLOC_PAP",        \[goods,slop] ->
+					tickyAllocPAP goods slop ),
+  ( fsLit "TICK_ALLOC_UP_THK",     \[goods,slop] ->
+					tickyAllocThunk goods slop ),
+  ( fsLit "UPD_BH_UPDATABLE",      \[reg] -> emitBlackHoleCode False reg ),
+  ( fsLit "UPD_BH_SINGLE_ENTRY",   \[reg] -> emitBlackHoleCode True  reg )
  ]
 
+emitPushUpdateFrame :: CmmExpr -> CmmExpr -> FCode ()
+emitPushUpdateFrame sp e = do
+  dflags <- getDynFlags
+  emitUpdateFrame dflags sp mkUpdInfoLabel e
+
+pushStackFrame :: [CmmParse CmmExpr] -> CmmParse () -> CmmParse ()
+pushStackFrame fields body = do
+  dflags <- getDynFlags
+  exprs <- sequence fields
+  updfr_off <- getUpdFrameOff
+  let (new_updfr_off, _, g) = copyOutOflow dflags NativeReturn Ret Old
+                                           [] updfr_off exprs
+  emit g
+  withUpdFrameOff new_updfr_off body
 
 profilingInfo dflags desc_str ty_str
   = if not (dopt Opt_SccProfilingOn dflags)
@@ -846,7 +1023,7 @@ profilingInfo dflags desc_str ty_str
     else ProfilingInfo (stringToWord8s desc_str)
                        (stringToWord8s ty_str)
 
-staticClosure :: PackageId -> FastString -> FastString -> [CmmLit] -> ExtCode
+staticClosure :: PackageId -> FastString -> FastString -> [CmmLit] -> CmmParse ()
 staticClosure pkg cl_label info payload
   = do dflags <- getDynFlags
        let lits = mkStaticClosure dflags (mkCmmInfoLabel pkg info) dontCareCCS payload [] [] []
@@ -854,78 +1031,93 @@ staticClosure pkg cl_label info payload
 
 foreignCall
         :: String
-        -> [ExtFCode HintedCmmFormal]
-        -> ExtFCode CmmExpr
-        -> [ExtFCode HintedCmmActual]
-        -> Maybe [GlobalReg]
-        -> CmmSafety
+        -> [CmmParse (LocalReg, ForeignHint)]
+	-> CmmParse CmmExpr
+        -> [CmmParse (CmmExpr, ForeignHint)]
+        -> Safety
         -> CmmReturnInfo
-        -> P ExtCode
-foreignCall conv_string results_code expr_code args_code vols safety ret
-  = do  convention <- case conv_string of
+        -> P (CmmParse ())
+foreignCall conv_string results_code expr_code args_code safety ret
+  = do  conv <- case conv_string of
           "C" -> return CCallConv
           "stdcall" -> return StdCallConv
-          "C--" -> return CmmCallConv
           _ -> fail ("unknown calling convention: " ++ conv_string)
         return $ do
           dflags <- getDynFlags
-          let platform = targetPlatform dflags
           results <- sequence results_code
-          expr <- expr_code
-          args <- sequence args_code
-          case convention of
-            -- Temporary hack so at least some functions are CmmSafe
-            CmmCallConv -> code (stmtC (CmmCall (CmmCallee expr convention) results args ret))
-            _ ->
-              let expr' = adjCallTarget dflags convention expr args in
-              case safety of
-              CmmUnsafe ->
-                code (emitForeignCall' PlayRisky results
-                   (CmmCallee expr' convention) args vols NoC_SRT ret)
-              CmmSafe srt ->
-                code (emitForeignCall' PlaySafe results
-                   (CmmCallee expr' convention) args vols NoC_SRT ret) where
-              CmmInterruptible ->
-                code (emitForeignCall' PlayInterruptible results
-                   (CmmCallee expr' convention) args vols NoC_SRT ret)
-
-adjCallTarget :: DynFlags -> CCallConv -> CmmExpr -> [CmmHinted CmmExpr]
+	  expr <- expr_code
+	  args <- sequence args_code
+          let
+                  expr' = adjCallTarget dflags conv expr args
+                  (arg_exprs, arg_hints) = unzip args
+                  (res_regs,  res_hints) = unzip results
+                  fc = ForeignConvention conv arg_hints res_hints ret
+                  target = ForeignTarget expr' fc
+          _ <- code $ emitForeignCall safety res_regs target arg_exprs
+          return ()
+
+
+doReturn :: [CmmParse CmmExpr] -> CmmParse ()
+doReturn exprs_code = do
+  dflags <- getDynFlags
+  exprs <- sequence exprs_code
+  updfr_off <- getUpdFrameOff
+  emit (mkReturnSimple dflags exprs updfr_off)
+
+doRawJump :: CmmParse CmmExpr -> [GlobalReg] -> CmmParse ()
+doRawJump expr_code vols = do
+  dflags <- getDynFlags
+  expr <- expr_code
+  updfr_off <- getUpdFrameOff
+  emit (mkRawJump dflags expr updfr_off vols)
+
+doJumpWithStack :: CmmParse CmmExpr -> [CmmParse CmmExpr]
+                -> [CmmParse CmmExpr] -> CmmParse ()
+doJumpWithStack expr_code stk_code args_code = do
+  dflags <- getDynFlags
+  expr <- expr_code
+  stk_args <- sequence stk_code
+  args <- sequence args_code
+  updfr_off <- getUpdFrameOff
+  emit (mkJumpExtra dflags expr args updfr_off stk_args)
+
+doCall :: CmmParse CmmExpr -> [CmmParse LocalReg] -> [CmmParse CmmExpr]
+       -> CmmParse ()
+doCall expr_code res_code args_code = do
+  dflags <- getDynFlags
+  expr <- expr_code
+  args <- sequence args_code
+  ress <- sequence res_code
+  updfr_off <- getUpdFrameOff
+  c <- code $ mkCall expr (NativeNodeCall,NativeReturn) ress args updfr_off []
+  emit c
+
+adjCallTarget :: DynFlags -> CCallConv -> CmmExpr -> [(CmmExpr, ForeignHint) ]
               -> CmmExpr
 -- On Windows, we have to add the '@N' suffix to the label when making
 -- a call with the stdcall calling convention.
 adjCallTarget dflags StdCallConv (CmmLit (CmmLabel lbl)) args
  | platformOS (targetPlatform dflags) == OSMinGW32
   = CmmLit (CmmLabel (addLabelSize lbl (sum (map size args))))
-  where size (CmmHinted e _) = max (wORD_SIZE dflags) (widthInBytes (typeWidth (cmmExprType dflags e)))
+  where size (e, _) = max (wORD_SIZE dflags) (widthInBytes (typeWidth (cmmExprType dflags e)))
                  -- c.f. CgForeignCall.emitForeignCall
 adjCallTarget _ _ expr _
   = expr
 
 primCall
-        :: [ExtFCode HintedCmmFormal]
-        -> FastString
-        -> [ExtFCode HintedCmmActual]
-        -> Maybe [GlobalReg]
-        -> CmmSafety
-        -> P ExtCode
-primCall results_code name args_code vols safety
+        :: [CmmParse (CmmFormal, ForeignHint)]
+	-> FastString
+        -> [CmmParse CmmExpr]
+        -> P (CmmParse ())
+primCall results_code name args_code
   = case lookupUFM callishMachOps name of
         Nothing -> fail ("unknown primitive " ++ unpackFS name)
-        Just p  -> return $ do
-                results <- sequence results_code
-                args <- sequence args_code
-                case safety of
-                  CmmUnsafe ->
-                    code (emitForeignCall' PlayRisky results
-                      (CmmPrim p Nothing) args vols NoC_SRT CmmMayReturn)
-                  CmmSafe srt ->
-                    code (emitForeignCall' PlaySafe results
-                      (CmmPrim p Nothing) args vols NoC_SRT CmmMayReturn) where
-                  CmmInterruptible ->
-                    code (emitForeignCall' PlayInterruptible results
-                      (CmmPrim p Nothing) args vols NoC_SRT CmmMayReturn)
-
-doStore :: CmmType -> ExtFCode CmmExpr  -> ExtFCode CmmExpr -> ExtCode
+	Just p  -> return $ do
+		results <- sequence results_code
+		args <- sequence args_code
+                code (emitPrimCall (map fst results) p args)
+
+doStore :: CmmType -> CmmParse CmmExpr  -> CmmParse CmmExpr -> CmmParse ()
 doStore rep addr_code val_code
   = do dflags <- getDynFlags
        addr <- addr_code
@@ -940,19 +1132,7 @@ doStore rep addr_code val_code
        let coerce_val
                 | val_width /= rep_width = CmmMachOp (MO_UU_Conv val_width rep_width) [val]
                 | otherwise              = val
-       stmtEC (CmmStore addr coerce_val)
-
--- Return an unboxed tuple.
-emitRetUT :: [(CgRep,CmmExpr)] -> Code
-emitRetUT args = do
-  dflags <- getDynFlags
-  tickyUnboxedTupleReturn (length args)  -- TICK
-  (sp, stmts, live) <- pushUnboxedTuple 0 args
-  emitSimultaneously stmts -- NB. the args might overlap with the stack slots
-                           -- or regs that we assign to, so better use
-                           -- simultaneous assignments here (#3546)
-  when (sp /= 0) $ stmtC (CmmAssign spReg (cmmRegOffW dflags spReg (-sp)))
-  stmtC $ CmmJump (entryCode dflags (CmmLoad (cmmRegOffW dflags spReg sp) (bWord dflags))) (Just live)
+       emitStore addr coerce_val
 
 -- -----------------------------------------------------------------------------
 -- If-then-else and boolean expressions
@@ -966,16 +1146,16 @@ data BoolExpr
 -- ToDo: smart constructors which simplify the boolean expression.
 
 cmmIfThenElse cond then_part else_part = do
-     then_id <- code newLabelC
-     join_id <- code newLabelC
+     then_id <- newBlockId
+     join_id <- newBlockId
      c <- cond
      emitCond c then_id
      else_part
-     stmtEC (CmmBranch join_id)
-     code (labelC then_id)
+     emit (mkBranch join_id)
+     emitLabel then_id
      then_part
      -- fall through to join
-     code (labelC join_id)
+     emitLabel join_id
 
 cmmRawIf cond then_id = do
     c <- cond
@@ -984,30 +1164,32 @@ cmmRawIf cond then_id = do
 -- 'emitCond cond true_id'  emits code to test whether the cond is true,
 -- branching to true_id if so, and falling through otherwise.
 emitCond (BoolTest e) then_id = do
-  stmtEC (CmmCondBranch e then_id)
+  else_id <- newBlockId
+  emit (mkCbranch e then_id else_id)
+  emitLabel else_id
 emitCond (BoolNot (BoolTest (CmmMachOp op args))) then_id
   | Just op' <- maybeInvertComparison op
   = emitCond (BoolTest (CmmMachOp op' args)) then_id
 emitCond (BoolNot e) then_id = do
-  else_id <- code newLabelC
+  else_id <- newBlockId
   emitCond e else_id
-  stmtEC (CmmBranch then_id)
-  code (labelC else_id)
+  emit (mkBranch then_id)
+  emitLabel else_id
 emitCond (e1 `BoolOr` e2) then_id = do
   emitCond e1 then_id
   emitCond e2 then_id
 emitCond (e1 `BoolAnd` e2) then_id = do
         -- we'd like to invert one of the conditionals here to avoid an
-        -- extra branch instruction, but we can't use maybeInvertComparison
-        -- here because we can't look too closely at the expression since
-        -- we're in a loop.
-  and_id <- code newLabelC
-  else_id <- code newLabelC
+	-- extra branch instruction, but we can't use maybeInvertComparison
+	-- here because we can't look too closely at the expression since
+	-- we're in a loop.
+  and_id <- newBlockId
+  else_id <- newBlockId
   emitCond e1 and_id
-  stmtEC (CmmBranch else_id)
-  code (labelC and_id)
+  emit (mkBranch else_id)
+  emitLabel and_id
   emitCond e2 then_id
-  code (labelC else_id)
+  emitLabel else_id
 
 
 -- -----------------------------------------------------------------------------
@@ -1020,38 +1202,45 @@ emitCond (e1 `BoolAnd` e2) then_id = do
 -- optional range on the switch (eg. switch [0..7] {...}), or by
 -- the minimum/maximum values from the branches.
 
-doSwitch :: Maybe (Int,Int) -> ExtFCode CmmExpr -> [([Int],Either BlockId ExtCode)]
-         -> Maybe ExtCode -> ExtCode
+doSwitch :: Maybe (Int,Int) -> CmmParse CmmExpr -> [([Int],Either BlockId (CmmParse ()))]
+         -> Maybe (CmmParse ()) -> CmmParse ()
 doSwitch mb_range scrut arms deflt
    = do
-        -- Compile code for the default branch
-        dflt_entry <-
-                case deflt of
-                  Nothing -> return Nothing
-                  Just e  -> do b <- forkLabelledCodeEC e; return (Just b)
-
-        -- Compile each case branch
-        table_entries <- mapM emitArm arms
-
-        -- Construct the table
-        let
-            all_entries = concat table_entries
-            ixs = map fst all_entries
-            (min,max)
-                | Just (l,u) <- mb_range = (l,u)
-                | otherwise              = (minimum ixs, maximum ixs)
-
-            entries = elems (accumArray (\_ a -> Just a) dflt_entry (min,max)
-                                all_entries)
-        expr <- scrut
-        -- ToDo: check for out of range and jump to default if necessary
-        stmtEC (CmmSwitch expr entries)
+	-- Compile code for the default branch
+	dflt_entry <- 
+		case deflt of
+		  Nothing -> return Nothing
+                  Just e  -> do b <- forkLabelledCode e; return (Just b)
+
+	-- Compile each case branch
+	table_entries <- mapM emitArm arms
+
+	-- Construct the table
+	let
+	    all_entries = concat table_entries
+	    ixs = map fst all_entries
+	    (min,max) 
+		| Just (l,u) <- mb_range = (l,u)
+		| otherwise              = (minimum ixs, maximum ixs)
+
+	    entries = elems (accumArray (\_ a -> Just a) dflt_entry (min,max)
+				all_entries)
+	expr <- scrut
+	-- ToDo: check for out of range and jump to default if necessary
+        emit (mkSwitch expr entries)
    where
-        emitArm :: ([Int],Either BlockId ExtCode) -> ExtFCode [(Int,BlockId)]
-        emitArm (ints,Left blockid) = return [ (i,blockid) | i <- ints ]
-        emitArm (ints,Right code) = do
-           blockid <- forkLabelledCodeEC code
-           return [ (i,blockid) | i <- ints ]
+        emitArm :: ([Int],Either BlockId (CmmParse ())) -> CmmParse [(Int,BlockId)]
+	emitArm (ints,Left blockid) = return [ (i,blockid) | i <- ints ]
+	emitArm (ints,Right code) = do
+           blockid <- forkLabelledCode code
+	   return [ (i,blockid) | i <- ints ]
+
+forkLabelledCode :: CmmParse () -> CmmParse BlockId
+forkLabelledCode p = do
+  ag <- getCode p
+  l <- newBlockId
+  emitOutOfLine l ag
+  return l
 
 -- -----------------------------------------------------------------------------
 -- Putting it all together
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index 5fca9e7164..4f5d3b926c 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -52,7 +52,7 @@ cmmPipeline hsc_env topSRT prog =
 
 cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl])
 cpsTop _ p@(CmmData {}) = return (mapEmpty, [p])
-cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}}) l g) =
+cpsTop hsc_env proc =
     do
        ----------- Control-flow optimisations ----------------------------------
 
@@ -60,10 +60,13 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
        -- later passes by removing lots of empty blocks, so we do it
        -- even when optimisation isn't turned on.
        --
-       g <- {-# SCC "cmmCfgOpts(1)" #-}
-            return $ cmmCfgOpts splitting_proc_points g
+       CmmProc h l g <- {-# SCC "cmmCfgOpts(1)" #-}
+            return $ cmmCfgOptsProc splitting_proc_points proc
        dump Opt_D_dump_cmmz_cfg "Post control-flow optimsations" g
 
+       let !TopInfo {stack_info=StackInfo { arg_space = entry_off
+                                          , do_layout = do_layout }} = h
+
        ----------- Eliminate common blocks -------------------------------------
        g <- {-# SCC "elimCommonBlocks" #-}
             condPass Opt_CmmElimCommonBlocks elimCommonBlocks g
@@ -95,7 +98,9 @@ cpsTop hsc_env (CmmProc h@(TopInfo {stack_info=StackInfo {arg_space=entry_off}})
        ----------- Layout the stack and manifest Sp ----------------------------
        (g, stackmaps) <-
             {-# SCC "layoutStack" #-}
-            runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
+            if do_layout
+               then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g
+               else return (g, mapEmpty)
        dump Opt_D_dump_cmmz_sp "Layout Stack" g
 
        ----------- Sink and inline assignments *after* stack layout ------------
diff --git a/compiler/cmm/CmmProcPoint.hs b/compiler/cmm/CmmProcPoint.hs
index 471faf8b0c..19f0155908 100644
--- a/compiler/cmm/CmmProcPoint.hs
+++ b/compiler/cmm/CmmProcPoint.hs
@@ -291,7 +291,8 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
      let to_proc (bid, g) = case expectJust "pp label" $ mapLookup bid procLabels of
              (lbl, Just info_lbl)
                | bid == entry
-               -> CmmProc (TopInfo {info_tbls=info_tbls, stack_info=stack_info})
+               -> CmmProc (TopInfo {info_tbls  = info_tbls,
+                                    stack_info = stack_info})
                           top_l (replacePPIds g)
                | otherwise
                -> CmmProc (TopInfo {info_tbls = mapSingleton (g_entry g) (mkEmptyContInfoTable info_lbl), stack_info=stack_info})
@@ -300,7 +301,9 @@ splitAtProcPoints dflags entry_label callPPs procPoints procMap
                -> CmmProc (TopInfo {info_tbls = mapEmpty, stack_info=stack_info})
                           lbl (replacePPIds g)
             where
-             stack_info = StackInfo 0 Nothing -- panic "No StackInfo"
+             stack_info = StackInfo { arg_space = 0
+                                    , updfr_space =  Nothing
+                                    , do_layout = True }
                           -- cannot use panic, this is printed by -ddump-cmmz
 
          -- References to procpoint IDs can now be replaced with the
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 2ff9b98d2a..6dccdabe89 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -6,6 +6,7 @@ module CmmSink (
 import CodeGen.Platform (callerSaves)
 
 import Cmm
+import CmmOpt
 import BlockId
 import CmmLive
 import CmmUtils
@@ -13,8 +14,7 @@ import Hoopl
 
 import DynFlags
 import UniqFM
--- import PprCmm ()
--- import Outputable
+import PprCmm ()
 
 import Data.List (partition)
 import qualified Data.Set as Set
@@ -76,9 +76,11 @@ import qualified Data.Set as Set
 -- *but*, that will invalidate the liveness analysis, and we'll have
 -- to re-do it.
 
--- TODO: things that we aren't optimising very well yet.
+-- -----------------------------------------------------------------------------
+-- things that we aren't optimising very well yet.
 --
--- From GHC's FastString.hashStr:
+-- -----------
+-- (1) From GHC's FastString.hashStr:
 --
 --  s2ay:
 --      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
@@ -95,6 +97,26 @@ import qualified Data.Set as Set
 -- a nice loop, but we didn't eliminate the silly assignment at the end.
 -- See Note [dependent assignments], which would probably fix this.
 --
+-- -----------
+-- (2) From stg_atomically_frame in PrimOps.cmm
+--
+-- We have a diamond control flow:
+--
+--     x = ...
+--       |
+--      / \
+--     A   B
+--      \ /
+--       |
+--    use of x
+--
+-- Now x won't be sunk down to its use, because we won't push it into
+-- both branches of the conditional.  We certainly do have to check
+-- that we can sink it past all the code in both A and B, but having
+-- discovered that, we could sink it to its use.
+--
+
+-- -----------------------------------------------------------------------------
 
 type Assignment = (LocalReg, CmmExpr, AbsMem)
   -- Assignment caches AbsMem, an abstraction of the memory read by
@@ -130,7 +152,8 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
 
       -- Now sink and inline in this block
       (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
-      (final_last, assigs') = tryToInline dflags live last assigs
+      fold_last = constantFold dflags last
+      (final_last, assigs') = tryToInline dflags live fold_last assigs
 
       -- We cannot sink into join points (successors with more than
       -- one predecessor), so identify the join points and the set
@@ -246,13 +269,24 @@ walk dflags nodes assigs = go nodes emptyBlock assigs
    go []               block as = (block, as)
    go ((live,node):ns) block as
     | shouldDiscard node live    = go ns block as
-    | Just a <- shouldSink dflags node1 = go ns block (a : as1)
+    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
     | otherwise                         = go ns block' as'
     where
-      (node1, as1) = tryToInline dflags live node as
+      node1 = constantFold dflags node
+
+      (node2, as1) = tryToInline dflags live node1 as
+
+      (dropped, as') = dropAssignmentsSimple dflags
+                          (\a -> conflicts dflags a node2) as1
+
+      block' = foldl blockSnoc block dropped `blockSnoc` node2
+
 
-      (dropped, as') = dropAssignmentsSimple dflags (\a -> conflicts dflags a node1) as1
-      block' = foldl blockSnoc block dropped `blockSnoc` node1
+constantFold :: DynFlags -> CmmNode e x -> CmmNode e x
+constantFold dflags node = mapExpDeep f node
+  where f (CmmMachOp op args) = cmmMachOpFold dflags op args
+        f (CmmRegOff r 0) = CmmReg r
+        f e = e
 
 --
 -- Heuristic to decide whether to pick up and sink an assignment
@@ -352,6 +386,8 @@ tryToInline dflags live node assigs = go usages node [] assigs
            where inline (CmmReg    (CmmLocal l'))     | l == l' = rhs
                  inline (CmmRegOff (CmmLocal l') off) | l == l'
                     = cmmOffset dflags rhs off
+                    -- re-constant fold after inlining
+                 inline (CmmMachOp op args) = cmmMachOpFold dflags op args
                  inline other = other
 
   go usages node skipped (assig@(l,rhs,_) : rest)
@@ -416,7 +452,8 @@ conflicts dflags (r, rhs, addr) node
   | foldRegsUsed (\b r' -> r == r' || b) False node               = True
 
   -- (2) a store to an address conflicts with a read of the same memory
-  | CmmStore addr' e <- node, memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
+  | CmmStore addr' e <- node
+  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
 
   -- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
   | HeapMem    <- addr, CmmAssign (CmmGlobal Hp) _ <- node        = True
@@ -526,5 +563,6 @@ loadAddr dflags e w =
 regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
 regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
 regAddr _      (CmmGlobal Hp) _ _ = HeapMem
+regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
 regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
 regAddr _      _ _ _ = AnyMem
diff --git a/compiler/cmm/CmmType.hs b/compiler/cmm/CmmType.hs
index d6da5a4022..9a443c1ae2 100644
--- a/compiler/cmm/CmmType.hs
+++ b/compiler/cmm/CmmType.hs
@@ -15,6 +15,8 @@ module CmmType
     , rEP_CostCentreStack_mem_alloc
     , rEP_CostCentreStack_scc_count
     , rEP_StgEntCounter_allocs
+
+    , ForeignHint(..)
    )
 where
 
@@ -52,7 +54,8 @@ instance Outputable CmmType where
 
 instance Outputable CmmCat where
   ppr FloatCat  = ptext $ sLit("F")
-  ppr _         = ptext $ sLit("I")
+  ppr GcPtrCat  = ptext $ sLit("P")
+  ppr BitsCat   = ptext $ sLit("I")
 
 -- Why is CmmType stratified?  For native code generation,
 -- most of the time you just want to know what sort of register
@@ -242,6 +245,19 @@ narrowS W64 x = fromIntegral (fromIntegral x :: Int64)
 narrowS _ _ = panic "narrowTo"
 
 -------------------------------------------------------------------------
+-- Hints
+
+-- Hints are extra type information we attach to the arguments and
+-- results of a foreign call, where more type information is sometimes
+-- needed by the ABI to make the correct kind of call.
+
+data ForeignHint
+  = NoHint | AddrHint | SignedHint
+  deriving( Eq )
+        -- Used to give extra per-argument or per-result
+        -- information needed by foreign calling conventions
+
+-------------------------------------------------------------------------
 
 -- These don't really belong here, but I don't know where is best to
 -- put them.
diff --git a/compiler/cmm/CmmUtils.hs b/compiler/cmm/CmmUtils.hs
index bf93a2f6ff..f420e7d94e 100644
--- a/compiler/cmm/CmmUtils.hs
+++ b/compiler/cmm/CmmUtils.hs
@@ -22,6 +22,7 @@ module CmmUtils(
         mkWordCLit, packHalfWordsCLit,
         mkByteStringCLit,
         mkDataLits, mkRODataLits,
+        mkStgWordCLit,
 
         -- CmmExpr
         mkIntExpr, zeroExpr,
@@ -120,6 +121,8 @@ typeForeignHint = primRepForeignHint . typePrimRep
 --
 ---------------------------------------------------
 
+-- XXX: should really be Integer, since Int doesn't necessarily cover
+-- the full range of target Ints.
 mkIntCLit :: DynFlags -> Int -> CmmLit
 mkIntCLit dflags i = CmmInt (toInteger i) (wordWidth dflags)
 
@@ -132,6 +135,9 @@ zeroCLit dflags = CmmInt 0 (wordWidth dflags)
 zeroExpr :: DynFlags -> CmmExpr
 zeroExpr dflags = CmmLit (zeroCLit dflags)
 
+mkWordCLit :: DynFlags -> Integer -> CmmLit
+mkWordCLit dflags wd = CmmInt wd (wordWidth dflags)
+
 mkByteStringCLit :: Unique -> [Word8] -> (CmmLit, GenCmmDecl CmmStatics info stmt)
 -- We have to make a top-level decl for the string,
 -- and return a literal pointing to it
@@ -155,8 +161,8 @@ mkRODataLits lbl lits
     needsRelocation (CmmLabelOff _ _) = True
     needsRelocation _                 = False
 
-mkWordCLit :: DynFlags -> StgWord -> CmmLit
-mkWordCLit dflags wd = CmmInt (fromStgWord wd) (wordWidth dflags)
+mkStgWordCLit :: DynFlags -> StgWord -> CmmLit
+mkStgWordCLit dflags wd = CmmInt (fromStgWord wd) (wordWidth dflags)
 
 packHalfWordsCLit :: DynFlags -> StgHalfWord -> StgHalfWord -> CmmLit
 -- Make a single word literal in which the lower_half_word is
@@ -168,8 +174,8 @@ packHalfWordsCLit dflags lower_half_word upper_half_word
    = if wORDS_BIGENDIAN dflags
      then mkWordCLit dflags ((l `shiftL` hALF_WORD_SIZE_IN_BITS dflags) .|. u)
      else mkWordCLit dflags (l .|. (u `shiftL` hALF_WORD_SIZE_IN_BITS dflags))
-    where l = toStgWord dflags (fromStgHalfWord lower_half_word)
-          u = toStgWord dflags (fromStgHalfWord upper_half_word)
+    where l = fromStgHalfWord lower_half_word
+          u = fromStgHalfWord upper_half_word
 
 ---------------------------------------------------
 --
@@ -197,6 +203,9 @@ cmmOffset _ e                 0        = e
 cmmOffset _ (CmmReg reg)      byte_off = cmmRegOff reg byte_off
 cmmOffset _ (CmmRegOff reg m) byte_off = cmmRegOff reg (m+byte_off)
 cmmOffset _ (CmmLit lit)      byte_off = CmmLit (cmmOffsetLit lit byte_off)
+cmmOffset _ (CmmStackSlot area off) byte_off
+  = CmmStackSlot area (off - byte_off)
+  -- note stack area offsets increase towards lower addresses
 cmmOffset _ (CmmMachOp (MO_Add rep) [expr, CmmLit (CmmInt byte_off1 _rep)]) byte_off2
   = CmmMachOp (MO_Add rep)
               [expr, CmmLit (CmmInt (byte_off1 + toInteger byte_off2) rep)]
@@ -207,6 +216,7 @@ cmmOffset dflags expr byte_off
 
 -- Smart constructor for CmmRegOff.  Same caveats as cmmOffset above.
 cmmRegOff :: CmmReg -> Int -> CmmExpr
+cmmRegOff reg 0        = CmmReg reg
 cmmRegOff reg byte_off = CmmRegOff reg byte_off
 
 cmmOffsetLit :: CmmLit -> Int -> CmmLit
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 4ba82cd8f8..1e2ddfadd1 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -9,9 +9,10 @@ module MkGraph
   , stackStubExpr
   , mkNop, mkAssign, mkStore, mkUnsafeCall, mkFinalCall, mkCallReturnsTo
   , mkJumpReturnsTo
-  , mkJump, mkDirectJump, mkForeignJump, mkForeignJumpExtra, mkJumpGC
+  , mkJump, mkJumpExtra, mkDirectJump, mkForeignJump, mkForeignJumpExtra
+  , mkRawJump
   , mkCbranch, mkSwitch
-  , mkReturn, mkComment, mkCallEntry, mkBranch
+  , mkReturn, mkReturnSimple, mkComment, mkCallEntry, mkBranch
   , copyInOflow, copyOutOflow
   , noExtraStack
   , toCall, Transfer(..)
@@ -20,7 +21,7 @@ where
 
 import BlockId
 import Cmm
-import CmmCallConv (assignArgumentsPos, ParamLocation(..))
+import CmmCallConv
 
 
 import Compiler.Hoopl hiding (Unique, (<*>), mkFirst, mkMiddle, mkLast, mkLabel, mkBranch, Shape(..))
@@ -161,11 +162,11 @@ outOfLine l g   = unitOL (CgFork l g)
 -- | allocate a fresh label for the entry point
 lgraphOfAGraph :: CmmAGraph -> UniqSM CmmGraph
 lgraphOfAGraph g = do u <- getUniqueM
-                      return (flattenCmmAGraph (mkBlockId u) g)
+                      return (labelAGraph (mkBlockId u) g)
 
 -- | use the given BlockId as the label of the entry point
-labelAGraph    :: BlockId -> CmmAGraph -> UniqSM CmmGraph
-labelAGraph lbl ag = return (flattenCmmAGraph lbl ag)
+labelAGraph    :: BlockId -> CmmAGraph -> CmmGraph
+labelAGraph lbl ag = flattenCmmAGraph lbl ag
 
 ---------- No-ops
 mkNop        :: CmmAGraph
@@ -194,16 +195,25 @@ mkJump dflags e actuals updfr_off =
   lastWithArgs dflags Jump Old NativeNodeCall actuals updfr_off $
     toCall e Nothing updfr_off 0
 
-mkDirectJump    :: DynFlags -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+-- | A jump where the caller says what the live GlobalRegs are.  Used
+-- for low-level hand-written Cmm.
+mkRawJump       :: DynFlags -> CmmExpr -> UpdFrameOffset -> [GlobalReg]
                 -> CmmAGraph
-mkDirectJump dflags e actuals updfr_off =
-  lastWithArgs dflags Jump Old NativeDirectCall actuals updfr_off $
+mkRawJump dflags e updfr_off vols =
+  lastWithArgs dflags Jump Old NativeNodeCall [] updfr_off $
+    \arg_space _  -> toCall e Nothing updfr_off 0 arg_space vols
+
+
+mkJumpExtra     :: DynFlags -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+                -> [CmmActual] -> CmmAGraph
+mkJumpExtra dflags e actuals updfr_off extra_stack =
+  lastWithArgsAndExtraStack dflags Jump Old NativeNodeCall actuals updfr_off extra_stack $
     toCall e Nothing updfr_off 0
 
-mkJumpGC        :: DynFlags -> CmmExpr -> [CmmActual] -> UpdFrameOffset
+mkDirectJump    :: DynFlags -> CmmExpr -> [CmmActual] -> UpdFrameOffset
                 -> CmmAGraph
-mkJumpGC dflags e actuals updfr_off =
-  lastWithArgs dflags Jump Old GC actuals updfr_off $
+mkDirectJump dflags e actuals updfr_off =
+  lastWithArgs dflags Jump Old NativeDirectCall actuals updfr_off $
     toCall e Nothing updfr_off 0
 
 mkForeignJump   :: DynFlags
@@ -213,7 +223,7 @@ mkForeignJump dflags conv e actuals updfr_off =
   mkForeignJumpExtra dflags conv e actuals updfr_off noExtraStack
 
 mkForeignJumpExtra :: DynFlags -> Convention -> CmmExpr -> [CmmActual]
-                -> UpdFrameOffset -> (ByteOff, [(CmmExpr, ByteOff)])
+                -> UpdFrameOffset -> [CmmActual]
                 -> CmmAGraph
 mkForeignJumpExtra dflags conv e actuals updfr_off extra_stack =
   lastWithArgsAndExtraStack dflags Jump Old conv actuals updfr_off extra_stack $
@@ -231,6 +241,11 @@ mkReturn dflags e actuals updfr_off =
   lastWithArgs dflags Ret  Old NativeReturn actuals updfr_off $
     toCall e Nothing updfr_off 0
 
+mkReturnSimple  :: DynFlags -> [CmmActual] -> UpdFrameOffset -> CmmAGraph
+mkReturnSimple dflags actuals updfr_off =
+  mkReturn dflags e actuals updfr_off
+  where e = CmmLoad (CmmStackSlot Old updfr_off) (gcWord dflags)
+
 mkBranch        :: BlockId -> CmmAGraph
 mkBranch bid     = mkLast (CmmBranch bid)
 
@@ -245,7 +260,7 @@ mkCallReturnsTo :: DynFlags -> CmmExpr -> Convention -> [CmmActual]
                 -> BlockId
                 -> ByteOff
                 -> UpdFrameOffset
-                -> (ByteOff, [(CmmExpr,ByteOff)])
+                -> [CmmActual]
                 -> CmmAGraph
 mkCallReturnsTo dflags f callConv actuals ret_lbl ret_off updfr_off extra_stack = do
   lastWithArgsAndExtraStack dflags Call (Young ret_lbl) callConv actuals
@@ -282,39 +297,40 @@ stackStubExpr :: Width -> CmmExpr
 stackStubExpr w = CmmLit (CmmInt 0 w)
 
 -- When we copy in parameters, we usually want to put overflow
--- parameters on the stack, but sometimes we want to pass
--- the variables in their spill slots.
--- Therefore, for copying arguments and results, we provide different
--- functions to pass the arguments in an overflow area and to pass them in spill slots.
-copyInOflow  :: DynFlags -> Convention -> Area -> [CmmFormal]
+-- parameters on the stack, but sometimes we want to pass the
+-- variables in their spill slots.  Therefore, for copying arguments
+-- and results, we provide different functions to pass the arguments
+-- in an overflow area and to pass them in spill slots.
+copyInOflow  :: DynFlags -> Convention -> Area
+             -> [CmmFormal]
+             -> [CmmFormal]
              -> (Int, CmmAGraph)
 
-copyInOflow dflags conv area formals = (offset, catAGraphs $ map mkMiddle nodes)
-  where (offset, nodes) = copyIn dflags oneCopyOflowI conv area formals
-
-type SlotCopier = Area -> (LocalReg, ByteOff) -> (ByteOff, [CmmNode O O]) ->
-                          (ByteOff, [CmmNode O O])
-type CopyIn  = DynFlags -> SlotCopier -> Convention -> Area -> [CmmFormal] -> (ByteOff, [CmmNode O O])
+copyInOflow dflags conv area formals extra_stk
+  = (offset, catAGraphs $ map mkMiddle nodes)
+  where (offset, nodes) = copyIn dflags conv area formals extra_stk
 
 -- Return the number of bytes used for copying arguments, as well as the
 -- instructions to copy the arguments.
-copyIn :: CopyIn
-copyIn dflags oflow conv area formals =
-  foldr ci (init_offset, []) args'
-  where ci (reg, RegisterParam r) (n, ms) =
-          (n, CmmAssign (CmmLocal reg) (CmmReg $ CmmGlobal r) : ms)
-        ci (r, StackParam off) (n, ms) = oflow area (r, off) (n, ms)
-        init_offset = widthInBytes (wordWidth dflags) -- infotable
-        args  = assignArgumentsPos dflags conv localRegType formals
-        args' = foldl adjust [] args
-          where adjust rst (v, StackParam off) = (v, StackParam (off + init_offset)) : rst
-                adjust rst x@(_, RegisterParam _) = x : rst
-
--- Copy-in one arg, using overflow space if needed.
-oneCopyOflowI :: SlotCopier
-oneCopyOflowI area (reg, off) (n, ms) =
-  (max n off, CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) : ms)
-  where ty = localRegType reg
+copyIn :: DynFlags -> Convention -> Area
+       -> [CmmFormal]
+       -> [CmmFormal]
+       -> (ByteOff, [CmmNode O O])
+copyIn dflags conv area formals extra_stk
+  = (stk_size, map ci (stk_args ++ args))
+  where
+     ci (reg, RegisterParam r) =
+          CmmAssign (CmmLocal reg) (CmmReg (CmmGlobal r))
+     ci (reg, StackParam off) =
+          CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty)
+          where ty = localRegType reg
+
+     init_offset = widthInBytes (wordWidth dflags) -- infotable
+
+     (stk_off, stk_args) = assignStack dflags init_offset localRegType extra_stk
+
+     (stk_size, args) = assignArgumentsPos dflags stk_off conv
+                                           localRegType formals
 
 -- Factoring out the common parts of the copyout functions yielded something
 -- more complicated:
@@ -323,7 +339,7 @@ data Transfer = Call | JumpRet | Jump | Ret deriving Eq
 
 copyOutOflow :: DynFlags -> Convention -> Transfer -> Area -> [CmmActual]
              -> UpdFrameOffset
-             -> (ByteOff, [(CmmExpr,ByteOff)]) -- extra stack stuff
+             -> [CmmActual] -- extra stack args
              -> (Int, [GlobalReg], CmmAGraph)
 
 -- Generate code to move the actual parameters into the locations
@@ -335,22 +351,20 @@ copyOutOflow :: DynFlags -> Convention -> Transfer -> Area -> [CmmActual]
 -- the info table for return and adjust the offsets of the other
 -- parameters.  If this is a call instruction, we adjust the offsets
 -- of the other parameters.
-copyOutOflow dflags conv transfer area actuals updfr_off
-  (extra_stack_off, extra_stack_stuff)
-  = foldr co (init_offset, [], mkNop) (args' ++ stack_params)
-  where 
-    co (v, RegisterParam r) (n, rs, ms)
-       = (n, r:rs, mkAssign (CmmGlobal r) v <*> ms)
-    co (v, StackParam off)  (n, rs, ms)
-       = (max n off, rs, mkStore (CmmStackSlot area off) v <*> ms)
-
-    stack_params = [ (e, StackParam (off + init_offset))
-                   | (e,off) <- extra_stack_stuff ]
+copyOutOflow dflags conv transfer area actuals updfr_off extra_stack_stuff
+  = (stk_size, regs, graph)
+  where
+    (regs, graph) = foldr co ([], mkNop) (setRA ++ args ++ stack_params)
+
+    co (v, RegisterParam r) (rs, ms)
+       = (r:rs, mkAssign (CmmGlobal r) v <*> ms)
+    co (v, StackParam off)  (rs, ms)
+       = (rs, mkStore (CmmStackSlot area off) v <*> ms)
 
     (setRA, init_offset) =
       case area of
-            Young id -> id `seq` -- Generate a store instruction for
-                                 -- the return address if making a call
+            Young id ->  -- Generate a store instruction for
+                         -- the return address if making a call
                   case transfer of
                      Call ->
                        ([(CmmLit (CmmBlock id), StackParam init_offset)],
@@ -362,19 +376,19 @@ copyOutOflow dflags conv transfer area actuals updfr_off
                        ([], 0)
             Old -> ([], updfr_off)
 
-    arg_offset = init_offset + extra_stack_off
+    (extra_stack_off, stack_params) =
+       assignStack dflags init_offset (cmmExprType dflags) extra_stack_stuff
 
     args :: [(CmmExpr, ParamLocation)]   -- The argument and where to put it
-    args = assignArgumentsPos dflags conv (cmmExprType dflags) actuals
-
-    args' = foldl adjust setRA args
-      where adjust rst   (v, StackParam off)  = (v, StackParam (off + arg_offset)) : rst
-            adjust rst x@(_, RegisterParam _) = x : rst
+    (stk_size, args) = assignArgumentsPos dflags extra_stack_off conv
+                                          (cmmExprType dflags) actuals
 
 
 
-mkCallEntry :: DynFlags -> Convention -> [CmmFormal] -> (Int, CmmAGraph)
-mkCallEntry dflags conv formals = copyInOflow dflags conv Old formals
+mkCallEntry :: DynFlags -> Convention -> [CmmFormal] -> [CmmFormal]
+            -> (Int, CmmAGraph)
+mkCallEntry dflags conv formals extra_stk
+  = copyInOflow dflags conv Old formals extra_stk
 
 lastWithArgs :: DynFlags -> Transfer -> Area -> Convention -> [CmmActual]
              -> UpdFrameOffset
@@ -386,7 +400,7 @@ lastWithArgs dflags transfer area conv actuals updfr_off last =
 
 lastWithArgsAndExtraStack :: DynFlags
              -> Transfer -> Area -> Convention -> [CmmActual]
-             -> UpdFrameOffset -> (ByteOff, [(CmmExpr,ByteOff)])
+             -> UpdFrameOffset -> [CmmActual]
              -> (ByteOff -> [GlobalReg] -> CmmAGraph)
              -> CmmAGraph
 lastWithArgsAndExtraStack dflags transfer area conv actuals updfr_off
@@ -397,8 +411,8 @@ lastWithArgsAndExtraStack dflags transfer area conv actuals updfr_off
                                updfr_off extra_stack
 
 
-noExtraStack :: (ByteOff, [(CmmExpr,ByteOff)])
-noExtraStack = (0,[])
+noExtraStack :: [CmmActual]
+noExtraStack = []
 
 toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff
        -> ByteOff -> [GlobalReg]
diff --git a/compiler/cmm/OldCmm.hs b/compiler/cmm/OldCmm.hs
index 05aa5fb811..3d0599b7ea 100644
--- a/compiler/cmm/OldCmm.hs
+++ b/compiler/cmm/OldCmm.hs
@@ -16,7 +16,7 @@ module OldCmm (
 
         GenBasicBlock(..), CmmBasicBlock, blockId, blockStmts, mapBlockStmts,
 
-        CmmStmt(..), CmmReturnInfo(..), CmmHinted(..),
+        CmmStmt(..), New.CmmReturnInfo(..), CmmHinted(..),
         HintedCmmFormal, HintedCmmActual,
 
         CmmSafety(..), CmmCallTarget(..),
@@ -120,11 +120,6 @@ cmmTopMapGraph :: (g -> g') -> GenCmmDecl d h g -> GenCmmDecl d h g'
 cmmTopMapGraph f (CmmProc h l g) = CmmProc h l (f g)
 cmmTopMapGraph _ (CmmData s ds)  = CmmData s ds
 
-data CmmReturnInfo
-  = CmmMayReturn
-  | CmmNeverReturns
-  deriving ( Eq )
-
 -----------------------------------------------------------------------------
 --              CmmStmt
 -- A "statement".  Note that all branches are explicit: there are no
@@ -145,7 +140,7 @@ data CmmStmt
       CmmCallTarget
       [HintedCmmFormal]            -- zero or more results
       [HintedCmmActual]            -- zero or more arguments
-      CmmReturnInfo
+      New.CmmReturnInfo
       -- Some care is necessary when handling the arguments of these, see
       -- [Register parameter passing] and the hack in cmm/CmmOpt.hs
 
diff --git a/compiler/cmm/OldPprCmm.hs b/compiler/cmm/OldPprCmm.hs
index a3857d4e47..dcde86e37c 100644
--- a/compiler/cmm/OldPprCmm.hs
+++ b/compiler/cmm/OldPprCmm.hs
@@ -111,12 +111,8 @@ pprStmt stmt = case stmt of
           pp_lhs | null results = empty
                  | otherwise    = commafy (map ppr_ar results) <+> equals
                 -- Don't print the hints on a native C-- call
-          ppr_ar (CmmHinted ar k) = case cconv of
-                            CmmCallConv -> ppr ar
-                            _           -> ppr (ar,k)
-          pp_conv = case cconv of
-                      CmmCallConv -> empty
-                      _           -> ptext (sLit("foreign")) <+> doubleQuotes (ppr cconv)
+          ppr_ar (CmmHinted ar k) = ppr (ar,k)
+          pp_conv = ptext (sLit("foreign")) <+> doubleQuotes (ppr cconv)
 
     -- Call a CallishMachOp, like sin or cos that might be implemented as a library call.
     CmmCall (CmmPrim op _) results args ret ->
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 1a3eb0d716..a2427df868 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -865,7 +865,6 @@ is_cishCC :: CCallConv -> Bool
 is_cishCC CCallConv    = True
 is_cishCC CApiConv     = True
 is_cishCC StdCallConv  = True
-is_cishCC CmmCallConv  = False
 is_cishCC PrimCallConv = False
 
 -- ---------------------------------------------------------------------
diff --git a/compiler/cmm/PprCmm.hs b/compiler/cmm/PprCmm.hs
index 423bcd5504..f3e2a02737 100644
--- a/compiler/cmm/PprCmm.hs
+++ b/compiler/cmm/PprCmm.hs
@@ -75,6 +75,8 @@ instance Outputable ForeignConvention where
 instance Outputable ForeignTarget where
     ppr = pprForeignTarget
 
+instance Outputable CmmReturnInfo where
+    ppr = pprReturnInfo
 
 instance Outputable (Block CmmNode C C) where
     ppr = pprBlock
@@ -145,17 +147,18 @@ pprConvention (NativeDirectCall {}) = text "<native-direct-call-convention>"
 pprConvention (NativeReturn {})     = text "<native-ret-convention>"
 pprConvention  Slow                 = text "<slow-convention>"
 pprConvention  GC                   = text "<gc-convention>"
-pprConvention  PrimOpCall           = text "<primop-call-convention>"
-pprConvention  PrimOpReturn         = text "<primop-ret-convention>"
 
 pprForeignConvention :: ForeignConvention -> SDoc
-pprForeignConvention (ForeignConvention c as rs) = ppr c <> ppr as <> ppr rs
+pprForeignConvention (ForeignConvention c args res ret) =
+          doubleQuotes (ppr c) <+> text "arg hints: " <+> ppr args <+> text " result hints: " <+> ppr res <+> ppr ret
+
+pprReturnInfo :: CmmReturnInfo -> SDoc
+pprReturnInfo CmmMayReturn = empty
+pprReturnInfo CmmNeverReturns = ptext (sLit "never returns")
 
 pprForeignTarget :: ForeignTarget -> SDoc
-pprForeignTarget (ForeignTarget fn c) = ppr_fc c <+> ppr_target fn
-  where ppr_fc :: ForeignConvention -> SDoc
-        ppr_fc (ForeignConvention c args res) =
-          doubleQuotes (ppr c) <+> text "arg hints: " <+> ppr args <+> text " result hints: " <+> ppr res
+pprForeignTarget (ForeignTarget fn c) = ppr c <+> ppr_target fn
+  where
         ppr_target :: CmmExpr -> SDoc
         ppr_target t@(CmmLit _) = ppr t
         ppr_target fn'          = parens (ppr fn')
diff --git a/compiler/cmm/SMRep.lhs b/compiler/cmm/SMRep.lhs
index d9644488fc..ac021df761 100644
--- a/compiler/cmm/SMRep.lhs
+++ b/compiler/cmm/SMRep.lhs
@@ -30,6 +30,7 @@ module SMRep (
 
         -- ** Predicates
         isStaticRep, isConRep, isThunkRep, isFunRep, isStaticNoCafCon,
+        isStackRep,
 
         -- ** Size-related things
         heapClosureSize,
@@ -148,7 +149,7 @@ data SMRep
         Liveness
 
   | RTSRep              -- The RTS needs to declare info tables with specific
-        StgHalfWord     -- type tags, so this form lets us override the default
+        Int             -- type tags, so this form lets us override the default
         SMRep           -- tag for an SMRep.
 
 -- | True <=> This is a static closure.  Affects how we garbage-collect it.
@@ -166,10 +167,10 @@ data ClosureTypeInfo
   | ThunkSelector SelectorOffset
   | BlackHole
 
-type ConstrTag         = StgHalfWord
+type ConstrTag         = Int
 type ConstrDescription = [Word8] -- result of dataConIdentity
-type FunArity          = StgHalfWord
-type SelectorOffset    = StgWord
+type FunArity          = Int
+type SelectorOffset    = Int
 
 -------------------------
 -- We represent liveness bitmaps as a Bitmap (whose internal
@@ -188,7 +189,7 @@ type Liveness = [Bool]   -- One Bool per word; True  <=> non-ptr or dead
 
 data ArgDescr
   = ArgSpec             -- Fits one of the standard patterns
-        !StgHalfWord    -- RTS type identifier ARG_P, ARG_N, ...
+        !Int            -- RTS type identifier ARG_P, ARG_N, ...
 
   | ArgGen              -- General case
         Liveness        -- Details about the arguments
@@ -212,7 +213,7 @@ mkHeapRep dflags is_static ptr_wds nonptr_wds cl_type_info
      hdr_size     = closureTypeHdrSize dflags cl_type_info
      payload_size = ptr_wds + nonptr_wds
 
-mkRTSRep :: StgHalfWord -> SMRep -> SMRep
+mkRTSRep :: Int -> SMRep -> SMRep
 mkRTSRep = RTSRep
 
 mkStackRep :: [Bool] -> SMRep
@@ -229,6 +230,11 @@ isStaticRep (HeapRep is_static _ _ _) = is_static
 isStaticRep (StackRep {})             = False
 isStaticRep (RTSRep _ rep)            = isStaticRep rep
 
+isStackRep :: SMRep -> Bool
+isStackRep StackRep{}     = True
+isStackRep (RTSRep _ rep) = isStackRep rep
+isStackRep _              = False
+
 isConRep :: SMRep -> Bool
 isConRep (HeapRep _ _ _ Constr{}) = True
 isConRep _                        = False
@@ -314,11 +320,10 @@ closureTypeHdrSize dflags ty = case ty of
 -- Defines CONSTR, CONSTR_1_0 etc
 
 -- | Derives the RTS closure type from an 'SMRep'
-rtsClosureType :: DynFlags -> SMRep -> StgHalfWord
-rtsClosureType dflags rep
-    = toStgHalfWord dflags
-    $ case rep of
-      RTSRep ty _ -> fromStgHalfWord ty
+rtsClosureType :: SMRep -> Int
+rtsClosureType rep
+    = case rep of
+      RTSRep ty _ -> ty
 
       HeapRep False 1 0 Constr{} -> CONSTR_1_0
       HeapRep False 0 1 Constr{} -> CONSTR_0_1
@@ -355,11 +360,11 @@ rtsClosureType dflags rep
       _ -> panic "rtsClosureType"
 
 -- We export these ones
-rET_SMALL, rET_BIG, aRG_GEN, aRG_GEN_BIG :: DynFlags -> StgHalfWord
-rET_SMALL   dflags = toStgHalfWord dflags RET_SMALL
-rET_BIG     dflags = toStgHalfWord dflags RET_BIG
-aRG_GEN     dflags = toStgHalfWord dflags ARG_GEN
-aRG_GEN_BIG dflags = toStgHalfWord dflags ARG_GEN_BIG
+rET_SMALL, rET_BIG, aRG_GEN, aRG_GEN_BIG :: Int
+rET_SMALL   = RET_SMALL
+rET_BIG     = RET_BIG
+aRG_GEN     = ARG_GEN
+aRG_GEN_BIG = ARG_GEN_BIG
 \end{code}
 
 Note [Static NoCaf constructors]
diff --git a/compiler/codeGen/CgCallConv.hs b/compiler/codeGen/CgCallConv.hs
index 1f5b711d86..d548741e1f 100644
--- a/compiler/codeGen/CgCallConv.hs
+++ b/compiler/codeGen/CgCallConv.hs
@@ -70,7 +70,7 @@ mkArgDescr _nm args
        let arg_bits = argBits dflags arg_reps
            arg_reps = filter nonVoidArg (map idCgRep args)
            -- Getting rid of voids eases matching of standard patterns
-       case stdPattern dflags arg_reps of
+       case stdPattern arg_reps of
            Just spec_id -> return (ArgSpec spec_id)
            Nothing      -> return (ArgGen arg_bits)
 
@@ -79,10 +79,9 @@ argBits _      []              = []
 argBits dflags (PtrArg : args) = False : argBits dflags args
 argBits dflags (arg    : args) = take (cgRepSizeW dflags arg) (repeat True) ++ argBits dflags args
 
-stdPattern :: DynFlags -> [CgRep] -> Maybe StgHalfWord
-stdPattern dflags reps
-    = fmap (toStgHalfWord dflags)
-    $ case reps of
+stdPattern :: [CgRep] -> Maybe Int
+stdPattern reps
+    = case reps of
       []          -> Just ARG_NONE  -- just void args, probably
 
       [PtrArg]    -> Just ARG_P
diff --git a/compiler/codeGen/CgCon.lhs b/compiler/codeGen/CgCon.lhs
index aeb87235e3..858de3a616 100644
--- a/compiler/codeGen/CgCon.lhs
+++ b/compiler/codeGen/CgCon.lhs
@@ -189,7 +189,7 @@ buildDynCon' dflags platform binder _ con [arg_amode]
   , (_, CmmLit (CmmInt val _)) <- arg_amode
   , let val_int = (fromIntegral val) :: Int
   , val_int <= mAX_INTLIKE dflags && val_int >= mIN_INTLIKE dflags
-  = do  { let intlike_lbl   = mkCmmGcPtrLabel rtsPackageId (fsLit "stg_INTLIKE_closure")
+  = do  { let intlike_lbl   = mkCmmClosureLabel rtsPackageId (fsLit "stg_INTLIKE")
               offsetW = (val_int - mIN_INTLIKE dflags) * (fixedHdrSize dflags + 1)
                 -- INTLIKE closures consist of a header and one word payload
               intlike_amode = CmmLit (cmmLabelOffW dflags intlike_lbl offsetW)
@@ -201,7 +201,7 @@ buildDynCon' dflags platform binder _ con [arg_amode]
   , (_, CmmLit (CmmInt val _)) <- arg_amode
   , let val_int = (fromIntegral val) :: Int
   , val_int <= mAX_CHARLIKE dflags && val_int >= mIN_CHARLIKE dflags
-  = do  { let charlike_lbl   = mkCmmGcPtrLabel rtsPackageId (fsLit "stg_CHARLIKE_closure")
+  = do  { let charlike_lbl   = mkCmmClosureLabel rtsPackageId (fsLit "stg_CHARLIKE")
               offsetW = (val_int - mIN_CHARLIKE dflags) * (fixedHdrSize dflags + 1)
                 -- CHARLIKE closures consist of a header and one word payload
               charlike_amode = CmmLit (cmmLabelOffW dflags charlike_lbl offsetW)
diff --git a/compiler/codeGen/CgHeapery.lhs b/compiler/codeGen/CgHeapery.lhs
index 965abf0db8..8cff77381d 100644
--- a/compiler/codeGen/CgHeapery.lhs
+++ b/compiler/codeGen/CgHeapery.lhs
@@ -415,7 +415,7 @@ unbxTupleHeapCheck regs ptrs nptrs fail_code code
         { dflags <- getDynFlags
         ; let full_fail_code  = fail_code `plusStmts` oneStmt assign_liveness
               assign_liveness = CmmAssign (CmmGlobal (VanillaReg 9 VNonGcPtr))    -- Ho ho ho!
-                                          (CmmLit (mkWordCLit dflags liveness))
+                                          (CmmLit (mkStgWordCLit dflags liveness))
               liveness        = mkRegLiveness dflags regs ptrs nptrs
               live            = Just $ map snd regs
               rts_label       = CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId (fsLit "stg_gc_ut")))
diff --git a/compiler/codeGen/CgProf.hs b/compiler/codeGen/CgProf.hs
index c124b5f68a..03e01b332a 100644
--- a/compiler/codeGen/CgProf.hs
+++ b/compiler/codeGen/CgProf.hs
@@ -258,7 +258,7 @@ dynLdvInit :: DynFlags -> CmmExpr
 dynLdvInit dflags =     -- (era << LDV_SHIFT) | LDV_STATE_CREATE
   CmmMachOp (mo_wordOr dflags) [
       CmmMachOp (mo_wordShl dflags) [loadEra dflags, mkIntExpr dflags (lDV_SHIFT dflags)],
-      CmmLit (mkWordCLit dflags (lDV_STATE_CREATE dflags))
+      CmmLit (mkWordCLit dflags (iLDV_STATE_CREATE dflags))
   ]
 
 --
@@ -289,8 +289,8 @@ ldvEnter cl_ptr = do
         -- don't forget to substract node's tag
     ldv_wd = ldvWord dflags cl_ptr
     new_ldv_wd = cmmOrWord dflags (cmmAndWord dflags (CmmLoad ldv_wd (bWord dflags))
-                                                     (CmmLit (mkWordCLit dflags (lDV_CREATE_MASK dflags))))
-                 (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (lDV_STATE_USE dflags))))
+                                                     (CmmLit (mkWordCLit dflags (iLDV_CREATE_MASK dflags))))
+                 (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (iLDV_STATE_USE dflags))))
   ifProfiling $
      -- if (era > 0) {
      --    LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) |
@@ -308,10 +308,3 @@ ldvWord :: DynFlags -> CmmExpr -> CmmExpr
 ldvWord dflags closure_ptr
     = cmmOffsetB dflags closure_ptr (oFFSET_StgHeader_ldvw dflags)
 
-lDV_CREATE_MASK :: DynFlags -> StgWord
-lDV_CREATE_MASK dflags = toStgWord dflags (iLDV_CREATE_MASK dflags)
-lDV_STATE_CREATE :: DynFlags -> StgWord
-lDV_STATE_CREATE dflags = toStgWord dflags (iLDV_STATE_CREATE dflags)
-lDV_STATE_USE :: DynFlags -> StgWord
-lDV_STATE_USE dflags = toStgWord dflags (iLDV_STATE_USE dflags)
-
diff --git a/compiler/codeGen/CgUtils.hs b/compiler/codeGen/CgUtils.hs
index 3a106abfb4..9f9a2cfe26 100644
--- a/compiler/codeGen/CgUtils.hs
+++ b/compiler/codeGen/CgUtils.hs
@@ -800,8 +800,8 @@ getSRTInfo = do
             let srt_desc_lbl = mkLargeSRTLabel id
             emitRODataLits "getSRTInfo" srt_desc_lbl
              ( cmmLabelOffW dflags srt_lbl off
-               : mkWordCLit dflags (toStgWord dflags (toInteger len))
-               : map (mkWordCLit dflags) bmp)
+               : mkWordCLit dflags (toInteger len)
+               : map (mkWordCLit dflags . fromStgWord) bmp)
             return (C_SRT srt_desc_lbl 0 (srt_escape dflags))
 
       | otherwise
diff --git a/compiler/codeGen/ClosureInfo.lhs b/compiler/codeGen/ClosureInfo.lhs
index 740bfab845..f2cbc21d27 100644
--- a/compiler/codeGen/ClosureInfo.lhs
+++ b/compiler/codeGen/ClosureInfo.lhs
@@ -480,7 +480,7 @@ mkClosureInfo dflags is_static id lf_info tot_wds ptr_wds srt_info descr
 		    -- anything else gets eta expanded.
   where
     name   = idName id
-    sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType dflags lf_info)
+    sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType lf_info)
     nonptr_wds = tot_wds - ptr_wds
 
 mkConInfo :: DynFlags
@@ -492,7 +492,7 @@ mkConInfo dflags is_static data_con tot_wds ptr_wds
    = ConInfo {	closureSMRep = sm_rep,
 		closureCon = data_con }
   where
-    sm_rep  = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType dflags lf_info)
+    sm_rep  = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType lf_info)
     lf_info = mkConLFInfo data_con
     nonptr_wds = tot_wds - ptr_wds
 \end{code}
@@ -526,16 +526,16 @@ closureNeedsUpdSpace cl_info = closureUpdReqd cl_info
 %************************************************************************
 
 \begin{code}
-lfClosureType :: DynFlags -> LambdaFormInfo -> ClosureTypeInfo
-lfClosureType dflags (LFReEntrant _ arity _ argd) = Fun (toStgHalfWord dflags (toInteger arity)) argd
-lfClosureType dflags (LFCon con)                  = Constr (toStgHalfWord dflags (toInteger (dataConTagZ con)))
-                                                           (dataConIdentity con)
-lfClosureType dflags (LFThunk _ _ _ is_sel _)     = thunkClosureType dflags is_sel
-lfClosureType _      _                            = panic "lfClosureType"
-
-thunkClosureType :: DynFlags -> StandardFormInfo -> ClosureTypeInfo
-thunkClosureType dflags (SelectorThunk off) = ThunkSelector (toStgWord dflags (toInteger off))
-thunkClosureType _      _                   = Thunk
+lfClosureType :: LambdaFormInfo -> ClosureTypeInfo
+lfClosureType (LFReEntrant _ arity _ argd) = Fun arity argd
+lfClosureType (LFCon con)                  = Constr (dataConTagZ con)
+                                                    (dataConIdentity con)
+lfClosureType (LFThunk _ _ _ is_sel _)     = thunkClosureType is_sel
+lfClosureType _                            = panic "lfClosureType"
+
+thunkClosureType :: StandardFormInfo -> ClosureTypeInfo
+thunkClosureType (SelectorThunk off) = ThunkSelector off
+thunkClosureType _                   = Thunk
 
 -- We *do* get non-updatable top-level thunks sometimes.  eg. f = g
 -- gets compiled to a jump to g (if g has non-zero arity), instead of
diff --git a/compiler/codeGen/StgCmm.hs b/compiler/codeGen/StgCmm.hs
index 37ca5e0d43..67aae3f6c0 100644
--- a/compiler/codeGen/StgCmm.hs
+++ b/compiler/codeGen/StgCmm.hs
@@ -245,7 +245,7 @@ cgDataCon data_con
             arg_reps = [(typePrimRep rep_ty, rep_ty) | ty <- dataConRepArgTys data_con, rep_ty <- flattenRepType (repType ty)]
 
             -- Dynamic closure code for non-nullary constructors only
-        ; whenC (not (isNullaryRepDataCon data_con))
+        ; when (not (isNullaryRepDataCon data_con))
                 (emit_info dyn_info_tbl tickyEnterDynCon)
 
                 -- Dynamic-Closure first, to reduce forward references
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs
index 89d27dd161..5e46dcfd65 100644
--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -10,7 +10,7 @@ module StgCmmBind (
         cgTopRhsClosure,
         cgBind,
         emitBlackHoleCode,
-        pushUpdateFrame
+        pushUpdateFrame, emitUpdateFrame
   ) where
 
 #include "HsVersions.h"
@@ -37,7 +37,6 @@ import CLabel
 import StgSyn
 import CostCentre
 import Id
-import Control.Monad
 import Name
 import Module
 import ListSetOps
@@ -48,6 +47,8 @@ import FastString
 import Maybes
 import DynFlags
 
+import Control.Monad
+
 ------------------------------------------------------------------------
 --              Top-level bindings
 ------------------------------------------------------------------------
@@ -460,7 +461,7 @@ closureCodeBody top_lvl bndr cl_info cc args arity body fv_details
                     (CmmMachOp (mo_wordSub dflags)
                          [ CmmReg nodeReg
                          , mkIntExpr dflags (funTag dflags cl_info) ])
-                ; whenC node_points (ldvEnterClosure cl_info)
+                ; when node_points (ldvEnterClosure cl_info)
                 ; granYield arg_regs node_points
 
                 -- Main payload
@@ -525,8 +526,8 @@ thunkCode cl_info fv_details _cc node arity body
         ; entryHeapCheck cl_info node' arity [] $ do
         { -- Overwrite with black hole if necessary
           -- but *after* the heap-overflow check
-        ; whenC (blackHoleOnEntry cl_info && node_points)
-                (blackHoleIt cl_info)
+        ; when (blackHoleOnEntry cl_info && node_points)
+                (blackHoleIt cl_info node)
 
           -- Push update frame
         ; setupUpdate cl_info node $
@@ -545,13 +546,14 @@ thunkCode cl_info fv_details _cc node arity body
 --              Update and black-hole wrappers
 ------------------------------------------------------------------------
 
-blackHoleIt :: ClosureInfo -> FCode ()
+blackHoleIt :: ClosureInfo -> LocalReg -> FCode ()
 -- Only called for closures with no args
 -- Node points to the closure
-blackHoleIt closure_info = emitBlackHoleCode (closureSingleEntry closure_info)
+blackHoleIt closure_info node
+  = emitBlackHoleCode (closureSingleEntry closure_info) (CmmReg (CmmLocal node))
 
-emitBlackHoleCode :: Bool -> FCode ()
-emitBlackHoleCode is_single_entry = do
+emitBlackHoleCode :: Bool -> CmmExpr -> FCode ()
+emitBlackHoleCode is_single_entry node = do
   dflags <- getDynFlags
 
   -- Eager blackholing is normally disabled, but can be turned on with
@@ -578,12 +580,12 @@ emitBlackHoleCode is_single_entry = do
              -- profiling), so currently eager blackholing doesn't
              -- work with profiling.
 
-  whenC eager_blackholing $ do
+  when eager_blackholing $ do
     tickyBlackHole (not is_single_entry)
-    emitStore (cmmOffsetW dflags (CmmReg nodeReg) (fixedHdrSize dflags))
+    emitStore (cmmOffsetW dflags node (fixedHdrSize dflags))
                   (CmmReg (CmmGlobal CurrentTSO))
     emitPrimCall [] MO_WriteBarrier []
-    emitStore (CmmReg nodeReg) (CmmReg (CmmGlobal EagerBlackholeInfo))
+    emitStore node (CmmReg (CmmGlobal EagerBlackholeInfo))
 
 setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode ()
         -- Nota Bene: this function does not change Node (even if it's a CAF),
@@ -634,13 +636,20 @@ pushUpdateFrame lbl updatee body
        let
            hdr         = fixedHdrSize dflags * wORD_SIZE dflags
            frame       = updfr + hdr + sIZEOF_StgUpdateFrame_NoHdr dflags
-           off_updatee = hdr + oFFSET_StgUpdateFrame_updatee dflags
        --
-       emitStore (CmmStackSlot Old frame) (mkLblExpr lbl)
-       emitStore (CmmStackSlot Old (frame - off_updatee)) updatee
-       initUpdFrameProf frame
+       emitUpdateFrame dflags (CmmStackSlot Old frame) lbl updatee
        withUpdFrameOff frame body
 
+emitUpdateFrame :: DynFlags -> CmmExpr -> CLabel -> CmmExpr -> FCode ()
+emitUpdateFrame dflags frame lbl updatee = do
+  let
+           hdr         = fixedHdrSize dflags * wORD_SIZE dflags
+           off_updatee = hdr + oFFSET_StgUpdateFrame_updatee dflags
+  --
+  emitStore frame (mkLblExpr lbl)
+  emitStore (cmmOffset dflags frame off_updatee) updatee
+  initUpdFrameProf frame
+
 -----------------------------------------------------------------------------
 -- Entering a CAF
 --
diff --git a/compiler/codeGen/StgCmmClosure.hs b/compiler/codeGen/StgCmmClosure.hs
index 4be5bd3d0c..f865c37ad8 100644
--- a/compiler/codeGen/StgCmmClosure.hs
+++ b/compiler/codeGen/StgCmmClosure.hs
@@ -353,16 +353,16 @@ isLFReEntrant _                = False
 --		Choosing SM reps
 -----------------------------------------------------------------------------
 
-lfClosureType :: DynFlags -> LambdaFormInfo -> ClosureTypeInfo
-lfClosureType dflags (LFReEntrant _ arity _ argd) = Fun (toStgHalfWord dflags (toInteger arity)) argd
-lfClosureType dflags (LFCon con)                  = Constr (toStgHalfWord dflags (toInteger (dataConTagZ con)))
-                                                           (dataConIdentity con)
-lfClosureType dflags (LFThunk _ _ _ is_sel _)     = thunkClosureType dflags is_sel
-lfClosureType _      _                            = panic "lfClosureType"
+lfClosureType :: LambdaFormInfo -> ClosureTypeInfo
+lfClosureType (LFReEntrant _ arity _ argd) = Fun arity argd
+lfClosureType (LFCon con)                  = Constr (dataConTagZ con)
+                                                    (dataConIdentity con)
+lfClosureType (LFThunk _ _ _ is_sel _)     = thunkClosureType is_sel
+lfClosureType _                            = panic "lfClosureType"
 
-thunkClosureType :: DynFlags -> StandardFormInfo -> ClosureTypeInfo
-thunkClosureType dflags (SelectorThunk off) = ThunkSelector (toStgWord dflags (toInteger off))
-thunkClosureType _      _                   = Thunk
+thunkClosureType :: StandardFormInfo -> ClosureTypeInfo
+thunkClosureType (SelectorThunk off) = ThunkSelector off
+thunkClosureType _                   = Thunk
 
 -- We *do* get non-updatable top-level thunks sometimes.  eg. f = g
 -- gets compiled to a jump to g (if g has non-zero arity), instead of
@@ -373,8 +373,6 @@ thunkClosureType _      _                   = Thunk
 --		nodeMustPointToIt
 -----------------------------------------------------------------------------
 
--- Be sure to see the stg-details notes about these...
-
 nodeMustPointToIt :: DynFlags -> LambdaFormInfo -> Bool
 nodeMustPointToIt _ (LFReEntrant top _ no_fvs _)
   = not no_fvs ||   -- Certainly if it has fvs we need to point to it
@@ -687,7 +685,7 @@ mkClosureInfo dflags is_static id lf_info tot_wds ptr_wds val_descr
                   closureProf      = prof }     -- (we don't have an SRT yet)
   where
     name       = idName id
-    sm_rep     = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType dflags lf_info)
+    sm_rep     = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType lf_info)
     prof       = mkProfilingInfo dflags id val_descr
     nonptr_wds = tot_wds - ptr_wds
 
@@ -899,8 +897,7 @@ mkDataConInfoTable dflags data_con is_static ptr_wds nonptr_wds
 
    sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds cl_type
 
-   cl_type = Constr (toStgHalfWord dflags (toInteger (dataConTagZ data_con)))
-                    (dataConIdentity data_con)
+   cl_type = Constr (dataConTagZ data_con) (dataConIdentity data_con)
 
    prof | not (dopt Opt_SccProfilingOn dflags) = NoProfilingInfo
         | otherwise                            = ProfilingInfo ty_descr val_descr
diff --git a/compiler/codeGen/StgCmmCon.hs b/compiler/codeGen/StgCmmCon.hs
index c822a64e2c..8e775dec51 100644
--- a/compiler/codeGen/StgCmmCon.hs
+++ b/compiler/codeGen/StgCmmCon.hs
@@ -185,7 +185,7 @@ buildDynCon' dflags platform binder _cc con [arg]
   , StgLitArg (MachInt val) <- arg
   , val <= fromIntegral (mAX_INTLIKE dflags) -- Comparisons at type Integer!
   , val >= fromIntegral (mIN_INTLIKE dflags) -- ...ditto...
-  = do  { let intlike_lbl   = mkCmmGcPtrLabel rtsPackageId (fsLit "stg_INTLIKE_closure")
+  = do  { let intlike_lbl   = mkCmmClosureLabel rtsPackageId (fsLit "stg_INTLIKE")
               val_int = fromIntegral val :: Int
               offsetW = (val_int - mIN_INTLIKE dflags) * (fixedHdrSize dflags + 1)
                 -- INTLIKE closures consist of a header and one word payload
@@ -200,7 +200,7 @@ buildDynCon' dflags platform binder _cc con [arg]
   , let val_int = ord val :: Int
   , val_int <= mAX_CHARLIKE dflags
   , val_int >= mIN_CHARLIKE dflags
-  = do  { let charlike_lbl   = mkCmmGcPtrLabel rtsPackageId (fsLit "stg_CHARLIKE_closure")
+  = do  { let charlike_lbl   = mkCmmClosureLabel rtsPackageId (fsLit "stg_CHARLIKE")
               offsetW = (val_int - mIN_CHARLIKE dflags) * (fixedHdrSize dflags + 1)
                 -- CHARLIKE closures consist of a header and one word payload
               charlike_amode = cmmLabelOffW dflags charlike_lbl offsetW
diff --git a/compiler/codeGen/StgCmmExpr.hs b/compiler/codeGen/StgCmmExpr.hs
index a8ffc12bb0..a0859252ff 100644
--- a/compiler/codeGen/StgCmmExpr.hs
+++ b/compiler/codeGen/StgCmmExpr.hs
@@ -717,12 +717,12 @@ emitEnter fun = do
       --
       AssignTo res_regs _ -> do
        { lret <- newLabelC
-       ; let (off, copyin) = copyInOflow dflags NativeReturn (Young lret) res_regs
+       ; let (off, copyin) = copyInOflow dflags NativeReturn (Young lret) res_regs []
        ; lcall <- newLabelC
        ; updfr_off <- getUpdFrameOff
        ; let area = Young lret
        ; let (outArgs, regs, copyout) = copyOutOflow dflags NativeNodeCall Call area
-                                          [fun] updfr_off (0,[])
+                                          [fun] updfr_off []
          -- refer to fun via nodeReg after the copyout, to avoid having
          -- both live simultaneously; this sometimes enables fun to be
          -- inlined in the RHS of the R1 assignment.
diff --git a/compiler/codeGen/CgExtCode.hs b/compiler/codeGen/StgCmmExtCode.hs
index a651319a49..b0608227ae 100644
--- a/compiler/codeGen/CgExtCode.hs
+++ b/compiler/codeGen/StgCmmExtCode.hs
@@ -9,44 +9,36 @@
 -- to collect declarations as we parse the proc, and feed the environment
 -- back in circularly (to avoid a two-pass algorithm).
 
-{-# OPTIONS -fno-warn-tabs #-}
--- The above warning supression flag is a temporary kludge.
--- While working on this module you are encouraged to remove it and
--- detab the module (please do the detabbing in a separate patch). See
---     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#TabsvsSpaces
--- for details
-
-module CgExtCode (
-	ExtFCode(..),
-	ExtCode,
-	Named(..), Env,
+module StgCmmExtCode (
+	CmmParse(..),
+        Named(..), Env,
 	
 	loopDecls,
 	getEnv,
 
 	newLocal,
-	newLabel,
+        newLabel,
+        newBlockId,
 	newFunctionName,
 	newImport,
 	lookupLabel,
 	lookupName,
 
 	code,
-	code2,
-	nopEC,
-	stmtEC,
-	stmtsEC,
-	getCgStmtsEC,
-	getCgStmtsEC',
-	forkLabelledCodeEC
+        emit, emitLabel, emitAssign, emitStore,
+        getCode, getCodeR,
+        emitOutOfLine,
+        withUpdFrameOff, getUpdFrameOff
 )
 
 where
 
-import CgMonad
+import qualified StgCmmMonad as F
+import StgCmmMonad (FCode, newUnique)
 
+import Cmm
 import CLabel
-import OldCmm hiding( ClosureTypeInfo(..) )
+import MkGraph
 
 -- import BasicTypes
 import BlockId
@@ -73,22 +65,22 @@ type Decls 	= [(FastString,Named)]
 
 -- | Does a computation in the FCode monad, with a current environment
 --	and a list of local declarations. Returns the resulting list of declarations.
-newtype ExtFCode a 	
+newtype CmmParse a 	
 	= EC { unEC :: Env -> Decls -> FCode (Decls, a) }
 
-type ExtCode = ExtFCode ()
+type ExtCode = CmmParse ()
 
-returnExtFC :: a -> ExtFCode a
+returnExtFC :: a -> CmmParse a
 returnExtFC a 	= EC $ \_ s -> return (s, a)
 
-thenExtFC :: ExtFCode a -> (a -> ExtFCode b) -> ExtFCode b
+thenExtFC :: CmmParse a -> (a -> CmmParse b) -> CmmParse b
 thenExtFC (EC m) k = EC $ \e s -> do (s',r) <- m e s; unEC (k r) e s'
 
-instance Monad ExtFCode where
+instance Monad CmmParse where
   (>>=) = thenExtFC
   return = returnExtFC
 
-instance HasDynFlags ExtFCode where
+instance HasDynFlags CmmParse where
     getDynFlags = EC (\_ d -> do dflags <- getDynFlags
                                  return (d, dflags))
 
@@ -99,15 +91,15 @@ instance HasDynFlags ExtFCode where
 -- 	procedure, and imports that scope over the entire module.
 --	Discards the local declaration contained within decl'
 --
-loopDecls :: ExtFCode a -> ExtFCode a
+loopDecls :: CmmParse a -> CmmParse a
 loopDecls (EC fcode) =
       EC $ \e globalDecls -> do
-	(_, a) <- fixC (\ ~(decls, _) -> fcode (addListToUFM e (decls ++ globalDecls)) globalDecls)
+        (_, a) <- F.fixC (\ ~(decls, _) -> fcode (addListToUFM e (decls ++ globalDecls)) globalDecls)
 	return (globalDecls, a)
 
 
 -- | Get the current environment from the monad.
-getEnv :: ExtFCode Env
+getEnv :: CmmParse Env
 getEnv 	= EC $ \e s -> return (s, e)
 
 
@@ -127,7 +119,7 @@ addLabel name block_id
 newLocal 
 	:: CmmType 		-- ^ data type
 	-> FastString 		-- ^ name of variable
-	-> ExtFCode LocalReg	-- ^ register holding the value
+	-> CmmParse LocalReg	-- ^ register holding the value
 	
 newLocal ty name = do
    u <- code newUnique
@@ -137,12 +129,14 @@ newLocal ty name = do
 
 
 -- | Allocate a fresh label.
-newLabel :: FastString -> ExtFCode BlockId
+newLabel :: FastString -> CmmParse BlockId
 newLabel name = do
    u <- code newUnique
    addLabel name (mkBlockId u)
    return (mkBlockId u)
 
+newBlockId :: CmmParse BlockId
+newBlockId = code F.newLabelC
 
 -- | Add add a local function to the environment.
 newFunctionName 
@@ -159,7 +153,7 @@ newFunctionName name pkg
 --	over the whole module.
 newImport 
 	:: (FastString, CLabel) 
-	-> ExtFCode ()
+	-> CmmParse ()
 
 newImport (name, cmmLabel) 
    = addVarDecl name (CmmLit (CmmLabel cmmLabel))
@@ -168,7 +162,7 @@ newImport (name, cmmLabel)
 -- | Lookup the BlockId bound to the label with this name.
 --	If one hasn't been bound yet, create a fresh one based on the 
 --	Unique of the name.
-lookupLabel :: FastString -> ExtFCode BlockId
+lookupLabel :: FastString -> CmmParse BlockId
 lookupLabel name = do
   env <- getEnv
   return $ 
@@ -181,7 +175,7 @@ lookupLabel name = do
 --	Unknown names are treated as if they had been 'import'ed from the runtime system.
 -- 	This saves us a lot of bother in the RTS sources, at the expense of
 -- 	deferring some errors to link time.
-lookupName :: FastString -> ExtFCode CmmExpr
+lookupName :: FastString -> CmmParse CmmExpr
 lookupName name = do
   env    <- getEnv
   return $ 
@@ -191,51 +185,40 @@ lookupName name = do
 	_other 		-> CmmLit (CmmLabel (mkCmmCodeLabel rtsPackageId name))
 
 
--- | Lift an FCode computation into the ExtFCode monad
-code :: FCode a -> ExtFCode a
+-- | Lift an FCode computation into the CmmParse monad
+code :: FCode a -> CmmParse a
 code fc = EC $ \_ s -> do 
 		r <- fc
  		return (s, r)
 
+emit :: CmmAGraph -> CmmParse ()
+emit = code . F.emit
 
-code2 :: (FCode (Decls,b) -> FCode ((Decls,b),c)) -> ExtFCode b -> ExtFCode c
-code2 f (EC ec) 
-	= EC $ \e s -> do 
-		((s', _),c) <- f (ec e s)
-		return (s',c)
+emitLabel :: BlockId -> CmmParse ()
+emitLabel = code. F.emitLabel
 
+emitAssign :: CmmReg  -> CmmExpr -> CmmParse ()
+emitAssign l r = code (F.emitAssign l r)
 
--- | Do nothing in the ExtFCode monad.
-nopEC :: ExtFCode ()
-nopEC = code nopC
+emitStore :: CmmExpr  -> CmmExpr -> CmmParse ()
+emitStore l r = code (F.emitStore l r)
 
+getCode :: CmmParse a -> CmmParse CmmAGraph
+getCode (EC ec) = EC $ \e s -> do
+  ((s',_), gr) <- F.getCodeR (ec e s)
+  return (s', gr)
 
--- | Accumulate a CmmStmt into the monad state.
-stmtEC :: CmmStmt -> ExtFCode () 
-stmtEC stmt = code (stmtC stmt)
+getCodeR :: CmmParse a -> CmmParse (a, CmmAGraph)
+getCodeR (EC ec) = EC $ \e s -> do
+  ((s', r), gr) <- F.getCodeR (ec e s)
+  return (s', (r,gr))
 
+emitOutOfLine :: BlockId -> CmmAGraph -> CmmParse ()
+emitOutOfLine l g = code (F.emitOutOfLine l g)
 
--- | Accumulate some CmmStmts into the monad state.
-stmtsEC :: [CmmStmt] -> ExtFCode ()
-stmtsEC stmts = code (stmtsC stmts)
-
-
--- | Get the generated statements out of the monad state.
-getCgStmtsEC :: ExtFCode a -> ExtFCode CgStmts
-getCgStmtsEC = code2 getCgStmts'
-
-
--- | Get the generated statements, and the return value out of the monad state.
-getCgStmtsEC' :: ExtFCode a -> ExtFCode (a, CgStmts)
-getCgStmtsEC' = code2 (\m -> getCgStmts' m >>= f)
-  where f ((decl, b), c) = return ((decl, b), (b, c))
-
-
--- | Emit a chunk of code outside the instruction stream, 
---	and return its block id. 
-forkLabelledCodeEC :: ExtFCode a -> ExtFCode BlockId
-forkLabelledCodeEC ec = do
-  stmts <- getCgStmtsEC ec
-  code (forkCgStmts stmts)
-
+withUpdFrameOff :: UpdFrameOffset -> CmmParse () -> CmmParse ()
+withUpdFrameOff size inner
+  = EC $ \e s -> F.withUpdFrameOff size $ (unEC inner) e s
 
+getUpdFrameOff :: CmmParse UpdFrameOffset
+getUpdFrameOff = code $ F.getUpdFrameOff
diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs
index 9e4db9cdaa..1830f7b6d6 100644
--- a/compiler/codeGen/StgCmmForeign.hs
+++ b/compiler/codeGen/StgCmmForeign.hs
@@ -9,9 +9,10 @@
 module StgCmmForeign (
   cgForeignCall, loadThreadState, saveThreadState,
   emitPrimCall, emitCCall,
+  emitForeignCall,     -- For CmmParse
   emitSaveThreadState, -- will be needed by the Cmm parser
   emitLoadThreadState, -- ditto
-  emitOpenNursery,
+  emitCloseNursery, emitOpenNursery
  ) where
 
 #include "HsVersions.h"
@@ -24,10 +25,8 @@ import StgCmmUtils
 import StgCmmClosure
 import StgCmmLayout
 
-import BlockId
 import Cmm
 import CmmUtils
-import OldCmm ( CmmReturnInfo(..) )
 import MkGraph
 import Type
 import TysPrim
@@ -85,7 +84,7 @@ cgForeignCall (CCall (CCallSpec target cconv safety)) stg_args res_ty
                    DynamicTarget    ->  case cmm_args of
                                            (fn,_):rest -> (unzip rest, fn)
                                            [] -> panic "cgForeignCall []"
-              fc = ForeignConvention cconv arg_hints res_hints
+              fc = ForeignConvention cconv arg_hints res_hints CmmMayReturn
               call_target = ForeignTarget cmm_target fc
 
         -- we want to emit code for the call, and then emitReturn.
@@ -100,12 +99,10 @@ cgForeignCall (CCall (CCallSpec target cconv safety)) stg_args res_ty
         ; sequel <- getSequel
         ; case sequel of
             AssignTo assign_to_these _ ->
-                emitForeignCall safety assign_to_these call_target
-                                     call_args CmmMayReturn
+                emitForeignCall safety assign_to_these call_target call_args
 
             _something_else ->
-                do { _ <- emitForeignCall safety res_regs call_target
-                                     call_args CmmMayReturn
+                do { _ <- emitForeignCall safety res_regs call_target call_args
                    ; emitReturn (map (CmmReg . CmmLocal) res_regs)
                    }
          }
@@ -183,17 +180,17 @@ emitCCall :: [(CmmFormal,ForeignHint)]
           -> [(CmmActual,ForeignHint)]
           -> FCode ()
 emitCCall hinted_results fn hinted_args
-  = void $ emitForeignCall PlayRisky results target args CmmMayReturn
+  = void $ emitForeignCall PlayRisky results target args
   where
     (args, arg_hints) = unzip hinted_args
     (results, result_hints) = unzip hinted_results
     target = ForeignTarget fn fc
-    fc = ForeignConvention CCallConv arg_hints result_hints
+    fc = ForeignConvention CCallConv arg_hints result_hints CmmMayReturn
 
 
 emitPrimCall :: [CmmFormal] -> CallishMachOp -> [CmmActual] -> FCode ()
 emitPrimCall res op args
-  = void $ emitForeignCall PlayRisky res (PrimTarget op) args CmmMayReturn
+  = void $ emitForeignCall PlayRisky res (PrimTarget op) args
 
 -- alternative entry point, used by CmmParse
 emitForeignCall
@@ -201,10 +198,8 @@ emitForeignCall
         -> [CmmFormal]          -- where to put the results
         -> ForeignTarget        -- the op
         -> [CmmActual]          -- arguments
-        -> CmmReturnInfo        -- This can say "never returns"
-                                --   only RTS procedures do this
         -> FCode ReturnKind
-emitForeignCall safety results target args _ret
+emitForeignCall safety results target args
   | not (playSafe safety) = do
     dflags <- getDynFlags
     let (caller_save, caller_load) = callerSaveVolatileRegs dflags
@@ -218,7 +213,7 @@ emitForeignCall safety results target args _ret
     updfr_off <- getUpdFrameOff
     temp_target <- load_target_into_temp target
     k <- newLabelC
-    let (off, copyout) = copyInOflow dflags NativeReturn (Young k) results
+    let (off, copyout) = copyInOflow dflags NativeReturn (Young k) results []
        -- see Note [safe foreign call convention]
     emit $
            (    mkStore (CmmStackSlot (Young k) (widthInBytes (wordWidth dflags)))
@@ -285,17 +280,15 @@ saveThreadState dflags =
         mkStore (cmmOffset dflags stgCurrentTSO (tso_CCCS dflags)) curCCS
       else mkNop
 
-emitSaveThreadState :: BlockId -> FCode ()
-emitSaveThreadState bid = do
+emitSaveThreadState :: FCode ()
+emitSaveThreadState = do
   dflags <- getDynFlags
+  emit (saveThreadState dflags)
 
-  -- CurrentTSO->stackobj->sp = Sp;
-  emitStore (cmmOffset dflags (CmmLoad (cmmOffset dflags stgCurrentTSO (tso_stackobj dflags)) (bWord dflags)) (stack_SP dflags))
-                 (CmmStackSlot (Young bid) (widthInBytes (typeWidth (gcWord dflags))))
-  emit $ closeNursery dflags
-  -- and save the current cost centre stack in the TSO when profiling:
-  when (dopt Opt_SccProfilingOn dflags) $
-        emitStore (cmmOffset dflags stgCurrentTSO (tso_CCCS dflags)) curCCS
+emitCloseNursery :: FCode ()
+emitCloseNursery = do
+  df <- getDynFlags
+  emit (closeNursery df)
 
    -- CurrentNursery->free = Hp+1;
 closeNursery :: DynFlags -> CmmAGraph
@@ -303,8 +296,6 @@ closeNursery dflags = mkStore (nursery_bdescr_free dflags) (cmmOffsetW dflags st
 
 loadThreadState :: DynFlags -> LocalReg -> LocalReg -> CmmAGraph
 loadThreadState dflags tso stack = do
-  -- tso <- newTemp (gcWord dflags) -- TODO FIXME NOW
-  -- stack <- newTemp (gcWord dflags) -- TODO FIXME NOW
   catAGraphs [
         -- tso = CurrentTSO;
         mkAssign (CmmLocal tso) stgCurrentTSO,
@@ -321,9 +312,18 @@ loadThreadState dflags tso stack = do
           storeCurCCS
             (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal tso)) (tso_CCCS dflags)) (ccsType dflags))
         else mkNop]
-emitLoadThreadState :: LocalReg -> LocalReg -> FCode ()
-emitLoadThreadState tso stack = do dflags <- getDynFlags
-                                   emit $ loadThreadState dflags tso stack
+
+emitLoadThreadState :: FCode ()
+emitLoadThreadState = do
+  dflags <- getDynFlags
+  load_tso <- newTemp (gcWord dflags)
+  load_stack <- newTemp (gcWord dflags)
+  emit $ loadThreadState dflags load_tso load_stack
+
+emitOpenNursery :: FCode ()
+emitOpenNursery = do
+  df <- getDynFlags
+  emit (openNursery df)
 
 openNursery :: DynFlags -> CmmAGraph
 openNursery dflags = catAGraphs [
@@ -345,9 +345,6 @@ openNursery dflags = catAGraphs [
                 )
             )
    ]
-emitOpenNursery :: FCode ()
-emitOpenNursery = do dflags <- getDynFlags
-                     emit $ openNursery dflags
 
 nursery_bdescr_free, nursery_bdescr_start, nursery_bdescr_blocks :: DynFlags -> CmmExpr
 nursery_bdescr_free   dflags = cmmOffset dflags stgCurrentNursery (oFFSET_bdescr_free dflags)
diff --git a/compiler/codeGen/StgCmmGran.hs b/compiler/codeGen/StgCmmGran.hs
index 2abca3fe16..fe00d7c384 100644
--- a/compiler/codeGen/StgCmmGran.hs
+++ b/compiler/codeGen/StgCmmGran.hs
@@ -53,7 +53,7 @@ staticGranHdr = []
 doGranAllocate :: CmmExpr -> Code
 -- macro DO_GRAN_ALLOCATE
 doGranAllocate hp
-  | not opt_GranMacros = nopC
+  | not opt_GranMacros = return ()
   | otherwise          = panic "doGranAllocate"
 
 
@@ -75,7 +75,7 @@ granFetchAndReschedule regs node_reqd
   = do { fetch
        ; reschedule liveness node_reqd }
   | otherwise
-  = nopC
+  = return ()
   where
     liveness = mkRegLiveness regs 0 0
 
@@ -109,7 +109,7 @@ granYield :: [(Id,GlobalReg)]   -- Live registers
 
 granYield regs node_reqd
   | opt_GranMacros && node_reqd = yield liveness
-  | otherwise                   = nopC
+  | otherwise                   = return ()
   where
      liveness = mkRegLiveness regs 0 0
 
diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs
index b7cca48f5a..c133ab00d4 100644
--- a/compiler/codeGen/StgCmmHeap.hs
+++ b/compiler/codeGen/StgCmmHeap.hs
@@ -11,6 +11,8 @@ module StgCmmHeap (
         getHpRelOffset, hpRel,
 
         entryHeapCheck, altHeapCheck, noEscapeHeapCheck, altHeapCheckReturnsTo,
+        heapStackCheckGen,
+        entryHeapCheck',
 
         mkVirtHeapOffsets, mkVirtConstrOffsets,
         mkStaticClosureFields, mkStaticClosure,
@@ -47,6 +49,7 @@ import FastString( mkFastString, fsLit )
 import Util
 
 import Control.Monad (when)
+import Data.Maybe (isJust)
 
 -----------------------------------------------------------
 --              Initialise dynamic heap objects
@@ -334,16 +337,28 @@ entryHeapCheck :: ClosureInfo
                -> FCode ()
 
 entryHeapCheck cl_info nodeSet arity args code
+  = entryHeapCheck' is_fastf node arity args code
+  where
+    node = case nodeSet of
+              Just r  -> CmmReg (CmmLocal r)
+              Nothing -> CmmLit (CmmLabel $ staticClosureLabel cl_info)
+
+    is_fastf = case closureFunInfo cl_info of
+                 Just (_, ArgGen _) -> False
+                 _otherwise         -> True
+
+-- | lower-level version for CmmParse
+entryHeapCheck' :: Bool           -- is a known function pattern
+                -> CmmExpr        -- expression for the closure pointer
+                -> Int            -- Arity -- not same as len args b/c of voids
+                -> [LocalReg]     -- Non-void args (empty for thunk)
+                -> FCode ()
+                -> FCode ()
+entryHeapCheck' is_fastf node arity args code
   = do dflags <- getDynFlags
        let is_thunk = arity == 0
-           is_fastf = case closureFunInfo cl_info of
-                           Just (_, ArgGen _) -> False
-                           _otherwise         -> True
 
            args' = map (CmmReg . CmmLocal) args
-           node = case nodeSet of
-                      Just r  -> CmmReg (CmmLocal r)
-                      Nothing -> CmmLit (CmmLabel $ staticClosureLabel cl_info)
            stg_gc_fun    = CmmReg (CmmGlobal GCFun)
            stg_gc_enter1 = CmmReg (CmmGlobal GCEnter1)
 
@@ -373,50 +388,6 @@ entryHeapCheck cl_info nodeSet arity args code
        emitLabel loop_id
        heapCheck True True (gc_call updfr_sz <*> mkBranch loop_id) code
 
-{-
-    -- This code is slightly outdated now and we could easily keep the above
-    -- GC methods. However, there may be some performance gains to be made by
-    -- using more specialised GC entry points. Since the semi generic GCFun
-    -- entry needs to check the node and figure out what registers to save...
-    -- if we provided and used more specialised GC entry points then these
-    -- runtime decisions could be turned into compile time decisions.
-
-    args'     = case fun of Just f  -> f : args
-                            Nothing -> args
-    arg_exprs = map (CmmReg . CmmLocal) args'
-    gc_call updfr_sz
-        | arity == 0 = mkJumpGC (CmmReg (CmmGlobal GCEnter1)) arg_exprs updfr_sz
-        | otherwise =
-            case gc_lbl args' of
-                Just _lbl -> panic "StgCmmHeap.entryHeapCheck: not finished"
-                            -- mkJumpGC (CmmLit (CmmLabel (mkRtsCodeLabel lbl)))
-                            --         arg_exprs updfr_sz
-                Nothing  -> mkCall generic_gc (GC, GC) [] [] updfr_sz
-
-    gc_lbl :: [LocalReg] -> Maybe FastString
-    gc_lbl [reg]
-        | isGcPtrType ty  = Just (sLit "stg_gc_unpt_r1") -- "stg_gc_fun_1p"
-        | isFloatType ty  = case width of
-                              W32 -> Just (sLit "stg_gc_f1")
-                              W64 -> Just (sLit "stg_gc_d1")
-                              _other -> Nothing
-        | width == wordWidth dflags = Just (mkGcLabel "stg_gc_unbx_r1")
-        | width == W64              = Just (mkGcLabel "stg_gc_l1")
-        | otherwise                 = Nothing
-        where
-          ty = localRegType reg
-          width = typeWidth ty
-
-    gc_lbl regs = gc_lbl_ptrs (map (isGcPtrType . localRegType) regs)
-
-    gc_lbl_ptrs :: [Bool] -> Maybe FastString
-    -- JD: TEMPORARY -- UNTIL THESE FUNCTIONS EXIST...
-    --gc_lbl_ptrs [True,True]      = Just (sLit "stg_gc_fun_2p")
-    --gc_lbl_ptrs [True,True,True] = Just (sLit "stg_gc_fun_3p")
-    gc_lbl_ptrs _ = Nothing
--}
-
-
 -- ------------------------------------------------------------
 -- A heap/stack check in a case alternative
 
@@ -445,7 +416,7 @@ altOrNoEscapeHeapCheck checkYield regs code = do
       Nothing -> genericGC checkYield code
       Just gc -> do
         lret <- newLabelC
-        let (off, copyin) = copyInOflow dflags NativeReturn (Young lret) regs
+        let (off, copyin) = copyInOflow dflags NativeReturn (Young lret) regs []
         lcont <- newLabelC
         emitOutOfLine lret (copyin <*> mkBranch lcont)
         emitLabel lcont
@@ -475,23 +446,29 @@ cannedGCReturnsTo checkYield cont_on_stack gc regs lret off code
     reg_exprs = map (CmmReg . CmmLocal) regs
       -- Note [stg_gc arguments]
 
+      -- NB. we use the NativeReturn convention for passing arguments
+      -- to the canned heap-check routines, because we are in a case
+      -- alternative and hence the [LocalReg] was passed to us in the
+      -- NativeReturn convention.
     gc_call dflags label sp
-      | cont_on_stack = mkJumpReturnsTo dflags label GC reg_exprs lret off sp
-      | otherwise     = mkCallReturnsTo dflags label GC reg_exprs lret off sp (0,[])
+      | cont_on_stack
+      = mkJumpReturnsTo dflags label NativeReturn reg_exprs lret off sp
+      | otherwise
+      = mkCallReturnsTo dflags label NativeReturn reg_exprs lret off sp []
 
 genericGC :: Bool -> FCode a -> FCode a
 genericGC checkYield code
   = do updfr_sz <- getUpdFrameOff
        lretry <- newLabelC
        emitLabel lretry
-       call <- mkCall generic_gc (GC, GC) [] [] updfr_sz (0,[])
+       call <- mkCall generic_gc (GC, GC) [] [] updfr_sz []
        heapCheck False checkYield (call <*> mkBranch lretry) code
 
 cannedGCEntryPoint :: DynFlags -> [LocalReg] -> Maybe CmmExpr
 cannedGCEntryPoint dflags regs
-  = case regs of
+  = case map localRegType regs of
       []  -> Just (mkGcLabel "stg_gc_noregs")
-      [reg]
+      [ty]
           | isGcPtrType ty -> Just (mkGcLabel "stg_gc_unpt_r1")
           | isFloatType ty -> case width of
                                   W32       -> Just (mkGcLabel "stg_gc_f1")
@@ -502,8 +479,19 @@ cannedGCEntryPoint dflags regs
           | width == W64              -> Just (mkGcLabel "stg_gc_l1")
           | otherwise                 -> Nothing
           where
-              ty = localRegType reg
               width = typeWidth ty
+      [ty1,ty2]
+          |  isGcPtrType ty1
+          && isGcPtrType ty2 -> Just (mkGcLabel "stg_gc_pp")
+      [ty1,ty2,ty3]
+          |  isGcPtrType ty1
+          && isGcPtrType ty2
+          && isGcPtrType ty3 -> Just (mkGcLabel "stg_gc_ppp")
+      [ty1,ty2,ty3,ty4]
+          |  isGcPtrType ty1
+          && isGcPtrType ty2
+          && isGcPtrType ty3
+          && isGcPtrType ty4 -> Just (mkGcLabel "stg_gc_pppp")
       _otherwise -> Nothing
 
 -- Note [stg_gc arguments]
@@ -538,51 +526,70 @@ heapCheck checkStack checkYield do_gc code
   = getHeapUsage $ \ hpHw ->
     -- Emit heap checks, but be sure to do it lazily so
     -- that the conditionals on hpHw don't cause a black hole
-    do  { codeOnly $ do_checks checkStack checkYield hpHw do_gc
+    do  { dflags <- getDynFlags
+        ; let mb_alloc_bytes
+                 | hpHw > 0  = Just (mkIntExpr dflags (hpHw * (wORD_SIZE dflags)))
+                 | otherwise = Nothing
+              stk_hwm | checkStack = Just (CmmLit CmmHighStackMark)
+                      | otherwise  = Nothing
+        ; codeOnly $ do_checks stk_hwm checkYield mb_alloc_bytes do_gc
         ; tickyAllocHeap hpHw
         ; doGranAllocate hpHw
         ; setRealHp hpHw
         ; code }
 
-do_checks :: Bool       -- Should we check the stack?
+heapStackCheckGen :: Maybe CmmExpr -> Maybe CmmExpr -> FCode ()
+heapStackCheckGen stk_hwm mb_bytes
+  = do updfr_sz <- getUpdFrameOff
+       lretry <- newLabelC
+       emitLabel lretry
+       call <- mkCall generic_gc (GC, GC) [] [] updfr_sz []
+       do_checks stk_hwm False  mb_bytes (call <*> mkBranch lretry)
+
+do_checks :: Maybe CmmExpr    -- Should we check the stack?
           -> Bool       -- Should we check for preemption?
-          -> WordOff    -- Heap headroom
+          -> Maybe CmmExpr    -- Heap headroom (bytes)
           -> CmmAGraph  -- What to do on failure
           -> FCode ()
-do_checks checkStack checkYield alloc do_gc = do
+do_checks mb_stk_hwm checkYield mb_alloc_lit do_gc = do
   dflags <- getDynFlags
+  gc_id <- newLabelC
+
   let
-    alloc_lit = mkIntExpr dflags (alloc * wORD_SIZE dflags) -- Bytes
-    bump_hp = cmmOffsetExprB dflags (CmmReg hpReg) alloc_lit
+    Just alloc_lit = mb_alloc_lit
+
+    bump_hp   = cmmOffsetExprB dflags (CmmReg hpReg) alloc_lit
 
     -- Sp overflow if (Sp - CmmHighStack < SpLim)
-    sp_oflo = CmmMachOp (mo_wordULt dflags)
+    sp_oflo sp_hwm =
+         CmmMachOp (mo_wordULt dflags)
                   [CmmMachOp (MO_Sub (typeWidth (cmmRegType dflags spReg)))
-                             [CmmReg spReg, CmmLit CmmHighStackMark],
+                             [CmmReg spReg, sp_hwm],
                    CmmReg spLimReg]
 
     -- Hp overflow if (Hp > HpLim)
     -- (Hp has been incremented by now)
     -- HpLim points to the LAST WORD of valid allocation space.
     hp_oflo = CmmMachOp (mo_wordUGt dflags)
-                        [CmmReg hpReg, CmmReg (CmmGlobal HpLim)]
-
-    -- Yielding if HpLim == 0
-    yielding = CmmMachOp (mo_wordEq dflags)
-                        [CmmReg (CmmGlobal HpLim), CmmLit (zeroCLit dflags)]
+                  [CmmReg hpReg, CmmReg (CmmGlobal HpLim)]
 
     alloc_n = mkAssign (CmmGlobal HpAlloc) alloc_lit
-  gc_id <- newLabelC
 
-  when checkStack $ do
-     emit =<< mkCmmIfGoto sp_oflo gc_id
+  case mb_stk_hwm of
+    Nothing -> return ()
+    Just stk_hwm -> emit =<< mkCmmIfGoto (sp_oflo stk_hwm) gc_id
 
-  if (alloc /= 0)
+  if (isJust mb_alloc_lit)
     then do
-      emitAssign hpReg bump_hp
-      emit =<< mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id)
+     emitAssign hpReg bump_hp
+     emit =<< mkCmmIfThen hp_oflo (alloc_n <*> mkBranch gc_id)
     else do
-      when (not (dopt Opt_OmitYields dflags) && checkYield) (emit =<< mkCmmIfGoto yielding gc_id)
+      when (not (dopt Opt_OmitYields dflags) && checkYield) $ do
+         -- Yielding if HpLim == 0
+         let yielding = CmmMachOp (mo_wordEq dflags)
+                                  [CmmReg (CmmGlobal HpLim),
+                                   CmmLit (zeroCLit dflags)]
+         emit =<< mkCmmIfGoto yielding gc_id
 
   emitOutOfLine gc_id $
      do_gc -- this is expected to jump back somewhere
diff --git a/compiler/codeGen/StgCmmHpc.hs b/compiler/codeGen/StgCmmHpc.hs
index cb60e9dd71..85f4c161ad 100644
--- a/compiler/codeGen/StgCmmHpc.hs
+++ b/compiler/codeGen/StgCmmHpc.hs
@@ -19,6 +19,8 @@ import StgCmmUtils
 import HscTypes
 import DynFlags
 
+import Control.Monad
+
 mkTickBox :: DynFlags -> Module -> Int -> CmmAGraph
 mkTickBox dflags mod n
   = mkStore tick_box (CmmMachOp (MO_Add W64)
@@ -36,7 +38,7 @@ initHpc _ (NoHpcInfo {})
   = return ()
 initHpc this_mod (HpcInfo tickCount _hashNo)
   = do dflags <- getDynFlags
-       whenC (dopt Opt_Hpc dflags) $
+       when (dopt Opt_Hpc dflags) $
            do emitDataLits (mkHpcTicksLabel this_mod)
                            [ (CmmInt 0 W64)
                            | _ <- take tickCount [0 :: Int ..]
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index 75d8d1c38f..4742332107 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -111,7 +111,7 @@ emitCall convs fun args
 --
 emitCallWithExtraStack
    :: (Convention, Convention) -> CmmExpr -> [CmmExpr]
-   -> (ByteOff, [(CmmExpr,ByteOff)]) -> FCode ReturnKind
+   -> [CmmExpr] -> FCode ReturnKind
 emitCallWithExtraStack (callConv, retConv) fun args extra_stack
   = do	{ dflags <- getDynFlags
         ; adjustHpBackwards
@@ -124,7 +124,7 @@ emitCallWithExtraStack (callConv, retConv) fun args extra_stack
             AssignTo res_regs _ -> do
               k <- newLabelC
               let area = Young k
-                  (off, copyin) = copyInOflow dflags retConv area res_regs
+                  (off, copyin) = copyInOflow dflags retConv area res_regs []
                   copyout = mkCallReturnsTo dflags fun callConv args k off updfr_off
                                    extra_stack
               emit (copyout <*> mkLabel k <*> copyin)
@@ -222,7 +222,7 @@ direct_call caller call_conv lbl arity args
        emitCallWithExtraStack (call_conv, NativeReturn)
                               target
                               (nonVArgs fast_args)
-                              (mkStkOffsets dflags (stack_args dflags))
+                              (nonVArgs (stack_args dflags))
   where
     target = CmmLit (CmmLabel lbl)
     (fast_args, rest_args) = splitAt real_arity args
@@ -326,32 +326,7 @@ slowCallPattern []		      = (fsLit "stg_ap_0", 0)
 
 
 -------------------------------------------------------------------------
--- Fix the byte-offsets of a bunch of things to push on the stack
-
--- This is used for pushing slow-call continuations.
--- See Note [over-saturated calls].
-
-mkStkOffsets
-  :: DynFlags
-  -> [(ArgRep, Maybe CmmExpr)]    -- things to make offsets for
-  -> ( ByteOff                    -- OUTPUTS: Topmost allocated word
-     , [(CmmExpr, ByteOff)] )     -- things with offsets (voids filtered out)
-mkStkOffsets dflags things
-    = loop 0 [] (reverse things)
-  where
-    loop offset offs [] = (offset,offs)
-    loop offset offs ((_,Nothing):things) = loop offset offs things
-	-- ignore Void arguments
-    loop offset offs ((rep,Just thing):things)
-        = loop thing_off ((thing, thing_off):offs) things
-	where
-          thing_off = offset + argRepSizeW dflags rep * wORD_SIZE dflags
-	    -- offset of thing is offset+size, because we're 
-	    -- growing the stack *downwards* as the offsets increase.
-
-
--------------------------------------------------------------------------
---	Classifying arguments: ArgRep
+--      Classifying arguments: ArgRep
 -------------------------------------------------------------------------
 
 -- ArgRep is not exported (even abstractly)
@@ -472,7 +447,7 @@ mkArgDescr _nm args
        let arg_bits = argBits dflags arg_reps
            arg_reps = filter isNonV (map idArgRep args)
            -- Getting rid of voids eases matching of standard patterns
-       case stdPattern dflags arg_reps of
+       case stdPattern arg_reps of
            Just spec_id -> return (ArgSpec spec_id)
            Nothing      -> return (ArgGen arg_bits)
 
@@ -483,10 +458,9 @@ argBits dflags (arg : args) = take (argRepSizeW dflags arg) (repeat True)
                     ++ argBits dflags args
 
 ----------------------
-stdPattern :: DynFlags -> [ArgRep] -> Maybe StgHalfWord
-stdPattern dflags reps
-  = fmap (toStgHalfWord dflags)
-  $ case reps of
+stdPattern :: [ArgRep] -> Maybe Int
+stdPattern reps
+  = case reps of
 	[]  -> Just ARG_NONE	-- just void args, probably
 	[N] -> Just ARG_N
 	[P] -> Just ARG_P
@@ -545,7 +519,7 @@ emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl args body
         ; let args' = if node_points then (node : arg_regs) else arg_regs
               conv  = if nodeMustPointToIt dflags lf_info then NativeNodeCall
                                                           else NativeDirectCall
-              (offset, _) = mkCallEntry dflags conv args'
+              (offset, _) = mkCallEntry dflags conv args' []
         ; emitClosureAndInfoTable info_tbl conv args' $ body (offset, node, arg_regs)
         }
 
diff --git a/compiler/codeGen/StgCmmMonad.hs b/compiler/codeGen/StgCmmMonad.hs
index fb290d8e96..b7797bdae6 100644
--- a/compiler/codeGen/StgCmmMonad.hs
+++ b/compiler/codeGen/StgCmmMonad.hs
@@ -18,15 +18,16 @@ module StgCmmMonad (
 	FCode,	-- type
 
         initC, runC, thenC, thenFC, listCs,
-        returnFC, nopC, whenC,
+        returnFC, fixC,
 	newUnique, newUniqSupply, 
 
         newLabelC, emitLabel,
 
-	emit, emitDecl, emitProc, emitProcWithConvention, emitSimpleProc,
+        emit, emitDecl, emitProc,
+        emitProcWithConvention, emitProcWithStackFrame,
         emitOutOfLine, emitAssign, emitStore, emitComment,
 
-	getCmm, cgStmtsToBlocks,
+        getCmm, aGraphToGraph,
 	getCodeR, getCode, getHeapUsage,
 
         mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto,
@@ -89,7 +90,30 @@ infixr 9 `thenFC`
 
 
 --------------------------------------------------------
---	The FCode monad and its types
+-- The FCode monad and its types
+--
+-- FCode is the monad plumbed through the Stg->Cmm code generator, and
+-- the Cmm parser.  It contains the following things:
+--
+--  - A writer monad, collecting:
+--    - code for the current function, in the form of a CmmAGraph.
+--      The function "emit" appends more code to this.
+--    - the top-level CmmDecls accumulated so far
+--
+--  - A state monad with:
+--    - the local bindings in scope
+--    - the current heap usage
+--    - a UniqSupply
+--
+--  - A reader monad, for CgInfoDownwards, containing
+--    - DynFlags,
+--    - the current Module
+--    - the static top-level environmnet
+--    - the update-frame offset
+--    - the ticky counter label
+--    - the Sequel (the continuation to return to)
+
+
 --------------------------------------------------------
 
 newtype FCode a = FCode (CgInfoDownwards -> CgState -> (# a, CgState #))
@@ -120,13 +144,6 @@ thenC (FCode m) (FCode k) =
         FCode $ \info_down state -> case m info_down state of
                                      (# _,new_state #) -> k info_down new_state
 
-nopC :: FCode ()
-nopC = return ()
-
-whenC :: Bool -> FCode () -> FCode ()
-whenC True  code  = code
-whenC False _code = nopC
-
 listCs :: [FCode ()] -> FCode ()
 listCs [] = return ()
 listCs (fc:fcs) = do
@@ -141,6 +158,15 @@ thenFC (FCode m) k = FCode $
                  case k m_result of
                    FCode kcode -> kcode info_down new_state
 
+fixC :: (a -> FCode a) -> FCode a
+fixC fcode = FCode (
+	\info_down state -> 
+		let
+                        (v,s) = doFCode (fcode v) info_down state
+                in
+                        (# v, s #)
+	)
+
 --------------------------------------------------------
 --	The code generator environment
 --------------------------------------------------------
@@ -478,7 +504,7 @@ getSequel = do  { info <- getInfoDown
 -- Note: I'm including the size of the original return address
 -- in the size of the update frame -- hence the default case on `get'.
 
-withUpdFrameOff :: UpdFrameOffset -> FCode () -> FCode ()
+withUpdFrameOff :: UpdFrameOffset -> FCode a -> FCode a
 withUpdFrameOff size code
   = do	{ info  <- getInfoDown
 	; withInfoDown code (info {cgd_updfr_off = size }) }
@@ -675,31 +701,60 @@ emitDecl decl
 emitOutOfLine :: BlockId -> CmmAGraph -> FCode ()
 emitOutOfLine l stmts = emitCgStmt (CgFork l stmts)
 
+emitProcWithStackFrame
+   :: Convention                        -- entry convention
+   -> Maybe CmmInfoTable                -- info table?
+   -> CLabel                            -- label for the proc
+   -> [CmmFormal]                       -- stack frame
+   -> [CmmFormal]                       -- arguments
+   -> CmmAGraph                         -- code
+   -> Bool                              -- do stack layout?
+   -> FCode ()
+
+emitProcWithStackFrame _conv mb_info lbl _stk_args [] blocks False
+  = do  { dflags <- getDynFlags
+        ; emitProc_ mb_info lbl blocks (widthInBytes (wordWidth dflags)) False
+        }
+emitProcWithStackFrame conv mb_info lbl stk_args args blocks True -- do layout
+  = do  { dflags <- getDynFlags
+        ; let (offset, entry) = mkCallEntry dflags conv args stk_args
+        ; emitProc_ mb_info lbl (entry <*> blocks) offset True
+        }
+emitProcWithStackFrame _ _ _ _ _ _ _ = panic "emitProcWithStackFrame"
+
 emitProcWithConvention :: Convention -> Maybe CmmInfoTable -> CLabel
-                       -> [CmmFormal] -> CmmAGraph -> FCode ()
+                       -> [CmmFormal]
+                       -> CmmAGraph
+                       -> FCode ()
 emitProcWithConvention conv mb_info lbl args blocks
+  = emitProcWithStackFrame conv mb_info lbl [] args blocks True
+
+emitProc :: Maybe CmmInfoTable -> CLabel -> CmmAGraph -> Int -> FCode ()
+emitProc  mb_info lbl blocks offset
+ = emitProc_ mb_info lbl blocks offset True
+
+emitProc_ :: Maybe CmmInfoTable -> CLabel -> CmmAGraph -> Int -> Bool
+          -> FCode ()
+emitProc_ mb_info lbl blocks offset do_layout
   = do  { dflags <- getDynFlags
-        ; us <- newUniqSupply
-        ; let (offset, entry) = mkCallEntry dflags conv args
-              blks = initUs_ us $ lgraphOfAGraph $ entry <*> blocks
-        ; let sinfo = StackInfo {arg_space = offset, updfr_space = Just (initUpdFrameOff dflags)}
-              tinfo = TopInfo {info_tbls = infos, stack_info=sinfo}
-              proc_block = CmmProc tinfo lbl blks
+        ; l <- newLabelC
+        ; let
+              blks = labelAGraph l blocks
 
-              infos | Just info <- mb_info
-                    = mapSingleton (g_entry blks) info
-                    | otherwise
-                    = mapEmpty
+              infos | Just info <- mb_info = mapSingleton (g_entry blks) info
+                    | otherwise            = mapEmpty
 
-        ; state <- getState
-        ; setState $ state { cgs_tops = cgs_tops state `snocOL` proc_block } }
+              sinfo = StackInfo { arg_space = offset
+                                , updfr_space = Just (initUpdFrameOff dflags)
+                                , do_layout = do_layout }
+
+              tinfo = TopInfo { info_tbls = infos
+                              , stack_info=sinfo}
 
-emitProc :: Maybe CmmInfoTable -> CLabel -> [CmmFormal] -> CmmAGraph -> FCode ()
-emitProc = emitProcWithConvention NativeNodeCall
+              proc_block = CmmProc tinfo lbl blks
 
-emitSimpleProc :: CLabel -> CmmAGraph -> FCode ()
-emitSimpleProc lbl code = 
-  emitProc Nothing lbl [] code
+        ; state <- getState
+        ; setState $ state { cgs_tops = cgs_tops state `snocOL` proc_block } }
 
 getCmm :: FCode () -> FCode CmmGroup
 -- Get all the CmmTops (there should be no stmts)
@@ -735,29 +790,25 @@ mkCmmIfThen e tbranch = do
 
 
 mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmActual]
-       -> UpdFrameOffset -> (ByteOff,[(CmmExpr,ByteOff)]) -> FCode CmmAGraph
+       -> UpdFrameOffset -> [CmmActual] -> FCode CmmAGraph
 mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do
   dflags <- getDynFlags
   k <- newLabelC
   let area = Young k
-      (off, copyin) = copyInOflow dflags retConv area results
+      (off, copyin) = copyInOflow dflags retConv area results []
       copyout = mkCallReturnsTo dflags f callConv actuals k off updfr_off extra_stack
   return (copyout <*> mkLabel k <*> copyin)
 
 mkCmmCall :: CmmExpr -> [CmmFormal] -> [CmmActual] -> UpdFrameOffset
           -> FCode CmmAGraph
 mkCmmCall f results actuals updfr_off
-   = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off (0,[])
+   = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off []
 
 
 -- ----------------------------------------------------------------------------
--- CgStmts
-
--- These functions deal in terms of CgStmts, which is an abstract type
--- representing the code in the current proc.
+-- turn CmmAGraph into CmmGraph, for making a new proc.
 
--- turn CgStmts into [CmmBasicBlock], for making a new proc.
-cgStmtsToBlocks :: CmmAGraph -> FCode CmmGraph
-cgStmtsToBlocks stmts
-  = do  { us <- newUniqSupply
-	; return (initUs_ us (lgraphOfAGraph stmts)) }	
+aGraphToGraph :: CmmAGraph -> FCode CmmGraph
+aGraphToGraph stmts
+  = do  { l <- newLabelC
+        ; return (labelAGraph l stmts) }
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs
index cbb2aa70bd..97104ce4a2 100644
--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -97,7 +97,7 @@ cgOpApp (StgPrimOp primop) args res_ty
   | primOpOutOfLine primop
   = do	{ cmm_args <- getNonVoidArgAmodes args
         ; let fun = CmmLit (CmmLabel (mkRtsPrimOpLabel primop))
-        ; emitCall (PrimOpCall, PrimOpReturn) fun cmm_args }
+        ; emitCall (NativeNodeCall, NativeReturn) fun cmm_args }
 
   | ReturnsPrim VoidRep <- result_info
   = do cgPrimOp [] primop args 
@@ -130,7 +130,7 @@ cgOpApp (StgPrimOp primop) args res_ty
 cgOpApp (StgPrimCallOp primcall) args _res_ty
   = do	{ cmm_args <- getNonVoidArgAmodes args
         ; let fun = CmmLit (CmmLabel (mkPrimCallLabel primcall))
-        ; emitCall (PrimOpCall, PrimOpReturn) fun cmm_args }
+        ; emitCall (NativeNodeCall, NativeReturn) fun cmm_args }
 
 ---------------------------------------------------
 cgPrimOp   :: [LocalReg]	-- where to put the results
diff --git a/compiler/codeGen/StgCmmProf.hs b/compiler/codeGen/StgCmmProf.hs
index b666554403..1b218462e1 100644
--- a/compiler/codeGen/StgCmmProf.hs
+++ b/compiler/codeGen/StgCmmProf.hs
@@ -82,24 +82,22 @@ costCentreFrom :: DynFlags
 	       -> CmmExpr	-- The cost centre from that closure
 costCentreFrom dflags cl = CmmLoad (cmmOffsetB dflags cl (oFFSET_StgHeader_ccs dflags)) (ccsType dflags)
 
+-- | The profiling header words in a static closure
 staticProfHdr :: DynFlags -> CostCentreStack -> [CmmLit]
--- The profiling header words in a static closure
--- Was SET_STATIC_PROF_HDR
 staticProfHdr dflags ccs
  = ifProfilingL dflags [mkCCostCentreStack ccs, staticLdvInit dflags]
 
+-- | Profiling header words in a dynamic closure
 dynProfHdr :: DynFlags -> CmmExpr -> [CmmExpr]
--- Profiling header words in a dynamic closure
 dynProfHdr dflags ccs = ifProfilingL dflags [ccs, dynLdvInit dflags]
 
-initUpdFrameProf :: ByteOff -> FCode ()
--- Initialise the profiling field of an update frame
-initUpdFrameProf frame_off
+-- | Initialise the profiling field of an update frame
+initUpdFrameProf :: CmmExpr -> FCode ()
+initUpdFrameProf frame
   = ifProfiling $	-- frame->header.prof.ccs = CCCS
     do dflags <- getDynFlags
-       emitStore (CmmStackSlot Old (frame_off - oFFSET_StgHeader_ccs dflags))
-                 curCCS
-	-- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0) 
+       emitStore (cmmOffset dflags frame (oFFSET_StgHeader_ccs dflags)) curCCS
+        -- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0)
 	-- is unnecessary because it is not used anyhow.
 
 ---------------------------------------------------------------------------
@@ -200,7 +198,7 @@ ifProfiling code
   = do dflags <- getDynFlags
        if dopt Opt_SccProfilingOn dflags
            then code
-           else nopC
+           else return ()
 
 ifProfilingL :: DynFlags -> [a] -> [a]
 ifProfilingL dflags xs
@@ -216,7 +214,7 @@ initCostCentres :: CollectedCCs -> FCode ()
 -- Emit the declarations
 initCostCentres (local_CCs, ___extern_CCs, singleton_CCSs)
   = do dflags <- getDynFlags
-       whenC (dopt Opt_SccProfilingOn dflags) $
+       when (dopt Opt_SccProfilingOn dflags) $
            do mapM_ emitCostCentreDecl local_CCs
               mapM_ emitCostCentreStackDecl singleton_CCSs
 
@@ -283,7 +281,7 @@ emitSetCCC :: CostCentre -> Bool -> Bool -> FCode ()
 emitSetCCC cc tick push
  = do dflags <- getDynFlags
       if not (dopt Opt_SccProfilingOn dflags)
-          then nopC
+          then return ()
           else do tmp <- newTemp (ccsType dflags) -- TODO FIXME NOW
                   pushCostCentre tmp curCCS cc
                   when tick $ emit (bumpSccCount dflags (CmmReg (CmmLocal tmp)))
@@ -321,7 +319,7 @@ dynLdvInit :: DynFlags -> CmmExpr
 dynLdvInit dflags =     -- (era << LDV_SHIFT) | LDV_STATE_CREATE  
   CmmMachOp (mo_wordOr dflags) [
       CmmMachOp (mo_wordShl dflags) [loadEra dflags, mkIntExpr dflags (lDV_SHIFT dflags)],
-      CmmLit (mkWordCLit dflags (lDV_STATE_CREATE dflags))
+      CmmLit (mkWordCLit dflags (iLDV_STATE_CREATE dflags))
   ]
         
 --
@@ -350,8 +348,8 @@ ldvEnter cl_ptr = do
     let -- don't forget to substract node's tag
         ldv_wd = ldvWord dflags cl_ptr
         new_ldv_wd = cmmOrWord dflags (cmmAndWord dflags (CmmLoad ldv_wd (bWord dflags))
-                                                         (CmmLit (mkWordCLit dflags (lDV_CREATE_MASK dflags))))
-                                      (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (lDV_STATE_USE dflags))))
+                                                         (CmmLit (mkWordCLit dflags (iLDV_CREATE_MASK dflags))))
+                                      (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (iLDV_STATE_USE dflags))))
     ifProfiling $
          -- if (era > 0) {
          --    LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) |
@@ -371,10 +369,3 @@ ldvWord :: DynFlags -> CmmExpr -> CmmExpr
 ldvWord dflags closure_ptr
     = cmmOffsetB dflags closure_ptr (oFFSET_StgHeader_ldvw dflags)
 
-lDV_CREATE_MASK :: DynFlags -> StgWord
-lDV_CREATE_MASK dflags = toStgWord dflags (iLDV_CREATE_MASK dflags)
-lDV_STATE_CREATE :: DynFlags -> StgWord
-lDV_STATE_CREATE dflags = toStgWord dflags (iLDV_STATE_CREATE dflags)
-lDV_STATE_USE :: DynFlags -> StgWord
-lDV_STATE_USE dflags = toStgWord dflags (iLDV_STATE_USE dflags)
-
diff --git a/compiler/codeGen/StgCmmTicky.hs b/compiler/codeGen/StgCmmTicky.hs
index 79ad3ff822..01babb212f 100644
--- a/compiler/codeGen/StgCmmTicky.hs
+++ b/compiler/codeGen/StgCmmTicky.hs
@@ -333,7 +333,7 @@ tickyAllocHeap hp
 ifTicky :: FCode () -> FCode ()
 ifTicky code = do dflags <- getDynFlags
                   if dopt Opt_Ticky dflags then code
-                                           else nopC
+                                           else return ()
 
 -- All the ticky-ticky counters are declared "unsigned long" in C
 bumpTickyCounter :: FastString -> FCode ()
diff --git a/compiler/codeGen/StgCmmUtils.hs b/compiler/codeGen/StgCmmUtils.hs
index 386e7f46d6..138e00ee52 100644
--- a/compiler/codeGen/StgCmmUtils.hs
+++ b/compiler/codeGen/StgCmmUtils.hs
@@ -36,7 +36,6 @@ module StgCmmUtils (
         addToMem, addToMemE, addToMemLbl,
         mkWordCLit,
         newStringCLit, newByteStringCLit,
-        packHalfWordsCLit,
         blankWord
   ) where
 
@@ -196,9 +195,9 @@ emitRtsCallGen res pkg fun args safe
     call updfr_off =
       if safe then
         emit =<< mkCmmCall fun_expr res' args' updfr_off
-      else
-        emit $ mkUnsafeCall (ForeignTarget fun_expr
-                         (ForeignConvention CCallConv arg_hints res_hints)) res' args'
+      else do
+        let conv = ForeignConvention CCallConv arg_hints res_hints CmmMayReturn
+        emit $ mkUnsafeCall (ForeignTarget fun_expr conv) res' args'
     (args', arg_hints) = unzip args
     (res',  res_hints) = unzip res
     fun_expr = mkLblExpr (mkCmmCodeLabel pkg fun)
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index 6d83150eb6..888ff1a0be 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -214,7 +214,6 @@ Library
         CgClosure
         CgCon
         CgExpr
-        CgExtCode
         CgForeignCall
         CgHeapery
         CgHpc
@@ -244,6 +243,7 @@ Library
         StgCmmProf
         StgCmmTicky
         StgCmmUtils
+        StgCmmExtCode
         ClosureInfo
         SMRep
         CoreArity
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 448bd4d94c..207a237b7e 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -274,7 +274,6 @@ genCall env target res args ret = do
             CCallConv    -> CC_Ccc
             CApiConv     -> CC_Ccc
             PrimCallConv -> CC_Ccc
-            CmmCallConv  -> panic "CmmCallConv not supported here!"
 
     {-
         Some of the possibilities here are a worry with the use of a custom
diff --git a/compiler/main/HscMain.hs b/compiler/main/HscMain.hs
index 04f89bf63e..62a472037b 100644
--- a/compiler/main/HscMain.hs
+++ b/compiler/main/HscMain.hs
@@ -1349,7 +1349,11 @@ hscCompileCmmFile hsc_env filename = runHsc hsc_env $ do
     let dflags = hsc_dflags hsc_env
     cmm <- ioMsgMaybe $ parseCmmFile dflags filename
     liftIO $ do
-        rawCmms <- cmmToRawCmm dflags (Stream.yield cmm)
+        us <- mkSplitUniqSupply 'S'
+        let initTopSRT = initUs_ us emptySRT
+        dumpIfSet_dyn dflags Opt_D_dump_cmmz "Parsed Cmm" (ppr cmm)
+        (_, cmmgroup) <- cmmPipeline hsc_env initTopSRT cmm
+        rawCmms <- cmmToRawCmm dflags (Stream.yield (cmmOfZgraph cmmgroup))
         _ <- codeOutput dflags no_mod no_loc NoStubs [] rawCmms
         return ()
   where
diff --git a/compiler/nativeGen/AsmCodeGen.lhs b/compiler/nativeGen/AsmCodeGen.lhs
index 47fd96c426..d0e4a17746 100644
--- a/compiler/nativeGen/AsmCodeGen.lhs
+++ b/compiler/nativeGen/AsmCodeGen.lhs
@@ -971,12 +971,13 @@ cmmStmtConFold stmt
 cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
 cmmExprConFold referenceKind expr = do
     dflags <- getDynFlags
-    -- Skip constant folding if new code generator is running
-    -- (this optimization is done in Hoopl)
-    -- SDM: re-enabled for now, while cmmRewriteAssignments is turned off
-    let expr' = if False -- dopt Opt_TryNewCodeGen dflags
+
+    -- With -O1 and greater, the cmmSink pass does constant-folding, so
+    -- we don't need to do it again here.
+    let expr' = if optLevel dflags >= 1
                     then expr
                     else cmmExprCon dflags expr
+
     cmmExprNative referenceKind expr'
 
 cmmExprCon :: DynFlags -> CmmExpr -> CmmExpr
diff --git a/compiler/nativeGen/RegAlloc/Linear/Main.hs b/compiler/nativeGen/RegAlloc/Linear/Main.hs
index a15bca07e7..3f1efe5824 100644
--- a/compiler/nativeGen/RegAlloc/Linear/Main.hs
+++ b/compiler/nativeGen/RegAlloc/Linear/Main.hs
@@ -434,10 +434,21 @@ raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live))
                         (uniqSetToList $ liveDieRead live)
                         (uniqSetToList $ liveDieWrite live)
 
-
 raInsn _ _ _ instr
         = pprPanic "raInsn" (text "no match for:" <> ppr instr)
 
+-- ToDo: what can we do about
+--
+--     R1 = x
+--     jump I64[x] // [R1]
+--
+-- where x is mapped to the same reg as R1.  We want to coalesce x and
+-- R1, but the register allocator doesn't know whether x will be
+-- assigned to again later, in which case x and R1 should be in
+-- different registers.  Right now we assume the worst, and the
+-- assignment to R1 will clobber x, so we'll spill x into another reg,
+-- generating another reg->reg move.
+
 
 isInReg :: Reg -> RegMap Loc -> Bool
 isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
diff --git a/compiler/prelude/ForeignCall.lhs b/compiler/prelude/ForeignCall.lhs
index b3a2ad3ff1..b53ae7cf50 100644
--- a/compiler/prelude/ForeignCall.lhs
+++ b/compiler/prelude/ForeignCall.lhs
@@ -156,8 +156,7 @@ platforms.
 See: http://www.programmersheaven.com/2/Calling-conventions
 
 \begin{code}
-data CCallConv = CCallConv | CApiConv | StdCallConv
-               | CmmCallConv | PrimCallConv
+data CCallConv = CCallConv | CApiConv | StdCallConv | PrimCallConv
   deriving (Eq, Data, Typeable)
   {-! derive: Binary !-}
 
@@ -165,7 +164,6 @@ instance Outputable CCallConv where
   ppr StdCallConv = ptext (sLit "stdcall")
   ppr CCallConv   = ptext (sLit "ccall")
   ppr CApiConv    = ptext (sLit "capi")
-  ppr CmmCallConv = ptext (sLit "C--")
   ppr PrimCallConv = ptext (sLit "prim")
 
 defaultCCallConv :: CCallConv
@@ -175,7 +173,6 @@ ccallConvToInt :: CCallConv -> Int
 ccallConvToInt StdCallConv = 0
 ccallConvToInt CCallConv   = 1
 ccallConvToInt CApiConv    = panic "ccallConvToInt CApiConv"
-ccallConvToInt (CmmCallConv {})  = panic "ccallConvToInt CmmCallConv"
 ccallConvToInt (PrimCallConv {}) = panic "ccallConvToInt PrimCallConv"
 \end{code}
 
@@ -187,7 +184,6 @@ ccallConvAttribute :: CCallConv -> SDoc
 ccallConvAttribute StdCallConv       = text "__attribute__((__stdcall__))"
 ccallConvAttribute CCallConv         = empty
 ccallConvAttribute CApiConv          = empty
-ccallConvAttribute (CmmCallConv {})  = panic "ccallConvAttribute CmmCallConv"
 ccallConvAttribute (PrimCallConv {}) = panic "ccallConvAttribute PrimCallConv"
 \end{code}
 
@@ -326,17 +322,14 @@ instance Binary CCallConv where
             putByte bh 1
     put_ bh PrimCallConv = do
             putByte bh 2
-    put_ bh CmmCallConv = do
-            putByte bh 3
     put_ bh CApiConv = do
-            putByte bh 4
+            putByte bh 3
     get bh = do
             h <- getByte bh
             case h of
               0 -> do return CCallConv
               1 -> do return StdCallConv
               2 -> do return PrimCallConv
-              3 -> do return CmmCallConv
               _ -> do return CApiConv
 
 instance Binary CType where
diff --git a/compiler/typecheck/TcForeign.lhs b/compiler/typecheck/TcForeign.lhs
index 90a174081c..9c8b9807f2 100644
--- a/compiler/typecheck/TcForeign.lhs
+++ b/compiler/typecheck/TcForeign.lhs
@@ -464,7 +464,6 @@ checkCConv StdCallConv  = do dflags <- getDynFlags
                                          return CCallConv
 checkCConv PrimCallConv = do addErrTc (text "The `prim' calling convention can only be used with `foreign import'")
                              return PrimCallConv
-checkCConv CmmCallConv  = panic "checkCConv CmmCallConv"
 \end{code}
 
 Warnings
diff --git a/includes/Cmm.h b/includes/Cmm.h
index edcf46e7c0..afe08a26a3 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -9,36 +9,6 @@
  *
  * For the syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
  *
- * If you're used to the old HC file syntax, here's a quick cheat sheet
- * for converting HC code:
- *
- *       - Remove FB_/FE_
- *       - Remove all type casts
- *       - Remove '&'
- *       - STGFUN(foo) { ... }  ==>  foo { ... }
- *       - FN_(foo) { ... }  ==>  foo { ... }
- *       - JMP_(e)  ==> jump e;
- *       - Remove EXTFUN(foo)
- *       - Sp[n]  ==>  Sp(n)
- *       - Hp[n]  ==>  Hp(n)
- *       - Sp += n  ==> Sp_adj(n)
- *       - Hp += n  ==> Hp_adj(n)
- *       - R1.i   ==>  R1   (similarly for R1.w, R1.cl etc.)
- *       - You need to explicitly dereference variables; eg. 
- *             alloc_blocks   ==>  CInt[alloc_blocks]
- *       - convert all word offsets into byte offsets:
- *         	- e ==> WDS(e)
- *       - sizeofW(StgFoo)  ==>  SIZEOF_StgFoo
- *       - ENTRY_CODE(e)  ==>  %ENTRY_CODE(e)
- *       - get_itbl(c)  ==>  %GET_STD_INFO(c)
- *       - Change liveness masks in STK_CHK_GEN, HP_CHK_GEN:
- *        	R1_PTR | R2_PTR  ==>  R1_PTR & R2_PTR
- *         	(NOTE: | becomes &)
- *       - Declarations like 'StgPtr p;' become just 'W_ p;'
- *       - e->payload[n] ==> PAYLOAD(e,n)
- *       - Be very careful with comparisons: the infix versions (>, >=, etc.)
- *         are unsigned, so use %lt(a,b) to get signed less-than for example.
- *
  * Accessing fields of structures defined in the RTS header files is
  * done via automatically-generated macros in DerivedConstants.h.  For
  * example, where previously we used
@@ -136,6 +106,8 @@
    Misc useful stuff
    -------------------------------------------------------------------------- */
 
+#define ccall foreign "C"
+
 #define NULL (0::W_)
 
 #define STRING(name,str)			\
@@ -210,7 +182,7 @@
 #define Sp(n)  W_[Sp + WDS(n)]
 #define Hp(n)  W_[Hp + WDS(n)]
 
-#define Sp_adj(n) Sp = Sp + WDS(n)
+#define Sp_adj(n) Sp = Sp + WDS(n)  /* pronounced "spadge" */
 #define Hp_adj(n) Hp = Hp + WDS(n)
 
 /* -----------------------------------------------------------------------------
@@ -278,25 +250,37 @@
 #define LOAD_INFO \
     info = %INFO_PTR(UNTAG(P1));
 
-#define UNTAG_R1 \
-    P1 = UNTAG(P1);
+#define MAYBE_UNTAG(x) UNTAG(x);
 
 #else
 
-#define LOAD_INFO                               \
-  if (GETTAG(P1) != 0) {                        \
-      jump %ENTRY_CODE(Sp(0));                  \
+#define LOAD_INFO(ret,x)                        \
+  if (GETTAG(x) != 0) {                         \
+      ret(x);                                   \
   }                                             \
-  info = %INFO_PTR(P1);
+  info = %INFO_PTR(x);
 
-#define UNTAG_R1 /* nothing */
+#define MAYBE_UNTAG(x) (x) /* already untagged */
 
 #endif
 
-#define ENTER()						\
+// We need two versions of ENTER():
+//  - ENTER(x) takes the closure as an argument and uses return(),
+//    for use in civilized code where the stack is handled by GHC
+//
+//  - ENTER_NOSTACK() where the closure is in R1, and returns are
+//    explicit jumps, for use when we are doing the stack management
+//    ourselves.
+
+#define ENTER(x) ENTER_(return,x)
+#define ENTER_R1() ENTER_(RET_R1,R1)
+
+#define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
+
+#define ENTER_(ret,x)                                   \
  again:							\
   W_ info;						\
-  LOAD_INFO                                             \
+  LOAD_INFO(ret,x)                                       \
   switch [INVALID_OBJECT .. N_CLOSURE_TYPES]		\
          (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {	\
   case							\
@@ -304,7 +288,7 @@
     IND_PERM,						\
     IND_STATIC:						\
    {							\
-      P1 = StgInd_indirectee(P1);			\
+      x = StgInd_indirectee(x);                         \
       goto again;					\
    }							\
   case							\
@@ -318,12 +302,12 @@
     BCO,						\
     PAP:						\
    {							\
-      jump %ENTRY_CODE(Sp(0));				\
+       ret(x);                                          \
    }							\
   default:						\
    {							\
-      UNTAG_R1                                          \
-      jump %ENTRY_CODE(info);				\
+       x = MAYBE_UNTAG(x);                              \
+       jump %ENTRY_CODE(info) (x);                      \
    }							\
   }
 
@@ -348,7 +332,6 @@
  */
 #include "stg/RtsMachRegs.h"
 
-#include "rts/storage/Liveness.h"
 #include "rts/prof/LDV.h"
 
 #undef BLOCK_SIZE
@@ -359,6 +342,18 @@
 #define MyCapability()  (BaseReg - OFFSET_Capability_r)
 
 /* -------------------------------------------------------------------------
+   Info tables
+   ------------------------------------------------------------------------- */
+
+#if defined(PROFILING)
+#define PROF_HDR_FIELDS(w_)                     \
+  w_ prof_hdr_1,                                \
+  w_ prof_hdr_2,
+#else
+#define PROF_HDR_FIELDS(w_) /* nothing */
+#endif
+
+/* -------------------------------------------------------------------------
    Allocation and garbage collection
    ------------------------------------------------------------------------- */
 
@@ -371,30 +366,134 @@
  * ticky-ticky.  It's not clear whether eg. the size field of an array
  * should be counted as "admin", or the various fields of a BCO.
  */
-#define ALLOC_PRIM(bytes,liveness,reentry)			\
-   HP_CHK_GEN_TICKY(bytes,liveness,reentry);			\
+#define ALLOC_PRIM(bytes)                                       \
+   HP_CHK_GEN_TICKY(bytes);                                     \
    TICK_ALLOC_PRIM(SIZEOF_StgHeader,bytes-SIZEOF_StgHeader,0);	\
    CCCS_ALLOC(bytes);
 
+#define HEAP_CHECK(bytes,failure)                       \
+    Hp = Hp + bytes;                                    \
+    if (Hp > HpLim) { HpAlloc = bytes; failure; }       \
+    TICK_ALLOC_HEAP_NOCTR(bytes);
+
+#define ALLOC_PRIM_WITH_CUSTOM_FAILURE(bytes,failure)           \
+    HEAP_CHECK(bytes,failure)                                   \
+    TICK_ALLOC_PRIM(SIZEOF_StgHeader,bytes-SIZEOF_StgHeader,0); \
+    CCCS_ALLOC(bytes);
+
+#define ALLOC_PRIM_P(bytes,fun,arg)                             \
+    ALLOC_PRIM_WITH_CUSTOM_FAILURE(bytes,GC_PRIM_P(fun,arg));
+
+#define ALLOC_PRIM_N(bytes,fun,arg)                             \
+    ALLOC_PRIM_WITH_CUSTOM_FAILURE(bytes,GC_PRIM_N(fun,arg));
+
 /* CCS_ALLOC wants the size in words, because ccs->mem_alloc is in words */
 #define CCCS_ALLOC(__alloc) CCS_ALLOC(BYTES_TO_WDS(__alloc), CCCS)
 
-#define HP_CHK_GEN_TICKY(alloc,liveness,reentry)	\
-   HP_CHK_GEN(alloc,liveness,reentry);			\
+#define HP_CHK_GEN_TICKY(alloc)                 \
+   HP_CHK_GEN(alloc);                           \
    TICK_ALLOC_HEAP_NOCTR(alloc);
 
+#define HP_CHK_P(bytes, fun, arg)               \
+   HEAP_CHECK(bytes, GC_PRIM_P(fun,arg))
+
+#define ALLOC_P_TICKY(alloc, fun, arg)         \
+   HP_CHK_P(alloc);                           \
+   TICK_ALLOC_HEAP_NOCTR(alloc);
+
+#define CHECK_GC()                                                      \
+  (bdescr_link(CurrentNursery) == NULL ||                               \
+   generation_n_new_large_words(W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
+
 // allocate() allocates from the nursery, so we check to see
 // whether the nursery is nearly empty in any function that uses
 // allocate() - this includes many of the primops.
-#define MAYBE_GC(liveness,reentry)			\
-    if (bdescr_link(CurrentNursery) == NULL || \
-        generation_n_new_large_words(W_[g0]) >= TO_W_(CLong[large_alloc_lim])) {   \
-	R9  = liveness;					\
-        R10 = reentry;					\
-        HpAlloc = 0;					\
-        jump stg_gc_gen_hp;				\
+//
+// HACK alert: the __L__ stuff is here to coax the common-block
+// eliminator into commoning up the call stg_gc_noregs() with the same
+// code that gets generated by a STK_CHK_GEN() in the same proc.  We
+// also need an if (0) { goto __L__; } so that the __L__ label isn't
+// optimised away by the control-flow optimiser prior to common-block
+// elimination (it will be optimised away later).
+//
+// This saves some code in gmp-wrappers.cmm where we have lots of
+// MAYBE_GC() in the same proc as STK_CHK_GEN().
+//
+#define MAYBE_GC(retry)                         \
+    if (CHECK_GC()) {                           \
+        HpAlloc = 0;                            \
+        goto __L__;                             \
+  __L__:                                        \
+        call stg_gc_noregs();                   \
+        goto retry;                             \
+   }                                            \
+   if (0) { goto __L__; }
+
+#define GC_PRIM(fun)                            \
+        R9 = fun;                               \
+        jump stg_gc_prim();
+
+#define GC_PRIM_N(fun,arg)                      \
+        R9 = fun;                               \
+        jump stg_gc_prim_n(arg);
+
+#define GC_PRIM_P(fun,arg)                      \
+        R9 = fun;                               \
+        jump stg_gc_prim_p(arg);
+
+#define GC_PRIM_PP(fun,arg1,arg2)               \
+        R9 = fun;                               \
+        jump stg_gc_prim_pp(arg1,arg2);
+
+#define MAYBE_GC_(fun)                          \
+    if (CHECK_GC()) {                           \
+        HpAlloc = 0;                            \
+        GC_PRIM(fun)                            \
+   }
+
+#define MAYBE_GC_N(fun,arg)                     \
+    if (CHECK_GC()) {                           \
+        HpAlloc = 0;                            \
+        GC_PRIM_N(fun,arg)                      \
+   }
+
+#define MAYBE_GC_P(fun,arg)                     \
+    if (CHECK_GC()) {                           \
+        HpAlloc = 0;                            \
+        GC_PRIM_P(fun,arg)                      \
    }
 
+#define MAYBE_GC_PP(fun,arg1,arg2)              \
+    if (CHECK_GC()) {                           \
+        HpAlloc = 0;                            \
+        GC_PRIM_PP(fun,arg1,arg2)               \
+   }
+
+#define STK_CHK(n, fun)                         \
+    if (Sp - n < SpLim) {                       \
+        GC_PRIM(fun)                            \
+    }
+
+#define STK_CHK_P(n, fun, arg)                  \
+    if (Sp - n < SpLim) {                       \
+        GC_PRIM_P(fun,arg)                      \
+    }
+
+#define STK_CHK_PP(n, fun, arg1, arg2)          \
+    if (Sp - n < SpLim) {                       \
+        GC_PRIM_PP(fun,arg1,arg2)               \
+    }
+
+#define STK_CHK_ENTER(n, closure)               \
+    if (Sp - n < SpLim) {                       \
+        jump __stg_gc_enter_1(closure);         \
+    }
+
+// A funky heap check used by AutoApply.cmm
+
+#define HP_CHK_NP_ASSIGN_SP0(size,f)                    \
+    HEAP_CHECK(size, Sp(0) = f; jump __stg_gc_enter_1 [R1];)
+
 /* -----------------------------------------------------------------------------
    Closure headers
    -------------------------------------------------------------------------- */
@@ -481,23 +580,6 @@
 #endif
 
 /* -----------------------------------------------------------------------------
-   Voluntary Yields/Blocks
-
-   We only have a generic version of this at the moment - if it turns
-   out to be slowing us down we can make specialised ones.
-   -------------------------------------------------------------------------- */
-
-#define YIELD(liveness,reentry)			\
-   R9  = liveness;				\
-   R10 = reentry;				\
-   jump stg_gen_yield;
-
-#define BLOCK(liveness,reentry)			\
-   R9  = liveness;				\
-   R10 = reentry;				\
-   jump stg_gen_block;
-
-/* -----------------------------------------------------------------------------
    Ticky macros 
    -------------------------------------------------------------------------- */
 
@@ -585,6 +667,63 @@
     TICK_BUMP_BY(ALLOC_HEAP_tot,n)
 
 /* -----------------------------------------------------------------------------
+   Saving and restoring STG registers
+
+   STG registers must be saved around a C call, just in case the STG
+   register is mapped to a caller-saves machine register.  Normally we
+   don't need to worry about this the code generator has already
+   loaded any live STG registers into variables for us, but in
+   hand-written low-level Cmm code where we don't know which registers
+   are live, we might have to save them all.
+   -------------------------------------------------------------------------- */
+
+#define SAVE_STGREGS                            \
+    W_ r1, r2, r3,  r4,  r5,  r6,  r7,  r8;     \
+    F_ f1, f2, f3, f4;                          \
+    D_ d1, d2;                                  \
+    L_ l1;                                      \
+                                                \
+    r1 = R1;                                    \
+    r2 = R2;                                    \
+    r3 = R3;                                    \
+    r4 = R4;                                    \
+    r5 = R5;                                    \
+    r6 = R6;                                    \
+    r7 = R7;                                    \
+    r8 = R8;                                    \
+                                                \
+    f1 = F1;                                    \
+    f2 = F2;                                    \
+    f3 = F3;                                    \
+    f4 = F4;                                    \
+                                                \
+    d1 = D1;                                    \
+    d2 = D2;                                    \
+                                                \
+    l1 = L1;
+
+
+#define RESTORE_STGREGS                         \
+    R1 = r1;                                    \
+    R2 = r2;                                    \
+    R3 = r3;                                    \
+    R4 = r4;                                    \
+    R5 = r5;                                    \
+    R6 = r6;                                    \
+    R7 = r7;                                    \
+    R8 = r8;                                    \
+                                                \
+    F1 = f1;                                    \
+    F2 = f2;                                    \
+    F3 = f3;                                    \
+    F4 = f4;                                    \
+                                                \
+    D1 = d1;                                    \
+    D2 = d2;                                    \
+                                                \
+    L1 = l1;
+
+/* -----------------------------------------------------------------------------
    Misc junk
    -------------------------------------------------------------------------- */
 
@@ -592,14 +731,14 @@
 #define END_TSO_QUEUE             stg_END_TSO_QUEUE_closure
 #define END_INVARIANT_CHECK_QUEUE stg_END_INVARIANT_CHECK_QUEUE_closure
 
-#define recordMutableCap(p, gen, regs)					\
+#define recordMutableCap(p, gen)                                        \
   W_ __bd;								\
   W_ mut_list;								\
   mut_list = Capability_mut_lists(MyCapability()) + WDS(gen);		\
  __bd = W_[mut_list];							\
   if (bdescr_free(__bd) >= bdescr_start(__bd) + BLOCK_SIZE) {		\
       W_ __new_bd;							\
-      ("ptr" __new_bd) = foreign "C" allocBlock_lock() [regs];		\
+      ("ptr" __new_bd) = foreign "C" allocBlock_lock();			\
       bdescr_link(__new_bd) = __bd;					\
       __bd = __new_bd;							\
       W_[mut_list] = __bd;						\
@@ -609,13 +748,13 @@
   W_[free] = p;								\
   bdescr_free(__bd) = free + WDS(1);
 
-#define recordMutable(p, regs)                                  \
+#define recordMutable(p)                                        \
       P_ __p;                                                   \
       W_ __bd;                                                  \
       W_ __gen;                                                 \
       __p = p;                                                  \
       __bd = Bdescr(__p);                                       \
       __gen = TO_W_(bdescr_gen_no(__bd));                       \
-      if (__gen > 0) { recordMutableCap(__p, __gen, regs); }
+      if (__gen > 0) { recordMutableCap(__p, __gen); }
 
 #endif /* CMM_H */
diff --git a/includes/Rts.h b/includes/Rts.h
index c52fe63d78..b31776828f 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -208,7 +208,6 @@ INLINE_HEADER Time fsecondsToTime (double t)
 #include "rts/storage/FunTypes.h"
 #include "rts/storage/InfoTables.h"
 #include "rts/storage/Closures.h"
-#include "rts/storage/Liveness.h"
 #include "rts/storage/ClosureTypes.h"
 #include "rts/storage/TSO.h"
 #include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */
diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h
index cd741be7e0..2fab041c22 100644
--- a/includes/rts/Constants.h
+++ b/includes/rts/Constants.h
@@ -118,11 +118,6 @@
    pushed in one of the heap check fragments in HeapStackCheck.hc
    (ie. currently the generic heap checks - 3 words for StgRetDyn,
    18 words for the saved registers, see StgMacros.h).
-
-   In the event of an unboxed tuple or let-no-escape stack/heap check
-   failure, there will be other words on the stack which are covered
-   by the RET_DYN frame.  These will have been accounted for by stack
-   checks however, so we don't need to allow for them here.
    -------------------------------------------------------------------------- */
 
 #define RESERVED_STACK_WORDS 21
@@ -277,25 +272,6 @@
  */
 #define TSO_SQUEEZED 128
 
-/* -----------------------------------------------------------------------------
-   RET_DYN stack frames
-   -------------------------------------------------------------------------- */
-
-/* VERY MAGIC CONSTANTS!
- * must agree with code in HeapStackCheck.c, stg_gen_chk, and
- * RESERVED_STACK_WORDS in Constants.h.
- */
-#define RET_DYN_BITMAP_SIZE 8
-#define RET_DYN_NONPTR_REGS_SIZE 10
-
-/* Sanity check that RESERVED_STACK_WORDS is reasonable.  We can't
- * just derive RESERVED_STACK_WORDS because it's used in Haskell code
- * too.
- */
-#if RESERVED_STACK_WORDS != (3 + RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE)
-#error RESERVED_STACK_WORDS may be wrong!
-#endif
-
 /*
  * The number of times we spin in a spin lock before yielding (see
  * #3758).  To tune this value, use the benchmark in #3758: run the
diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h
index 6fdd55727a..dd5f428135 100644
--- a/includes/rts/storage/ClosureMacros.h
+++ b/includes/rts/storage/ClosureMacros.h
@@ -410,14 +410,6 @@ EXTERN_INLINE StgWord stack_frame_sizeW( StgClosure *frame )
     info = get_ret_itbl(frame);
     switch (info->i.type) {
 
-    case RET_DYN:
-    {
-	StgRetDyn *dyn = (StgRetDyn *)frame;
-	return  sizeofW(StgRetDyn) + RET_DYN_BITMAP_SIZE + 
-	    RET_DYN_NONPTR_REGS_SIZE +
-	    RET_DYN_PTRS(dyn->liveness) + RET_DYN_NONPTRS(dyn->liveness);
-    }
-	    
     case RET_FUN:
 	return sizeofW(StgRetFun) + ((StgRetFun *)frame)->size;
 
diff --git a/includes/rts/storage/ClosureTypes.h b/includes/rts/storage/ClosureTypes.h
index 75ec08bf18..4e3b1e6a72 100644
--- a/includes/rts/storage/ClosureTypes.h
+++ b/includes/rts/storage/ClosureTypes.h
@@ -52,33 +52,32 @@
 #define RET_BCO                 31
 #define RET_SMALL	        32
 #define RET_BIG		        33
-#define RET_DYN		        34
-#define RET_FUN                 35
-#define UPDATE_FRAME	        36
-#define CATCH_FRAME	        37
-#define UNDERFLOW_FRAME         38
-#define STOP_FRAME              39
-#define BLOCKING_QUEUE		40
-#define BLACKHOLE	        41
-#define MVAR_CLEAN	        42
-#define MVAR_DIRTY	        43
-#define ARR_WORDS	        44
-#define MUT_ARR_PTRS_CLEAN      45
-#define MUT_ARR_PTRS_DIRTY      46
-#define MUT_ARR_PTRS_FROZEN0    47
-#define MUT_ARR_PTRS_FROZEN     48
-#define MUT_VAR_CLEAN	        49
-#define MUT_VAR_DIRTY	        50
-#define WEAK		        51
-#define PRIM		        52
-#define MUT_PRIM                53
-#define TSO		        54
-#define STACK                   55
-#define TREC_CHUNK              56
-#define ATOMICALLY_FRAME        57
-#define CATCH_RETRY_FRAME       58
-#define CATCH_STM_FRAME         59
-#define WHITEHOLE               60
-#define N_CLOSURE_TYPES         61
+#define RET_FUN                 34
+#define UPDATE_FRAME	        35
+#define CATCH_FRAME	        36
+#define UNDERFLOW_FRAME         37
+#define STOP_FRAME              38
+#define BLOCKING_QUEUE          39
+#define BLACKHOLE	        40
+#define MVAR_CLEAN	        41
+#define MVAR_DIRTY	        42
+#define ARR_WORDS	        43
+#define MUT_ARR_PTRS_CLEAN      44
+#define MUT_ARR_PTRS_DIRTY      45
+#define MUT_ARR_PTRS_FROZEN0    46
+#define MUT_ARR_PTRS_FROZEN     47
+#define MUT_VAR_CLEAN	        48
+#define MUT_VAR_DIRTY           49
+#define WEAK		        50
+#define PRIM		        51
+#define MUT_PRIM                52
+#define TSO		        53
+#define STACK                   54
+#define TREC_CHUNK              55
+#define ATOMICALLY_FRAME        56
+#define CATCH_RETRY_FRAME       57
+#define CATCH_STM_FRAME         58
+#define WHITEHOLE               59
+#define N_CLOSURE_TYPES         60
 
 #endif /* RTS_STORAGE_CLOSURETYPES_H */
diff --git a/includes/rts/storage/Closures.h b/includes/rts/storage/Closures.h
index 5f4f03541f..fcba1ebeb6 100644
--- a/includes/rts/storage/Closures.h
+++ b/includes/rts/storage/Closures.h
@@ -240,60 +240,6 @@ typedef struct {
 #define BCO_BITMAP_SIZEW(bco) ((BCO_BITMAP_SIZE(bco) + BITS_IN(StgWord) - 1) \
 			        / BITS_IN(StgWord))
 
-/* -----------------------------------------------------------------------------
-   Dynamic stack frames for generic heap checks.
-
-   These generic heap checks are slow, but have the advantage of being
-   usable in a variety of situations.
-
-   The one restriction is that any relevant SRTs must already be pointed
-   to from the stack.  The return address doesn't need to have an info
-   table attached: hence it can be any old code pointer.
-
-   The liveness mask contains a 1 at bit n, if register Rn contains a
-   non-pointer.  The contents of all 8 vanilla registers are always saved
-   on the stack; the liveness mask tells the GC which ones contain
-   pointers.
-
-   Good places to use a generic heap check: 
-
-        - case alternatives (the return address with an SRT is already
-	  on the stack).
-
-	- primitives (no SRT required).
-
-   The stack frame layout for a RET_DYN is like this:
-
-          some pointers         |-- RET_DYN_PTRS(liveness) words
-          some nonpointers      |-- RET_DYN_NONPTRS(liveness) words
-			       
-	  L1                    \
-          D1-2                  |-- RET_DYN_NONPTR_REGS_SIZE words
-	  F1-4                  /
-			       
-	  R1-8                  |-- RET_DYN_BITMAP_SIZE words
-			       
-	  return address        \
-	  liveness mask         |-- StgRetDyn structure
-	  stg_gen_chk_info      /
-
-   we assume that the size of a double is always 2 pointers (wasting a
-   word when it is only one pointer, but avoiding lots of #ifdefs).
-
-   See Liveness.h for the macros (RET_DYN_PTRS() etc.).
-
-   NOTE: if you change the layout of RET_DYN stack frames, then you
-   might also need to adjust the value of RESERVED_STACK_WORDS in
-   Constants.h.
-   -------------------------------------------------------------------------- */
-
-typedef struct {
-    const StgInfoTable* info;
-    StgWord        liveness;
-    StgWord        ret_addr;
-    StgClosure *   payload[FLEXIBLE_ARRAY];
-} StgRetDyn;
-
 /* A function return stack frame: used when saving the state for a
  * garbage collection at a function entry point.  The function
  * arguments are on the stack, and we also save the function (its
@@ -430,7 +376,7 @@ typedef struct {
 
 typedef struct {
   StgHeader      header;
-  StgBool        running_alt_code;
+  StgWord        running_alt_code;
   StgClosure    *first_code;
   StgClosure    *alt_code;
 } StgCatchRetryFrame;
diff --git a/includes/rts/storage/Liveness.h b/includes/rts/storage/Liveness.h
deleted file mode 100644
index 66c82f3134..0000000000
--- a/includes/rts/storage/Liveness.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* -----------------------------------------------------------------------------
- *
- * (c) The University of Glasgow 2004
- *
- * Building liveness masks for RET_DYN stack frames.
- * A few macros that are used in both .cmm and .c sources.
- *
- * A liveness mask is constructed like so:
- *
- *    R1_PTR & R2_PTR & R3_PTR
- *
- * -------------------------------------------------------------------------- */
-
-#ifndef RTS_STORAGE_LIVENESS_H
-#define RTS_STORAGE_LIVENESS_H
-
-#define NO_PTRS   0xff
-#define R1_PTR	  (NO_PTRS ^ (1<<0))
-#define R2_PTR	  (NO_PTRS ^ (1<<1))
-#define R3_PTR	  (NO_PTRS ^ (1<<2))
-#define R4_PTR	  (NO_PTRS ^ (1<<3))
-#define R5_PTR	  (NO_PTRS ^ (1<<4))
-#define R6_PTR	  (NO_PTRS ^ (1<<5))
-#define R7_PTR	  (NO_PTRS ^ (1<<6))
-#define R8_PTR	  (NO_PTRS ^ (1<<7))
-
-#define N_NONPTRS(n)  ((n)<<16)
-#define N_PTRS(n)     ((n)<<24)
-
-#define RET_DYN_NONPTRS(l) ((l)>>16 & 0xff)
-#define RET_DYN_PTRS(l)    ((l)>>24 & 0xff)
-#define RET_DYN_LIVENESS(l) ((l) & 0xffff)
-
-#endif /* RTS_STORAGE_LIVENESS_H */
diff --git a/includes/rts/storage/SMPClosureOps.h b/includes/rts/storage/SMPClosureOps.h
index 8dee7cbcf9..cd6a789af4 100644
--- a/includes/rts/storage/SMPClosureOps.h
+++ b/includes/rts/storage/SMPClosureOps.h
@@ -12,7 +12,7 @@
 #ifdef CMINUSMINUS
 
 #define unlockClosure(ptr,info)                 \
-    prim %write_barrier() [];                   \
+    prim %write_barrier();                      \
     StgHeader_info(ptr) = info;    
 
 #else
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index c93cc319c0..b7b24a8632 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -169,23 +169,6 @@ RTS_RET(stg_noforceIO);
 
 /* standard selector thunks */
 
-RTS_RET(stg_sel_ret_0_upd);
-RTS_RET(stg_sel_ret_1_upd);
-RTS_RET(stg_sel_ret_2_upd);
-RTS_RET(stg_sel_ret_3_upd);
-RTS_RET(stg_sel_ret_4_upd);
-RTS_RET(stg_sel_ret_5_upd);
-RTS_RET(stg_sel_ret_6_upd);
-RTS_RET(stg_sel_ret_7_upd);
-RTS_RET(stg_sel_ret_8_upd);
-RTS_RET(stg_sel_ret_9_upd);
-RTS_RET(stg_sel_ret_10_upd);
-RTS_RET(stg_sel_ret_11_upd);
-RTS_RET(stg_sel_ret_12_upd);
-RTS_RET(stg_sel_ret_13_upd);
-RTS_RET(stg_sel_ret_14_upd);
-RTS_RET(stg_sel_ret_15_upd);
-
 RTS_ENTRY(stg_sel_0_upd);
 RTS_ENTRY(stg_sel_1_upd);
 RTS_ENTRY(stg_sel_2_upd);
@@ -267,45 +250,39 @@ RTS_FUN_DECL(stg_PAP_apply);
 
 /* standard GC & stack check entry points, all defined in HeapStackCheck.hc */
 
-RTS_RET(stg_enter);
+RTS_FUN_DECL(stg_gc_noregs);
+
 RTS_RET(stg_enter_checkbh);
 
-RTS_RET(stg_gc_void);
+RTS_RET(stg_ret_v);
+RTS_RET(stg_ret_p);
+RTS_RET(stg_ret_n);
+RTS_RET(stg_ret_f);
+RTS_RET(stg_ret_d);
+RTS_RET(stg_ret_l);
 
+RTS_FUN_DECL(stg_gc_prim_p);
+RTS_FUN_DECL(stg_gc_prim_pp);
+RTS_FUN_DECL(stg_gc_prim_n);
+
+RTS_RET(stg_enter);
 RTS_FUN_DECL(__stg_gc_enter_1);
-RTS_FUN_DECL(stg_gc_noregs);
 
-RTS_RET(stg_gc_unpt_r1);
 RTS_FUN_DECL(stg_gc_unpt_r1);
-
-RTS_RET(stg_gc_unbx_r1);
 RTS_FUN_DECL(stg_gc_unbx_r1);
-
-RTS_RET(stg_gc_f1);
 RTS_FUN_DECL(stg_gc_f1);
-
-RTS_RET(stg_gc_d1);
 RTS_FUN_DECL(stg_gc_d1);
-
-RTS_RET(stg_gc_l1);
 RTS_FUN_DECL(stg_gc_l1);
+RTS_FUN_DECL(stg_gc_pp);
+RTS_FUN_DECL(stg_gc_ppp);
+RTS_FUN_DECL(stg_gc_pppp);
 
 RTS_RET(stg_gc_fun);
 RTS_FUN_DECL(__stg_gc_fun);
 
-RTS_RET(stg_gc_gen);
-RTS_FUN_DECL(stg_gc_gen);
-
-RTS_RET(stg_ut_1_0_unreg);
-
-RTS_FUN_DECL(stg_gc_gen_hp);
-RTS_FUN_DECL(stg_gc_ut);
-RTS_FUN_DECL(stg_gen_yield);
 RTS_FUN_DECL(stg_yield_noregs);
 RTS_FUN_DECL(stg_yield_to_interpreter);
-RTS_FUN_DECL(stg_gen_block);
 RTS_FUN_DECL(stg_block_noregs);
-RTS_FUN_DECL(stg_block_1);
 RTS_FUN_DECL(stg_block_blackhole);
 RTS_FUN_DECL(stg_block_blackhole_finally);
 RTS_FUN_DECL(stg_block_takemvar);
diff --git a/includes/stg/Regs.h b/includes/stg/Regs.h
index bf17b7e825..70e93d3234 100644
--- a/includes/stg/Regs.h
+++ b/includes/stg/Regs.h
@@ -93,10 +93,10 @@ typedef struct {
 
 /*
  * Registers Hp and HpLim are global across the entire system, and are
- * copied into the RegTable before executing a thread.
+ * copied into the RegTable or registers before executing a thread.
  *
- * Registers Sp and SpLim are saved in the TSO for the
- * thread, but are copied into the RegTable before executing a thread.
+ * Registers Sp and SpLim are saved in the TSO for the thread, but are
+ * copied into the RegTable or registers before executing a thread.
  *
  * All other registers are "general purpose", and are used for passing
  * arguments to functions, and returning values.  The code generator
@@ -116,45 +116,6 @@ typedef struct {
  * (pseudo-)registers in those cases.
  */
 
-/* 
- * Locations for saving per-thread registers.
- */
-
-#define SAVE_Sp    	    (CurrentTSO->sp)
-#define SAVE_SpLim    	    (CurrentTSO->splim)
-
-#define SAVE_Hp	    	    (BaseReg->rHp)
-
-#define SAVE_CurrentTSO     (BaseReg->rCurrentTSO)
-#define SAVE_CurrentNursery (BaseReg->rCurrentNursery)
-#define SAVE_HpAlloc        (BaseReg->rHpAlloc)
-
-/* We sometimes need to save registers across a C-call, eg. if they
- * are clobbered in the standard calling convention.  We define the
- * save locations for all registers in the register table.
- */
-
-#define SAVE_R1             (BaseReg->rR1)
-#define SAVE_R2             (BaseReg->rR2)
-#define SAVE_R3             (BaseReg->rR3)
-#define SAVE_R4             (BaseReg->rR4)
-#define SAVE_R5             (BaseReg->rR5)
-#define SAVE_R6             (BaseReg->rR6)
-#define SAVE_R7             (BaseReg->rR7)
-#define SAVE_R8             (BaseReg->rR8)
-#define SAVE_R9             (BaseReg->rR9)
-#define SAVE_R10            (BaseReg->rR10)
- 
-#define SAVE_F1             (BaseReg->rF1)
-#define SAVE_F2             (BaseReg->rF2)
-#define SAVE_F3             (BaseReg->rF3)
-#define SAVE_F4             (BaseReg->rF4)
-
-#define SAVE_D1             (BaseReg->rD1)
-#define SAVE_D2             (BaseReg->rD2)
-
-#define SAVE_L1             (BaseReg->rL1)
-
 /* -----------------------------------------------------------------------------
  * Emit the GCC-specific register declarations for each machine
  * register being used.  If any STG register isn't mapped to a machine
@@ -163,11 +124,6 @@ typedef struct {
  * First, the general purpose registers.  The idea is, if a particular
  * general-purpose STG register can't be mapped to a real machine
  * register, it won't be used at all.  Instead, we'll use the stack.
- *
- * This is an improvement on the way things used to be done, when all
- * registers were mapped to locations in the register table, and stuff
- * was being shifted from the stack to the register table and back
- * again for no good reason (on register-poor architectures).
  */
 
 /* define NO_REGS to omit register declarations - used in RTS C code
@@ -402,287 +358,6 @@ GLOBAL_REG_DECL(bdescr *,HpAlloc,REG_HpAlloc)
 #define stg_gc_enter_1            (FunReg->stgGCEnter1)
 #define stg_gc_fun                (FunReg->stgGCFun)
 
-/* -----------------------------------------------------------------------------
-   For any registers which are denoted "caller-saves" by the C calling
-   convention, we have to emit code to save and restore them across C
-   calls.
-   -------------------------------------------------------------------------- */
-
-#ifdef CALLER_SAVES_R1
-#define CALLER_SAVE_R1    	SAVE_R1 = R1;
-#define CALLER_RESTORE_R1 	R1 = SAVE_R1;
-#else
-#define CALLER_SAVE_R1      	/* nothing */
-#define CALLER_RESTORE_R1    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R2
-#define CALLER_SAVE_R2    	SAVE_R2 = R2;
-#define CALLER_RESTORE_R2 	R2 = SAVE_R2;
-#else
-#define CALLER_SAVE_R2      	/* nothing */
-#define CALLER_RESTORE_R2    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R3
-#define CALLER_SAVE_R3    	SAVE_R3 = R3;
-#define CALLER_RESTORE_R3 	R3 = SAVE_R3;
-#else
-#define CALLER_SAVE_R3      	/* nothing */
-#define CALLER_RESTORE_R3    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R4
-#define CALLER_SAVE_R4    	SAVE_R4 = R4;
-#define CALLER_RESTORE_R4 	R4 = SAVE_R4;
-#else
-#define CALLER_SAVE_R4      	/* nothing */
-#define CALLER_RESTORE_R4    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R5
-#define CALLER_SAVE_R5    	SAVE_R5 = R5;
-#define CALLER_RESTORE_R5 	R5 = SAVE_R5;
-#else
-#define CALLER_SAVE_R5      	/* nothing */
-#define CALLER_RESTORE_R5    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R6
-#define CALLER_SAVE_R6    	SAVE_R6 = R6;
-#define CALLER_RESTORE_R6 	R6 = SAVE_R6;
-#else
-#define CALLER_SAVE_R6      	/* nothing */
-#define CALLER_RESTORE_R6    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R7
-#define CALLER_SAVE_R7    	SAVE_R7 = R7;
-#define CALLER_RESTORE_R7 	R7 = SAVE_R7;
-#else
-#define CALLER_SAVE_R7      	/* nothing */
-#define CALLER_RESTORE_R7    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R8
-#define CALLER_SAVE_R8    	SAVE_R8 = R8;
-#define CALLER_RESTORE_R8 	R8 = SAVE_R8;
-#else
-#define CALLER_SAVE_R8      	/* nothing */
-#define CALLER_RESTORE_R8    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R9
-#define CALLER_SAVE_R9    	SAVE_R9 = R9;
-#define CALLER_RESTORE_R9 	R9 = SAVE_R9;
-#else
-#define CALLER_SAVE_R9      	/* nothing */
-#define CALLER_RESTORE_R9    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_R10
-#define CALLER_SAVE_R10    	SAVE_R10 = R10;
-#define CALLER_RESTORE_R10 	R10 = SAVE_R10;
-#else
-#define CALLER_SAVE_R10      	/* nothing */
-#define CALLER_RESTORE_R10    	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_F1
-#define CALLER_SAVE_F1    	SAVE_F1 = F1;
-#define CALLER_RESTORE_F1 	F1 = SAVE_F1;
-#else
-#define CALLER_SAVE_F1    	/* nothing */
-#define CALLER_RESTORE_F1 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_F2
-#define CALLER_SAVE_F2    	SAVE_F2 = F2;
-#define CALLER_RESTORE_F2 	F2 = SAVE_F2;
-#else
-#define CALLER_SAVE_F2    	/* nothing */
-#define CALLER_RESTORE_F2 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_F3
-#define CALLER_SAVE_F3    	SAVE_F3 = F3;
-#define CALLER_RESTORE_F3 	F3 = SAVE_F3;
-#else
-#define CALLER_SAVE_F3    	/* nothing */
-#define CALLER_RESTORE_F3 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_F4
-#define CALLER_SAVE_F4    	SAVE_F4 = F4;
-#define CALLER_RESTORE_F4 	F4 = SAVE_F4;
-#else
-#define CALLER_SAVE_F4    	/* nothing */
-#define CALLER_RESTORE_F4 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_D1
-#define CALLER_SAVE_D1    	SAVE_D1 = D1;
-#define CALLER_RESTORE_D1 	D1 = SAVE_D1;
-#else
-#define CALLER_SAVE_D1    	/* nothing */
-#define CALLER_RESTORE_D1 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_D2
-#define CALLER_SAVE_D2    	SAVE_D2 = D2;
-#define CALLER_RESTORE_D2 	D2 = SAVE_D2;
-#else
-#define CALLER_SAVE_D2    	/* nothing */
-#define CALLER_RESTORE_D2 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_L1
-#define CALLER_SAVE_L1    	SAVE_L1 = L1;
-#define CALLER_RESTORE_L1 	L1 = SAVE_L1;
-#else
-#define CALLER_SAVE_L1    	/* nothing */
-#define CALLER_RESTORE_L1 	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_Sp
-#define CALLER_SAVE_Sp	    	SAVE_Sp = Sp;
-#define CALLER_RESTORE_Sp  	Sp = SAVE_Sp;
-#else
-#define CALLER_SAVE_Sp	    	/* nothing */
-#define CALLER_RESTORE_Sp	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_SpLim
-#define CALLER_SAVE_SpLim    	SAVE_SpLim = SpLim;
-#define CALLER_RESTORE_SpLim  	SpLim = SAVE_SpLim;
-#else
-#define CALLER_SAVE_SpLim    	/* nothing */
-#define CALLER_RESTORE_SpLim	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_Hp
-#define CALLER_SAVE_Hp	    	SAVE_Hp = Hp;
-#define CALLER_RESTORE_Hp   	Hp = SAVE_Hp;
-#else
-#define CALLER_SAVE_Hp	    	/* nothing */
-#define CALLER_RESTORE_Hp	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_Base
-#ifdef THREADED_RTS
-#error "Can't have caller-saved BaseReg with THREADED_RTS"
-#endif
-#define CALLER_SAVE_Base	/* nothing */
-#define CALLER_RESTORE_Base	BaseReg = &MainRegTable;
-#else
-#define CALLER_SAVE_Base	/* nothing */
-#define CALLER_RESTORE_Base	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_CurrentTSO
-#define CALLER_SAVE_CurrentTSO   	SAVE_CurrentTSO = CurrentTSO;
-#define CALLER_RESTORE_CurrentTSO	CurrentTSO = SAVE_CurrentTSO;
-#else
-#define CALLER_SAVE_CurrentTSO   	/* nothing */
-#define CALLER_RESTORE_CurrentTSO   	/* nothing */
-#endif
-
-#ifdef CALLER_SAVES_CurrentNursery
-#define CALLER_SAVE_CurrentNursery   	SAVE_CurrentNursery = CurrentNursery;
-#define CALLER_RESTORE_CurrentNursery	CurrentNursery = SAVE_CurrentNursery;
-#else
-#define CALLER_SAVE_CurrentNursery   	/* nothing */
-#define CALLER_RESTORE_CurrentNursery   /* nothing */
-#endif
-
-#ifdef CALLER_SAVES_HpAlloc
-#define CALLER_SAVE_HpAlloc   		SAVE_HpAlloc = HpAlloc;
-#define CALLER_RESTORE_HpAlloc		HpAlloc = SAVE_HpAlloc;
-#else
-#define CALLER_SAVE_HpAlloc   		/* nothing */
-#define CALLER_RESTORE_HpAlloc   	/* nothing */
-#endif
-
 #endif /* IN_STG_CODE */
 
-/* ----------------------------------------------------------------------------
-   Handy bunches of saves/restores 
-   ------------------------------------------------------------------------  */
-
-#if IN_STG_CODE
-
-#define CALLER_SAVE_USER			\
-  CALLER_SAVE_R1				\
-  CALLER_SAVE_R2				\
-  CALLER_SAVE_R3				\
-  CALLER_SAVE_R4				\
-  CALLER_SAVE_R5				\
-  CALLER_SAVE_R6				\
-  CALLER_SAVE_R7				\
-  CALLER_SAVE_R8				\
-  CALLER_SAVE_R9				\
-  CALLER_SAVE_R10				\
-  CALLER_SAVE_F1				\
-  CALLER_SAVE_F2				\
-  CALLER_SAVE_F3				\
-  CALLER_SAVE_F4				\
-  CALLER_SAVE_D1				\
-  CALLER_SAVE_D2				\
-  CALLER_SAVE_L1
-
-     /* Save Base last, since the others may
-	be addressed relative to it */
-#define CALLER_SAVE_SYSTEM			\
-  CALLER_SAVE_Sp				\
-  CALLER_SAVE_SpLim				\
-  CALLER_SAVE_Hp				\
-  CALLER_SAVE_CurrentTSO			\
-  CALLER_SAVE_CurrentNursery			\
-  CALLER_SAVE_Base
-
-#define CALLER_RESTORE_USER			\
-  CALLER_RESTORE_R1				\
-  CALLER_RESTORE_R2				\
-  CALLER_RESTORE_R3				\
-  CALLER_RESTORE_R4				\
-  CALLER_RESTORE_R5				\
-  CALLER_RESTORE_R6				\
-  CALLER_RESTORE_R7				\
-  CALLER_RESTORE_R8				\
-  CALLER_RESTORE_R9				\
-  CALLER_RESTORE_R10			\
-  CALLER_RESTORE_F1				\
-  CALLER_RESTORE_F2				\
-  CALLER_RESTORE_F3				\
-  CALLER_RESTORE_F4				\
-  CALLER_RESTORE_D1				\
-  CALLER_RESTORE_D2				\
-  CALLER_RESTORE_L1
-
-     /* Restore Base first, since the others may
-	be addressed relative to it */
-#define CALLER_RESTORE_SYSTEM			\
-  CALLER_RESTORE_Base				\
-  CALLER_RESTORE_Sp				\
-  CALLER_RESTORE_SpLim				\
-  CALLER_RESTORE_Hp				\
-  CALLER_RESTORE_CurrentTSO			\
-  CALLER_RESTORE_CurrentNursery
-
-#else /* not IN_STG_CODE */
-
-#define CALLER_SAVE_USER       /* nothing */
-#define CALLER_SAVE_SYSTEM     /* nothing */
-#define CALLER_RESTORE_USER    /* nothing */
-#define CALLER_RESTORE_SYSTEM  /* nothing */
-
-#endif /* IN_STG_CODE */
-#define CALLER_SAVE_ALL				\
-  CALLER_SAVE_SYSTEM				\
-  CALLER_SAVE_USER
-
-#define CALLER_RESTORE_ALL			\
-  CALLER_RESTORE_SYSTEM				\
-  CALLER_RESTORE_USER
-
 #endif /* REGS_H */
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index a2d4a7e123..b89abeaff2 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -21,18 +21,16 @@
 
 STRING(stg_ap_0_ret_str,"stg_ap_0_ret... ")
 
-stg_ap_0_fast
+stg_ap_0_fast ( P_ fun )
 { 
-    // fn is in R1, no args on the stack
-
     IF_DEBUG(apply,
-	foreign "C" debugBelch(stg_ap_0_ret_str) [R1];
-	foreign "C" printClosure(R1 "ptr") [R1]);
+        ccall debugBelch(stg_ap_0_ret_str);
+        ccall printClosure(R1 "ptr"));
 
     IF_DEBUG(sanity,
-	foreign "C" checkStackFrame(Sp "ptr") [R1]);
+        ccall checkStackFrame(Sp "ptr"));
 
-    ENTER();
+    ENTER(fun);
 }
 
 /* -----------------------------------------------------------------------------
@@ -56,9 +54,9 @@ stg_ap_0_fast
    -------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_PAP,/*special layout*/0,0,PAP,"PAP","PAP")
-{  foreign "C" barf("PAP object entered!") never returns; }
+{  ccall barf("PAP object entered!") never returns; }
     
-stg_PAP_apply
+stg_PAP_apply /* no args => explicit stack */
 {
   W_ Words;
   W_ pap;
@@ -78,7 +76,7 @@ stg_PAP_apply
       // this before calling stg_PAP_entry.
       Sp_adj(-1); 
       Sp(0) = R2;
-      jump stg_gc_unpt_r1;
+      jump stg_gc_unpt_r1 [R1];
   }
   Sp_adj(-Words);
 
@@ -86,7 +84,7 @@ stg_PAP_apply
   TICK_ENT_PAP();
   LDV_ENTER(pap);
 #ifdef PROFILING
-  foreign "C" enterFunCCS(BaseReg "ptr", StgHeader_ccs(pap) "ptr");
+  ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(pap) "ptr");
 #endif
 
   // Reload the stack 
@@ -122,26 +120,26 @@ for:
   TICK_ENT_VIA_NODE();
 
 #ifdef NO_ARG_REGS
-  jump %GET_ENTRY(UNTAG(R1));
+  jump %GET_ENTRY(UNTAG(R1)) [R1];
 #else
       W_ info;
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
       if (type == ARG_GEN) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_GEN_BIG) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_BCO) {
 	  Sp_adj(-2);
 	  Sp(1) = R1;
 	  Sp(0) = stg_apply_interp_info;
-	  jump stg_yield_to_interpreter;
+          jump stg_yield_to_interpreter [];
       }
       jump W_[stg_ap_stack_entries + 
-		WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+                WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
 #endif
 }
 
@@ -155,6 +153,7 @@ for:
    -------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_AP,/*special layout*/0,0,AP,"AP","AP")
+ /* no args => explicit stack */
 {
   W_ Words;
   W_ ap;
@@ -164,12 +163,12 @@ INFO_TABLE(stg_AP,/*special layout*/0,0,AP,"AP","AP")
   Words = TO_W_(StgAP_n_args(ap));
 
   /* 
-   * Check for stack overflow.  IMPORTANT: use a _NP check here,
+   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
    * because if the check fails, we might end up blackholing this very
    * closure, in which case we must enter the blackhole on return rather
    * than continuing to evaluate the now-defunct closure.
    */
-  STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame);
+  STK_CHK_ENTER(WDS(Words) + SIZEOF_StgUpdateFrame, R1);
 
   PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
   Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);
@@ -197,26 +196,26 @@ for:
   TICK_ENT_VIA_NODE();
 
 #ifdef NO_ARG_REGS
-  jump %GET_ENTRY(UNTAG(R1));
+  jump %GET_ENTRY(UNTAG(R1)) [R1];
 #else
       W_ info;
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
       if (type == ARG_GEN) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_GEN_BIG) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_BCO) {
 	  Sp_adj(-2);
 	  Sp(1) = R1;
 	  Sp(0) = stg_apply_interp_info;
-	  jump stg_yield_to_interpreter;
+          jump stg_yield_to_interpreter [];
       }
       jump W_[stg_ap_stack_entries + 
-		WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+                WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
 #endif
 }
 
@@ -225,6 +224,7 @@ for:
    those generated by the byte-code compiler for inserting breakpoints. */
 
 INFO_TABLE(stg_AP_NOUPD,/*special layout*/0,0,AP,"AP_NOUPD","AP_NOUPD")
+   /* no args => explicit stack */
 {
   W_ Words;
   W_ ap;
@@ -234,12 +234,12 @@ INFO_TABLE(stg_AP_NOUPD,/*special layout*/0,0,AP,"AP_NOUPD","AP_NOUPD")
   Words = TO_W_(StgAP_n_args(ap));
 
   /* 
-   * Check for stack overflow.  IMPORTANT: use a _NP check here,
+   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
    * because if the check fails, we might end up blackholing this very
    * closure, in which case we must enter the blackhole on return rather
    * than continuing to evaluate the now-defunct closure.
    */
-  STK_CHK_NP(WDS(Words));
+  STK_CHK_ENTER(WDS(Words), R1);
   Sp = Sp - WDS(Words);
 
   TICK_ENT_AP();
@@ -265,26 +265,26 @@ for:
   TICK_ENT_VIA_NODE();
 
 #ifdef NO_ARG_REGS
-  jump %GET_ENTRY(UNTAG(R1));
+  jump %GET_ENTRY(UNTAG(R1)) [R1];
 #else
       W_ info;
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
       if (type == ARG_GEN) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_GEN_BIG) {
-	  jump StgFunInfoExtra_slow_apply(info);
+          jump StgFunInfoExtra_slow_apply(info) [R1];
       }
       if (type == ARG_BCO) {
 	  Sp_adj(-2);
 	  Sp(1) = R1;
 	  Sp(0) = stg_apply_interp_info;
-	  jump stg_yield_to_interpreter;
+          jump stg_yield_to_interpreter [];
       }
       jump W_[stg_ap_stack_entries + 
-		WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+                WDS(TO_W_(StgFunInfoExtra_fun_type(info)))] [R1];
 #endif
 }
 
@@ -300,6 +300,7 @@ for:
    -------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
+  /* no args => explicit stack */
 {
   W_ Words;
   W_ ap;
@@ -309,12 +310,12 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
   Words = StgAP_STACK_size(ap);
 
   /* 
-   * Check for stack overflow.  IMPORTANT: use a _NP check here,
+   * Check for stack overflow.  IMPORTANT: use a _ENTER check here,
    * because if the check fails, we might end up blackholing this very
    * closure, in which case we must enter the blackhole on return rather
    * than continuing to evaluate the now-defunct closure.
    */
-  STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame + WDS(AP_STACK_SPLIM));
+  STK_CHK_ENTER(WDS(Words) + SIZEOF_StgUpdateFrame + WDS(AP_STACK_SPLIM), R1);
   /* ensure there is at least AP_STACK_SPLIM words of headroom available
    * after unpacking the AP_STACK. See bug #1466 */
 
@@ -343,7 +344,7 @@ for:
 
   R1 = StgAP_STACK_fun(ap);
 
-  ENTER();
+  ENTER_R1();
 }
 
 /* -----------------------------------------------------------------------------
@@ -352,6 +353,7 @@ for:
 
 INFO_TABLE(stg_AP_STACK_NOUPD,/*special layout*/0,0,AP_STACK,
                                         "AP_STACK_NOUPD","AP_STACK_NOUPD")
+   /* no args => explicit stack */
 {
   W_ Words;
   W_ ap;
@@ -366,7 +368,7 @@ INFO_TABLE(stg_AP_STACK_NOUPD,/*special layout*/0,0,AP_STACK,
    * closure, in which case we must enter the blackhole on return rather
    * than continuing to evaluate the now-defunct closure.
    */
-  STK_CHK_NP(WDS(Words) + WDS(AP_STACK_SPLIM));
+  STK_CHK_ENTER(WDS(Words) + WDS(AP_STACK_SPLIM), R1);
   /* ensure there is at least AP_STACK_SPLIM words of headroom available
    * after unpacking the AP_STACK. See bug #1466 */
 
@@ -394,5 +396,5 @@ for:
 
   R1 = StgAP_STACK_fun(ap);
 
-  ENTER();
+  ENTER_R1();
 }
diff --git a/rts/AutoApply.h b/rts/AutoApply.h
index d0c5c3fe6b..ebb7308875 100644
--- a/rts/AutoApply.h
+++ b/rts/AutoApply.h
@@ -35,7 +35,7 @@
     }							\
     R1 = pap;						\
     Sp_adj(1 + n);					\
-    jump %ENTRY_CODE(Sp(0));
+    jump %ENTRY_CODE(Sp(0)) [R1];
 
 // Copy the old PAP, build a new one with the extra arg(s)
 // ret addr and m arguments taking up n words are on the stack.
@@ -74,7 +74,7 @@
      }								\
      R1 = new_pap;						\
      Sp_adj(n+1);						\
-     jump %ENTRY_CODE(Sp(0));
+     jump %ENTRY_CODE(Sp(0)) [R1];
 
 // Jump to target, saving CCCS and restoring it on return
 #if defined(PROFILING)
@@ -82,9 +82,9 @@
     Sp(-1) = CCCS;                              \
     Sp(-2) = stg_restore_cccs_info;             \
     Sp_adj(-2);                                 \
-    jump (target)
+    jump (target) [*]
 #else
-#define jump_SAVE_CCCS(target) jump (target)
+#define jump_SAVE_CCCS(target) jump (target) [*]
 #endif
 
 #endif /* APPLY_H */
diff --git a/rts/ClosureFlags.c b/rts/ClosureFlags.c
index 0ab8b45669..a2a140282f 100644
--- a/rts/ClosureFlags.c
+++ b/rts/ClosureFlags.c
@@ -55,8 +55,7 @@ StgWord16 closure_flags[] = {
  [RET_BCO]              =  ( 0                                         ),
  [RET_SMALL]		=  (     _BTM|                       _SRT      ),
  [RET_BIG]		=  (                                 _SRT      ),
- [RET_DYN]		=  (                                 _SRT      ),
- [RET_FUN]		=  ( 0                                         ),
+ [RET_FUN]              =  ( 0                                         ),
  [UPDATE_FRAME]        	=  (     _BTM                                  ),
  [CATCH_FRAME]	   	=  (     _BTM                                  ),
  [UNDERFLOW_FRAME]      =  (     _BTM                                  ),
@@ -84,6 +83,6 @@ StgWord16 closure_flags[] = {
  [WHITEHOLE]		=  ( 0                                         )
 };
 
-#if N_CLOSURE_TYPES != 61
+#if N_CLOSURE_TYPES != 60
 #error Closure types changed: update ClosureFlags.c!
 #endif
diff --git a/rts/Exception.cmm b/rts/Exception.cmm
index 78907c4ba7..8a9f4e62c9 100644
--- a/rts/Exception.cmm
+++ b/rts/Exception.cmm
@@ -50,7 +50,8 @@ import ghczmprim_GHCziTypes_True_closure;
    -------------------------------------------------------------------------- */
 
 
-INFO_TABLE_RET(stg_unmaskAsyncExceptionszh_ret, RET_SMALL)
+INFO_TABLE_RET(stg_unmaskAsyncExceptionszh_ret, RET_SMALL, W_ info_ptr)
+    /* explicit stack */
 {
     CInt r;
 
@@ -60,7 +61,7 @@ INFO_TABLE_RET(stg_unmaskAsyncExceptionszh_ret, RET_SMALL)
     /* Eagerly raise a blocked exception, if there is one */
     if (StgTSO_blocked_exceptions(CurrentTSO) != END_TSO_QUEUE) {
 
-        STK_CHK_GEN( WDS(2), R1_PTR, stg_unmaskAsyncExceptionszh_ret_info);
+        STK_CHK_P (WDS(2), stg_unmaskAsyncExceptionszh_ret_info, R1);
         /* 
          * We have to be very careful here, as in killThread#, since
          * we are about to raise an async exception in the current
@@ -68,18 +69,18 @@ INFO_TABLE_RET(stg_unmaskAsyncExceptionszh_ret, RET_SMALL)
          */
         Sp_adj(-2);
         Sp(1) = R1;
-        Sp(0) = stg_gc_unpt_r1_info;
+        Sp(0) = stg_ret_p_info;
         SAVE_THREAD_STATE();
-        (r) = foreign "C" maybePerformBlockedException (MyCapability() "ptr", 
-    					              CurrentTSO "ptr") [R1];
+        (r) = ccall maybePerformBlockedException (MyCapability() "ptr", 
+                                                      CurrentTSO "ptr");
 
         if (r != 0::CInt) {
             if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
-                jump stg_threadFinished;
+                jump stg_threadFinished [];
             } else {
                 LOAD_THREAD_STATE();
                 ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
-                jump %ENTRY_CODE(Sp(0));
+                jump %ENTRY_CODE(Sp(0)) [R1];
             }
         }
         else {
@@ -93,10 +94,11 @@ INFO_TABLE_RET(stg_unmaskAsyncExceptionszh_ret, RET_SMALL)
     }
 
     Sp_adj(1);
-    jump %ENTRY_CODE(Sp(0));
+    jump %ENTRY_CODE(Sp(0)) [R1];
 }
 
-INFO_TABLE_RET(stg_maskAsyncExceptionszh_ret, RET_SMALL)
+INFO_TABLE_RET(stg_maskAsyncExceptionszh_ret, RET_SMALL, W_ info_ptr)
+    return (P_ ret)
 {
     StgTSO_flags(CurrentTSO) = 
        %lobits32(
@@ -104,11 +106,11 @@ INFO_TABLE_RET(stg_maskAsyncExceptionszh_ret, RET_SMALL)
           | TSO_BLOCKEX | TSO_INTERRUPTIBLE
       );
 
-    Sp_adj(1);
-    jump %ENTRY_CODE(Sp(0));
+    return (ret);
 }
 
-INFO_TABLE_RET(stg_maskUninterruptiblezh_ret, RET_SMALL)
+INFO_TABLE_RET(stg_maskUninterruptiblezh_ret, RET_SMALL, W_ info_ptr)
+    return (P_ ret)
 {
     StgTSO_flags(CurrentTSO) = 
        %lobits32(
@@ -117,14 +119,13 @@ INFO_TABLE_RET(stg_maskUninterruptiblezh_ret, RET_SMALL)
           & ~TSO_INTERRUPTIBLE
        );
 
-    Sp_adj(1);
-    jump %ENTRY_CODE(Sp(0));
+    return (ret);
 }
 
-stg_maskAsyncExceptionszh
+stg_maskAsyncExceptionszh /* explicit stack */
 {
     /* Args: R1 :: IO a */
-    STK_CHK_GEN( WDS(1)/* worst case */, R1_PTR, stg_maskAsyncExceptionszh);
+    STK_CHK_P( WDS(1)/* worst case */, stg_maskAsyncExceptionszh, R1);
 
     if ((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_BLOCKEX) == 0) {
         /* avoid growing the stack unnecessarily */
@@ -146,13 +147,13 @@ stg_maskAsyncExceptionszh
 
     TICK_UNKNOWN_CALL();
     TICK_SLOW_CALL_v();
-    jump stg_ap_v_fast;
+    jump stg_ap_v_fast [R1];
 }
 
-stg_maskUninterruptiblezh
+stg_maskUninterruptiblezh /* explicit stack */
 {
     /* Args: R1 :: IO a */
-    STK_CHK_GEN( WDS(1)/* worst case */, R1_PTR, stg_maskAsyncExceptionszh);
+    STK_CHK_P( WDS(1)/* worst case */, stg_maskAsyncExceptionszh, R1);
 
     if ((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_BLOCKEX) == 0) {
         /* avoid growing the stack unnecessarily */
@@ -174,16 +175,16 @@ stg_maskUninterruptiblezh
 
     TICK_UNKNOWN_CALL();
     TICK_SLOW_CALL_v();
-    jump stg_ap_v_fast;
+    jump stg_ap_v_fast [R1];
 }
 
-stg_unmaskAsyncExceptionszh
+stg_unmaskAsyncExceptionszh /* explicit stack */
 {
     CInt r;
     W_ level;
 
     /* Args: R1 :: IO a */
-    STK_CHK_GEN( WDS(4), R1_PTR, stg_unmaskAsyncExceptionszh);
+    STK_CHK_P (WDS(4), stg_unmaskAsyncExceptionszh, R1);
     /* 4 words: one for the unblock frame, 3 for setting up the
      * stack to call maybePerformBlockedException() below.
      */
@@ -225,16 +226,16 @@ stg_unmaskAsyncExceptionszh
             Sp(0) = stg_enter_info;
 
             SAVE_THREAD_STATE();
-            (r) = foreign "C" maybePerformBlockedException (MyCapability() "ptr", 
-						      CurrentTSO "ptr") [R1];
+            (r) = ccall maybePerformBlockedException (MyCapability() "ptr", 
+                                                      CurrentTSO "ptr");
 
             if (r != 0::CInt) {
                 if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
-                    jump stg_threadFinished;
+                    jump stg_threadFinished [];
 	        } else {
 	            LOAD_THREAD_STATE();
 	            ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
-	            jump %ENTRY_CODE(Sp(0));
+                    jump %ENTRY_CODE(Sp(0)) [R1];
 	        }
             } else {
                 /* we'll just call R1 directly, below */
@@ -245,11 +246,11 @@ stg_unmaskAsyncExceptionszh
     }
     TICK_UNKNOWN_CALL();
     TICK_SLOW_CALL_v();
-    jump stg_ap_v_fast;
+    jump stg_ap_v_fast [R1];
 }
 
 
-stg_getMaskingStatezh
+stg_getMaskingStatezh ()
 {
     /* args: none */
     /* 
@@ -257,25 +258,18 @@ stg_getMaskingStatezh
                 1 == masked, non-interruptible,
                 2 == masked, interruptible
     */
-    RET_N(((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_BLOCKEX) != 0) +
-          ((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_INTERRUPTIBLE) != 0));
+    return (((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_BLOCKEX) != 0) +
+            ((TO_W_(StgTSO_flags(CurrentTSO)) & TSO_INTERRUPTIBLE) != 0));
 }
 
-stg_killThreadzh
+stg_killThreadzh (P_ target, P_ exception)
 {
-    /* args: R1 = TSO to kill, R2 = Exception */
-
     W_ why_blocked;
-    W_ target;
-    W_ exception;
-    
-    target = R1;
-    exception = R2;
-    
+
     /* Needs 3 words because throwToSingleThreaded uses some stack */
-    STK_CHK_GEN( WDS(3), R1_PTR & R2_PTR, stg_killThreadzh);
+    STK_CHK_PP (WDS(3), stg_killThreadzh, target, exception);
     /* We call allocate in throwTo(), so better check for GC */
-    MAYBE_GC(R1_PTR & R2_PTR, stg_killThreadzh);
+    MAYBE_GC_PP (stg_killThreadzh, target, exception);
 
     /* 
      * We might have killed ourselves.  In which case, better be *very*
@@ -292,58 +286,75 @@ stg_killThreadzh
          * happens: on resumption, we will just jump to the next frame on
          * the stack, which is the return point for stg_killThreadzh.
          */
-	SAVE_THREAD_STATE();
-	/* ToDo: what if the current thread is blocking exceptions? */
-	foreign "C" throwToSingleThreaded(MyCapability() "ptr", 
-					  target "ptr", exception "ptr")[R1,R2];
-	if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
-            jump stg_threadFinished;
-	} else {
-	    LOAD_THREAD_STATE();
-	    ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
-	    jump %ENTRY_CODE(Sp(0));
-	}
+        R1 = target;
+        R2 = exception;
+        jump stg_killMyself [R1,R2];
     } else {
-	W_ out;
-	W_ msg;
-	out = Sp - WDS(1); /* ok to re-use stack space here */
+        W_ msg;
 
-	(msg) = foreign "C" throwTo(MyCapability() "ptr",
+	(msg) = ccall throwTo(MyCapability() "ptr",
                                     CurrentTSO "ptr",
                                     target "ptr",
-                                    exception "ptr") [R1,R2];
+                                    exception "ptr");
 	
         if (msg == NULL) {
-	    jump %ENTRY_CODE(Sp(0));
-	} else {
+            return ();
+        } else {
             StgTSO_why_blocked(CurrentTSO) = BlockedOnMsgThrowTo;
             StgTSO_block_info(CurrentTSO) = msg;
 	    // we must block, and unlock the message before returning
-	    jump stg_block_throwto;
+            jump stg_block_throwto (target, exception);
 	}
     }
 }
 
+/*
+ * We must switch into low-level Cmm in order to raise an exception in
+ * the current thread, hence this is in a separate proc with arguments
+ * passed explicitly in R1 and R2.
+ */
+stg_killMyself
+{
+    P_ target, exception;
+    target = R1;
+    exception = R2;
+
+    SAVE_THREAD_STATE();
+    /* ToDo: what if the current thread is blocking exceptions? */
+    ccall throwToSingleThreaded(MyCapability() "ptr", 
+                                target "ptr", exception "ptr");
+    if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
+        jump stg_threadFinished [];
+    } else {
+        LOAD_THREAD_STATE();
+        ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+        jump %ENTRY_CODE(Sp(0)) [];
+    }
+}
+
 /* -----------------------------------------------------------------------------
    Catch frames
    -------------------------------------------------------------------------- */
 
-#define SP_OFF 0
-
 /* Catch frames are very similar to update frames, but when entering
  * one we just pop the frame off the stack and perform the correct
  * kind of return to the activation record underneath us on the stack.
  */
 
+#define CATCH_FRAME_FIELDS(w_,p_,info_ptr,exceptions_blocked,handler)   \
+  w_ info_ptr,                                                          \
+  PROF_HDR_FIELDS(w_)                                                   \
+  w_ exceptions_blocked,                                                \
+  p_ handler
+
+
 INFO_TABLE_RET(stg_catch_frame, CATCH_FRAME,
-#if defined(PROFILING)
-  W_ unused1, W_ unused2,
-#endif
-  W_ unused3, P_ unused4)
-   {
-      Sp = Sp + SIZEOF_StgCatchFrame;
-      jump %ENTRY_CODE(Sp(SP_OFF));
-   }
+               CATCH_FRAME_FIELDS(W_,P_,info_ptr,
+                                  exceptions_blocked,handler))
+    return (P_ ret)
+{
+    return (ret);
+}
 
 /* -----------------------------------------------------------------------------
  * The catch infotable
@@ -356,30 +367,30 @@ INFO_TABLE_RET(stg_catch_frame, CATCH_FRAME,
  * -------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_catch,2,0,FUN,"catch","catch")
+    (P_ node)
 {
-  R2 = StgClosure_payload(R1,1); /* h */
-  R1 = StgClosure_payload(R1,0); /* x */
-  jump stg_catchzh;
+    jump stg_catchzh(StgClosure_payload(node,0),StgClosure_payload(node,1));
 }
 
-stg_catchzh
+stg_catchzh ( P_ io,      /* :: IO a */
+              P_ handler  /* :: Exception -> IO a */ )
 {
-    /* args: R1 = m :: IO a, R2 = handler :: Exception -> IO a */
-    STK_CHK_GEN(SIZEOF_StgCatchFrame + WDS(1), R1_PTR & R2_PTR, stg_catchzh);
+    W_ exceptions_blocked;
+
+    STK_CHK_GEN();
   
-    /* Set up the catch frame */
-    Sp = Sp - SIZEOF_StgCatchFrame;
-    SET_HDR(Sp,stg_catch_frame_info,CCCS);
-    
-    StgCatchFrame_handler(Sp) = R2;
-    StgCatchFrame_exceptions_blocked(Sp) = 
+    exceptions_blocked =
         TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE);
     TICK_CATCHF_PUSHED();
 
     /* Apply R1 to the realworld token */
     TICK_UNKNOWN_CALL();
     TICK_SLOW_CALL_v();
-    jump stg_ap_v_fast;
+
+    jump stg_ap_v_fast
+        (CATCH_FRAME_FIELDS(,,stg_catch_frame_info,
+                            exceptions_blocked, handler))
+        (io);
 }
 
 /* -----------------------------------------------------------------------------
@@ -394,28 +405,33 @@ stg_catchzh
 
 INFO_TABLE(stg_raise,1,0,THUNK_1_0,"raise","raise")
 {
-  R1 = StgThunk_payload(R1,0);
-  jump stg_raisezh;
+    jump stg_raisezh(StgThunk_payload(R1,0));
 }
 
 section "data" {
   no_break_on_exception: W_[1];
 }
 
-INFO_TABLE_RET(stg_raise_ret, RET_SMALL, P_ arg1)
+INFO_TABLE_RET(stg_raise_ret, RET_SMALL, W_ info_ptr, P_ exception)
+    return (P_ ret)
 {
-  R1 = Sp(1);
-  Sp = Sp + WDS(2);
-  W_[no_break_on_exception] = 1;  
-  jump stg_raisezh;
+    W_[no_break_on_exception] = 1;
+    jump stg_raisezh (exception);
 }
 
-stg_raisezh
+stg_raisezh /* explicit stack */
+/*
+ * args : R1 :: Exception
+ *
+ * Here we assume that the NativeNodeCall convention always puts the
+ * first argument in R1 (which it does).  We cannot use high-level cmm
+ * due to all the LOAD_THREAD_STATE()/SAVE_THREAD_STATE() and stack
+ * walking that happens in here.
+ */
 {
     W_ handler;
     W_ frame_type;
     W_ exception;
-    /* args : R1 :: Exception */
 
    exception = R1;
 
@@ -427,16 +443,16 @@ stg_raisezh
      */
     if (RtsFlags_ProfFlags_showCCSOnException(RtsFlags) != 0::I32) {
         SAVE_THREAD_STATE();
-        foreign "C" fprintCCS_stderr(CCCS "ptr",
+        ccall fprintCCS_stderr(CCCS "ptr",
                                      exception "ptr",
-                                     CurrentTSO "ptr") [];
+                                     CurrentTSO "ptr");
         LOAD_THREAD_STATE();
     }
 #endif
     
 retry_pop_stack:
     SAVE_THREAD_STATE();
-    (frame_type) = foreign "C" raiseExceptionHelper(BaseReg "ptr", CurrentTSO "ptr", exception "ptr") [];
+    (frame_type) = ccall raiseExceptionHelper(BaseReg "ptr", CurrentTSO "ptr", exception "ptr");
     LOAD_THREAD_STATE();
     if (frame_type == ATOMICALLY_FRAME) {
       /* The exception has reached the edge of a memory transaction.  Check that 
@@ -450,14 +466,14 @@ retry_pop_stack:
       W_ trec, outer;
       W_ r;
       trec = StgTSO_trec(CurrentTSO);
-      (r) = foreign "C" stmValidateNestOfTransactions(trec "ptr") [];
+      (r) = ccall stmValidateNestOfTransactions(trec "ptr");
       outer  = StgTRecHeader_enclosing_trec(trec);
-      foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
-      foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
+      ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+      ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
 
       if (outer != NO_TREC) {
-        foreign "C" stmAbortTransaction(MyCapability() "ptr", outer "ptr") [];
-        foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", outer "ptr") [];
+        ccall stmAbortTransaction(MyCapability() "ptr", outer "ptr");
+        ccall stmFreeAbortedTRec(MyCapability() "ptr", outer "ptr");
       }
 
       StgTSO_trec(CurrentTSO) = NO_TREC;
@@ -468,10 +484,10 @@ retry_pop_stack:
       } else {
         // Transaction was not valid: we retry the exception (otherwise continue
         // with a further call to raiseExceptionHelper)
-        ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+        ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr");
         StgTSO_trec(CurrentTSO) = trec;
         R1 = StgAtomicallyFrame_code(Sp);
-        jump stg_ap_v_fast;
+        jump stg_ap_v_fast [R1];
       }          
     }
 
@@ -492,7 +508,7 @@ retry_pop_stack:
             // for exmplae.  Perhaps the stop_on_exception flag should
             // be per-thread.
             CInt[rts_stop_on_exception] = 0;
-            ("ptr" ioAction) = foreign "C" deRefStablePtr (W_[rts_breakpoint_io_action] "ptr") [];
+            ("ptr" ioAction) = ccall deRefStablePtr (W_[rts_breakpoint_io_action] "ptr");
             Sp = Sp - WDS(6);
             Sp(5) = exception;
             Sp(4) = stg_raise_ret_info;
@@ -500,7 +516,7 @@ retry_pop_stack:
             Sp(2) = ghczmprim_GHCziTypes_True_closure; // dummy breakpoint info
             Sp(1) = ghczmprim_GHCziTypes_True_closure; // True <=> a breakpoint
             R1 = ioAction;
-            jump RET_LBL(stg_ap_pppv);
+            jump RET_LBL(stg_ap_pppv) [R1];
         }
     }
 
@@ -519,11 +535,12 @@ retry_pop_stack:
 	StgTSO_what_next(CurrentTSO) = ThreadKilled::I16;
 	SAVE_THREAD_STATE();	/* inline! */
 
-        jump stg_threadFinished;
+        jump stg_threadFinished [];
     }
 
-    /* Ok, Sp points to the enclosing CATCH_FRAME or CATCH_STM_FRAME.  Pop everything
-     * down to and including this frame, update Su, push R1, and enter the handler.
+    /* Ok, Sp points to the enclosing CATCH_FRAME or CATCH_STM_FRAME.
+     * Pop everything down to and including this frame, update Su,
+     * push R1, and enter the handler.
      */
     if (frame_type == CATCH_FRAME) {
       handler = StgCatchFrame_handler(Sp);
@@ -572,8 +589,8 @@ retry_pop_stack:
       W_ trec, outer;
       trec = StgTSO_trec(CurrentTSO);
       outer  = StgTRecHeader_enclosing_trec(trec);
-      foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
-      foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
+      ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+      ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
       StgTSO_trec(CurrentTSO) = outer;
       Sp = Sp + SIZEOF_StgCatchSTMFrame;
     }
@@ -587,11 +604,10 @@ retry_pop_stack:
     Sp_adj(-1);
     TICK_UNKNOWN_CALL();
     TICK_SLOW_CALL_pv();
-    jump RET_LBL(stg_ap_pv);
+    jump RET_LBL(stg_ap_pv) [R1];
 }
 
-stg_raiseIOzh
+stg_raiseIOzh (P_ exception)
 {
-  /* Args :: R1 :: Exception */
-  jump stg_raisezh;
+    jump stg_raisezh (exception);
 }
diff --git a/rts/HeapStackCheck.cmm b/rts/HeapStackCheck.cmm
index 90691fa091..08adf45b02 100644
--- a/rts/HeapStackCheck.cmm
+++ b/rts/HeapStackCheck.cmm
@@ -11,6 +11,7 @@
  * ---------------------------------------------------------------------------*/
 
 #include "Cmm.h"
+#include "Updates.h"
 
 #ifdef __PIC__
 import pthread_mutex_unlock;
@@ -81,58 +82,66 @@ import LeaveCriticalSection;
  * ThreadRunGHC thread.
  */
 
-#define GC_GENERIC                                                      \
-    DEBUG_ONLY(foreign "C" heapCheckFail());                            \
-    if (Hp > HpLim) {                                                   \
-        Hp = Hp - HpAlloc/*in bytes*/;                                  \
-        if (HpLim == 0) {                                               \
-                R1 = ThreadYielding;                                    \
-                goto sched;                                             \
-        }                                                               \
-        if (HpAlloc <= BLOCK_SIZE                                       \
-            && bdescr_link(CurrentNursery) != NULL) {                   \
-            HpAlloc = 0;                                                \
-            CLOSE_NURSERY();                                            \
-            CurrentNursery = bdescr_link(CurrentNursery);               \
-            OPEN_NURSERY();                                             \
-            if (Capability_context_switch(MyCapability()) != 0 :: CInt ||  \
-                Capability_interrupt(MyCapability())      != 0 :: CInt) { \
-                R1 = ThreadYielding;                                    \
-                goto sched;                                             \
-            } else {                                                    \
-                jump %ENTRY_CODE(Sp(0));                                \
-            }                                                           \
-        } else {                                                        \
-            R1 = HeapOverflow;                                          \
-            goto sched;                                                 \
-        }                                                               \
-    } else {                                                            \
-        R1 = StackOverflow;                                             \
-    }                                                                   \
-  sched:                                                                \
-    PRE_RETURN(R1,ThreadRunGHC);                                        \
-    jump stg_returnToSched;
+stg_gc_noregs
+{
+    W_ ret;
+
+    DEBUG_ONLY(foreign "C" heapCheckFail());
+    if (Hp > HpLim) {
+        Hp = Hp - HpAlloc/*in bytes*/;
+        if (HpLim == 0) {
+                ret = ThreadYielding;
+                goto sched;
+        }
+        if (HpAlloc <= BLOCK_SIZE
+            && bdescr_link(CurrentNursery) != NULL) {
+            HpAlloc = 0;
+            CLOSE_NURSERY();
+            CurrentNursery = bdescr_link(CurrentNursery);
+            OPEN_NURSERY();
+            if (Capability_context_switch(MyCapability()) != 0 :: CInt ||
+                Capability_interrupt(MyCapability())      != 0 :: CInt) {
+                ret = ThreadYielding;
+                goto sched;
+            } else {
+                jump %ENTRY_CODE(Sp(0)) [];
+            }
+        } else {
+            ret = HeapOverflow;
+            goto sched;
+        }
+    } else {
+        if (CHECK_GC()) {
+            ret = HeapOverflow;
+        } else {
+            ret = StackOverflow;
+        }
+    }
+  sched:
+    PRE_RETURN(ret,ThreadRunGHC);
+    jump stg_returnToSched [R1];
+}
 
 #define HP_GENERIC				\
-   PRE_RETURN(HeapOverflow, ThreadRunGHC)	\
-  jump stg_returnToSched;
+    PRE_RETURN(HeapOverflow, ThreadRunGHC)      \
+    jump stg_returnToSched [R1];
 
 #define BLOCK_GENERIC				\
-   PRE_RETURN(ThreadBlocked,  ThreadRunGHC)	\
-  jump stg_returnToSched;
+    PRE_RETURN(ThreadBlocked,  ThreadRunGHC)    \
+    jump stg_returnToSched [R1];
 
 #define YIELD_GENERIC				\
-  PRE_RETURN(ThreadYielding, ThreadRunGHC)	\
-  jump stg_returnToSched;
+    PRE_RETURN(ThreadYielding, ThreadRunGHC)    \
+    jump stg_returnToSched [R1];
 
 #define BLOCK_BUT_FIRST(c)			\
-  PRE_RETURN(ThreadBlocked, ThreadRunGHC)	\
-  R2 = c;					\
-  jump stg_returnToSchedButFirst;
+    PRE_RETURN(ThreadBlocked, ThreadRunGHC)     \
+    R2 = c;                                     \
+    jump stg_returnToSchedButFirst [R1,R2,R3];
 
 #define YIELD_TO_INTERPRETER			\
-  PRE_RETURN(ThreadYielding, ThreadInterpret)	\
-  jump stg_returnToSchedNotPaused;
+    PRE_RETURN(ThreadYielding, ThreadInterpret) \
+    jump stg_returnToSchedNotPaused [R1];
 
 /* -----------------------------------------------------------------------------
    Heap checks in thunks/functions.
@@ -144,19 +153,55 @@ import LeaveCriticalSection;
    There are canned sequences for 'n' pointer values in registers.
    -------------------------------------------------------------------------- */
 
-INFO_TABLE_RET( stg_enter, RET_SMALL, P_ unused)
+INFO_TABLE_RET ( stg_enter, RET_SMALL, W_ info_ptr, P_ closure )
+    return (/* no return values */)
 {
-    R1 = Sp(1);
-    Sp_adj(2);
-    ENTER();
+    ENTER(closure);
 }
 
-__stg_gc_enter_1
+__stg_gc_enter_1 (P_ node)
 {
-    Sp_adj(-2);
-    Sp(1) = R1;
-    Sp(0) = stg_enter_info;
-    GC_GENERIC
+    jump stg_gc_noregs (stg_enter_info, node) ();
+}
+
+/* -----------------------------------------------------------------------------
+   Canned heap checks for primitives.
+
+   We can't use stg_gc_fun because primitives are not functions, so
+   these fragments let us save some boilerplate heap-check-failure
+   code in a few common cases.
+   -------------------------------------------------------------------------- */
+
+stg_gc_prim ()
+{
+    W_ fun;
+    fun = R9;
+    call stg_gc_noregs ();
+    jump fun();
+}
+
+stg_gc_prim_p (P_ arg)
+{
+    W_ fun;
+    fun = R9;
+    call stg_gc_noregs ();
+    jump fun(arg);
+}
+
+stg_gc_prim_pp (P_ arg1, P_ arg2)
+{
+    W_ fun;
+    fun = R9;
+    call stg_gc_noregs ();
+    jump fun(arg1,arg2);
+}
+
+stg_gc_prim_n (W_ arg)
+{
+    W_ fun;
+    fun = R9;
+    call stg_gc_noregs ();
+    jump fun(arg);
 }
 
 /* -----------------------------------------------------------------------------
@@ -169,138 +214,121 @@ __stg_gc_enter_1
    -------------------------------------------------------------------------- */
 
 /* The stg_enter_checkbh frame has the same shape as an update frame: */
-#if defined(PROFILING)
-#define UPD_FRAME_PARAMS W_ unused1, W_ unused2, P_ unused3
-#else
-#define UPD_FRAME_PARAMS P_ unused1
-#endif
 
-INFO_TABLE_RET( stg_enter_checkbh, RET_SMALL, UPD_FRAME_PARAMS)
+INFO_TABLE_RET ( stg_enter_checkbh, RET_SMALL,
+                 UPDATE_FRAME_FIELDS(W_,P_,info_ptr,updatee))
+    return (P_ ret)
 {
-    R1 = StgUpdateFrame_updatee(Sp);
-    Sp = Sp + SIZEOF_StgUpdateFrame;
     foreign "C" checkBlockingQueues(MyCapability() "ptr",
-                                    CurrentTSO) [R1];
-    ENTER();
+                                    CurrentTSO);
+    return (updatee);
 }
 
 /* -----------------------------------------------------------------------------
-   Heap checks in Primitive case alternatives
-
-   A primitive case alternative is entered with a value either in 
-   R1, FloatReg1 or D1 depending on the return convention.  All the
-   cases are covered below.
+   Info tables for returning values of various types.  These are used
+   when we want to push a frame on the stack that will return a value
+   to the frame underneath it.
    -------------------------------------------------------------------------- */
 
-/*-- No Registers live ------------------------------------------------------ */
-
-stg_gc_noregs
+INFO_TABLE_RET ( stg_ret_v, RET_SMALL, W_ info_ptr )
+    return (/* no return values */)
 {
-    GC_GENERIC
+    return ();
 }
 
-/*-- void return ------------------------------------------------------------ */
-
-INFO_TABLE_RET( stg_gc_void, RET_SMALL)
+INFO_TABLE_RET ( stg_ret_p, RET_SMALL, W_ info_ptr, P_ ptr )
+    return (/* no return values */)
 {
-    Sp_adj(1);
-    jump %ENTRY_CODE(Sp(0));
+    return (ptr);
 }
 
-/*-- R1 is boxed/unpointed -------------------------------------------------- */
-
-INFO_TABLE_RET( stg_gc_unpt_r1, RET_SMALL, P_ unused)
+INFO_TABLE_RET ( stg_ret_n, RET_SMALL, W_ info_ptr, W_ nptr )
+    return (/* no return values */)
 {
-    R1 = Sp(1);
-    Sp_adj(2);
-    jump %ENTRY_CODE(Sp(0));
+    return (nptr);
 }
 
-stg_gc_unpt_r1
+INFO_TABLE_RET ( stg_ret_f, RET_SMALL, W_ info_ptr, F_ f )
+    return (/* no return values */)
 {
-    Sp_adj(-2);
-    Sp(1) = R1;
-    Sp(0) = stg_gc_unpt_r1_info;
-    GC_GENERIC
+    return (f);
 }
 
-/*-- R1 is unboxed -------------------------------------------------- */
-
-/* the 1 is a bitmap - i.e. 1 non-pointer word on the stack. */
-INFO_TABLE_RET(	stg_gc_unbx_r1, RET_SMALL, W_ unused )
+INFO_TABLE_RET ( stg_ret_d, RET_SMALL, W_ info_ptr, D_ d )
+    return (/* no return values */)
 {
-    R1 = Sp(1);
-    Sp_adj(2);
-    jump %ENTRY_CODE(Sp(0));
+    return (d);
 }
 
-stg_gc_unbx_r1
+INFO_TABLE_RET ( stg_ret_l, RET_SMALL, W_ info_ptr, L_ l )
+    return (/* no return values */)
 {
-    Sp_adj(-2);
-    Sp(1) = R1;
-    Sp(0) = stg_gc_unbx_r1_info;
-    GC_GENERIC
+    return (l);
 }
 
-/*-- F1 contains a float ------------------------------------------------- */
+/* -----------------------------------------------------------------------------
+   Canned heap-check failures for case alts, where we have some values
+   in registers or on the stack according to the NativeReturn
+   convention.
+   -------------------------------------------------------------------------- */
+
 
-INFO_TABLE_RET(	stg_gc_f1, RET_SMALL, F_ unused )
+/*-- void return ------------------------------------------------------------ */
+
+/*-- R1 is a GC pointer, but we don't enter it ----------------------- */
+
+stg_gc_unpt_r1 return (P_ ptr) /* NB. return convention */
 {
-    F1 = F_[Sp+WDS(1)];
-    Sp_adj(2);
-    jump %ENTRY_CODE(Sp(0));
+    jump stg_gc_noregs (stg_ret_p_info, ptr) ();
 }
 
-stg_gc_f1
+/*-- R1 is unboxed -------------------------------------------------- */
+
+stg_gc_unbx_r1 return (W_ nptr) /* NB. return convention */
 {
-    Sp_adj(-2);
-    F_[Sp + WDS(1)] = F1;
-    Sp(0) = stg_gc_f1_info;
-    GC_GENERIC
+    jump stg_gc_noregs (stg_ret_n_info, nptr) ();
 }
 
-/*-- D1 contains a double ------------------------------------------------- */
+/*-- F1 contains a float ------------------------------------------------- */
 
-INFO_TABLE_RET(	stg_gc_d1, RET_SMALL, D_ unused )
+stg_gc_f1 return (F_ f)
 {
-    D1 = D_[Sp + WDS(1)];
-    Sp = Sp + WDS(1) + SIZEOF_StgDouble;
-    jump %ENTRY_CODE(Sp(0));
+    jump stg_gc_noregs (stg_ret_f_info, f) ();
 }
 
-stg_gc_d1
+/*-- D1 contains a double ------------------------------------------------- */
+
+stg_gc_d1 return (D_ d)
 {
-    Sp = Sp - WDS(1) - SIZEOF_StgDouble;
-    D_[Sp + WDS(1)] = D1;
-    Sp(0) = stg_gc_d1_info;
-    GC_GENERIC
+    jump stg_gc_noregs (stg_ret_d_info, d) ();
 }
 
 
 /*-- L1 contains an int64 ------------------------------------------------- */
 
-INFO_TABLE_RET( stg_gc_l1, RET_SMALL, L_ unused )
+stg_gc_l1 return (L_ l)
 {
-    L1 = L_[Sp + WDS(1)];
-    Sp_adj(1) + SIZEOF_StgWord64;
-    jump %ENTRY_CODE(Sp(0));
+    jump stg_gc_noregs (stg_ret_l_info, l) ();
 }
 
-stg_gc_l1
+/*-- Unboxed tuples with multiple pointers -------------------------------- */
+
+stg_gc_pp return (P_ arg1, P_ arg2)
 {
-    Sp_adj(-1) - SIZEOF_StgWord64;
-    L_[Sp + WDS(1)] = L1;
-    Sp(0) = stg_gc_l1_info;
-    GC_GENERIC
+    call stg_gc_noregs();
+    return (arg1,arg2);
 }
 
-/*-- Unboxed tuple return, one pointer (unregisterised build only) ---------- */
+stg_gc_ppp return (P_ arg1, P_ arg2, P_ arg3)
+{
+    call stg_gc_noregs();
+    return (arg1,arg2,arg3);
+}
 
-INFO_TABLE_RET( stg_ut_1_0_unreg, RET_SMALL, P_ unused )
+stg_gc_pppp return (P_ arg1, P_ arg2, P_ arg3, P_ arg4)
 {
-    Sp_adj(1);
-    // one ptr is on the stack (Sp(0))
-    jump %ENTRY_CODE(Sp(1));
+    call stg_gc_noregs();
+    return (arg1,arg2,arg3,arg4);
 }
 
 /* -----------------------------------------------------------------------------
@@ -333,7 +361,7 @@ INFO_TABLE_RET( stg_ut_1_0_unreg, RET_SMALL, P_ unused )
 
    -------------------------------------------------------------------------- */
 
-__stg_gc_fun
+__stg_gc_fun /* explicit stack */
 {
     W_ size;
     W_ info;
@@ -365,7 +393,7 @@ __stg_gc_fun
     Sp(2) = R1;
     Sp(1) = size;
     Sp(0) = stg_gc_fun_info;
-    GC_GENERIC
+    jump stg_gc_noregs [];
 #else
     W_ type;
     type = TO_W_(StgFunInfoExtra_fun_type(info));
@@ -377,14 +405,15 @@ __stg_gc_fun
         Sp(1) = size;
         Sp(0) = stg_gc_fun_info;
         // DEBUG_ONLY(foreign "C" debugBelch("stg_fun_gc_gen(ARG_GEN)"););
-        GC_GENERIC
+        jump stg_gc_noregs [];
     } else { 
-	jump W_[stg_stack_save_entries + WDS(type)];
+        jump W_[stg_stack_save_entries + WDS(type)] [*]; // all regs live
 	    // jumps to stg_gc_noregs after saving stuff
     }
 #endif /* !NO_ARG_REGS */
 }
 
+
 /* -----------------------------------------------------------------------------
    Generic Apply (return point)
 
@@ -393,14 +422,15 @@ __stg_gc_fun
    appropriately.  The stack layout is given above.
    -------------------------------------------------------------------------- */
 
-INFO_TABLE_RET( stg_gc_fun, RET_FUN )
+INFO_TABLE_RET ( stg_gc_fun, RET_FUN )
+    /* explicit stack */
 {
     R1 = Sp(2);
     Sp_adj(3);
 #ifdef NO_ARG_REGS
     // Minor optimisation: there are no argument registers to load up,
     // so we can just jump straight to the function's entry point.
-    jump %GET_ENTRY(UNTAG(R1));
+    jump %GET_ENTRY(UNTAG(R1)) [R1];
 #else
     W_ info;
     W_ type;
@@ -408,126 +438,25 @@ INFO_TABLE_RET( stg_gc_fun, RET_FUN )
     info = %GET_FUN_INFO(UNTAG(R1));
     type = TO_W_(StgFunInfoExtra_fun_type(info));
     if (type == ARG_GEN || type == ARG_GEN_BIG) {
-	jump StgFunInfoExtra_slow_apply(info);
+        jump StgFunInfoExtra_slow_apply(info) [R1];
     } else { 
 	if (type == ARG_BCO) {
 	    // cover this case just to be on the safe side
 	    Sp_adj(-2);
 	    Sp(1) = R1;
 	    Sp(0) = stg_apply_interp_info;
-	    jump stg_yield_to_interpreter;
+            jump stg_yield_to_interpreter [];
 	} else {
-	    jump W_[stg_ap_stack_entries + WDS(type)];
+            jump W_[stg_ap_stack_entries + WDS(type)] [R1];
 	}
     }
 #endif
 }
 
 /* -----------------------------------------------------------------------------
-   Generic Heap Check Code.
-
-   Called with Liveness mask in R9,  Return address in R10.
-   Stack must be consistent (containing all necessary info pointers
-   to relevant SRTs).
-
-   See StgMacros.h for a description of the RET_DYN stack frame.
-
-   We also define an stg_gen_yield here, because it's very similar.
-   -------------------------------------------------------------------------- */
-
-// For simplicity, we assume that SIZEOF_DOUBLE == 2*SIZEOF_VOID_P
-// on a 64-bit machine, we'll end up wasting a couple of words, but
-// it's not a big deal.
-
-#define RESTORE_EVERYTHING			\
-    L1   = L_[Sp + WDS(19)];			\
-    D2   = D_[Sp + WDS(17)];			\
-    D1   = D_[Sp + WDS(15)];			\
-    F4   = F_[Sp + WDS(14)];			\
-    F3   = F_[Sp + WDS(13)];			\
-    F2   = F_[Sp + WDS(12)];			\
-    F1   = F_[Sp + WDS(11)];			\
-    R8 = Sp(10);				\
-    R7 = Sp(9);					\
-    R6 = Sp(8);					\
-    R5 = Sp(7);					\
-    R4 = Sp(6);					\
-    R3 = Sp(5);					\
-    R2 = Sp(4);					\
-    R1 = Sp(3);					\
-    Sp_adj(21);
-
-#define RET_OFFSET (-19)
-
-#define SAVE_EVERYTHING				\
-    Sp_adj(-21);				\
-    L_[Sp + WDS(19)] = L1;			\
-    D_[Sp + WDS(17)] = D2;			\
-    D_[Sp + WDS(15)] = D1;			\
-    F_[Sp + WDS(14)] = F4;			\
-    F_[Sp + WDS(13)] = F3;			\
-    F_[Sp + WDS(12)] = F2;			\
-    F_[Sp + WDS(11)] = F1;			\
-    Sp(10) = R8;				\
-    Sp(9) = R7;					\
-    Sp(8) = R6;					\
-    Sp(7) = R5;					\
-    Sp(6) = R4;					\
-    Sp(5) = R3;					\
-    Sp(4) = R2;					\
-    Sp(3) = R1;					\
-    Sp(2) = R10;    /* return address */	\
-    Sp(1) = R9;     /* liveness mask  */	\
-    Sp(0) = stg_gc_gen_info;
-
-INFO_TABLE_RET( stg_gc_gen, RET_DYN )
-/* bitmap in the above info table is unused, the real one is on the stack. */
-{
-    RESTORE_EVERYTHING;
-    jump Sp(RET_OFFSET); /* No %ENTRY_CODE( - this is an actual code ptr */
-}
-
-stg_gc_gen
-{
-    // Hack; see Note [mvar-heap-check] in PrimOps.cmm
-    if (R10 == stg_putMVarzh || R10 == stg_takeMVarzh) {
-       unlockClosure(R1, stg_MVAR_DIRTY_info)
-    }
-    SAVE_EVERYTHING;
-    GC_GENERIC
-}
-
-// A heap check at an unboxed tuple return point.  The return address
-// is on the stack, and we can find it by using the offsets given
-// to us in the liveness mask.
-stg_gc_ut
-{
-    R10 = %ENTRY_CODE(Sp(RET_DYN_NONPTRS(R9) + RET_DYN_PTRS(R9)));
-    SAVE_EVERYTHING;
-    GC_GENERIC
-}
-
-/*
- * stg_gen_hp is used by MAYBE_GC, where we can't use GC_GENERIC
- * because we've just failed doYouWantToGC(), not a standard heap
- * check.  GC_GENERIC would end up returning StackOverflow.
- */
-stg_gc_gen_hp
-{
-    SAVE_EVERYTHING;
-    HP_GENERIC
-}	  
-
-/* -----------------------------------------------------------------------------
    Yields
    -------------------------------------------------------------------------- */
 
-stg_gen_yield
-{
-    SAVE_EVERYTHING;
-    YIELD_GENERIC
-}
-
 stg_yield_noregs
 {
     YIELD_GENERIC;
@@ -546,25 +475,11 @@ stg_yield_to_interpreter
    Blocks
    -------------------------------------------------------------------------- */
 
-stg_gen_block
-{
-    SAVE_EVERYTHING;
-    BLOCK_GENERIC;
-}
-
 stg_block_noregs
 {
     BLOCK_GENERIC;
 }
 
-stg_block_1
-{
-    Sp_adj(-2);
-    Sp(1) = R1;
-    Sp(0) = stg_enter_info;
-    BLOCK_GENERIC;
-}
-
 /* -----------------------------------------------------------------------------
  * takeMVar/putMVar-specific blocks
  *
@@ -585,52 +500,48 @@ stg_block_1
  * 
  * -------------------------------------------------------------------------- */
 
-INFO_TABLE_RET( stg_block_takemvar, RET_SMALL, P_ unused )
+INFO_TABLE_RET ( stg_block_takemvar, RET_SMALL, W_ info_ptr, P_ mvar )
+    return ()
 {
-    R1 = Sp(1);
-    Sp_adj(2);
-    jump stg_takeMVarzh;
+    jump stg_takeMVarzh(mvar);
 }
 
 // code fragment executed just before we return to the scheduler
 stg_block_takemvar_finally
 {
     unlockClosure(R3, stg_MVAR_DIRTY_info);
-    jump StgReturn;
+    jump StgReturn [R1];
 }
 
-stg_block_takemvar
+stg_block_takemvar /* mvar passed in R1 */
 {
     Sp_adj(-2);
     Sp(1) = R1;
     Sp(0) = stg_block_takemvar_info;
-    R3 = R1;
+    R3 = R1; // mvar communicated to stg_block_takemvar_finally in R3
     BLOCK_BUT_FIRST(stg_block_takemvar_finally);
 }
 
-INFO_TABLE_RET( stg_block_putmvar, RET_SMALL, P_ unused1, P_ unused2 )
+INFO_TABLE_RET( stg_block_putmvar, RET_SMALL, W_ info_ptr,
+                P_ mvar, P_ val )
+    return ()
 {
-    R2 = Sp(2);
-    R1 = Sp(1);
-    Sp_adj(3);
-    jump stg_putMVarzh;
+    jump stg_putMVarzh(mvar, val);
 }
 
 // code fragment executed just before we return to the scheduler
 stg_block_putmvar_finally
 {
     unlockClosure(R3, stg_MVAR_DIRTY_info);
-    jump StgReturn;
+    jump StgReturn [R1];
 }
 
-stg_block_putmvar
+stg_block_putmvar (P_ mvar, P_ val)
 {
-    Sp_adj(-3);
-    Sp(2) = R2;
-    Sp(1) = R1;
-    Sp(0) = stg_block_putmvar_info;
-    R3 = R1;
-    BLOCK_BUT_FIRST(stg_block_putmvar_finally);
+    push (stg_block_putmvar_info, mvar, val) {
+      R3 = R1; // mvar communicated to stg_block_putmvar_finally in R3
+      BLOCK_BUT_FIRST(stg_block_putmvar_finally);
+   }
 }
 
 stg_block_blackhole
@@ -641,12 +552,11 @@ stg_block_blackhole
     BLOCK_GENERIC;
 }
 
-INFO_TABLE_RET( stg_block_throwto, RET_SMALL, P_ unused, P_ unused )
+INFO_TABLE_RET ( stg_block_throwto, RET_SMALL, W_ info_ptr,
+                 P_ tso, P_ exception )
+    return ()
 {
-    R2 = Sp(2);
-    R1 = Sp(1);
-    Sp_adj(3);
-    jump stg_killThreadzh;
+    jump stg_killThreadzh(tso, exception);
 }
 
 stg_block_throwto_finally
@@ -657,30 +567,26 @@ stg_block_throwto_finally
     if (StgHeader_info(StgTSO_block_info(CurrentTSO)) == stg_WHITEHOLE_info) {
         unlockClosure(StgTSO_block_info(CurrentTSO), stg_MSG_THROWTO_info);
     }
-    jump StgReturn;
+    jump StgReturn [R1];
 }
 
-stg_block_throwto
+stg_block_throwto (P_ tso, P_ exception)
 {
-    Sp_adj(-3);
-    Sp(2) = R2;
-    Sp(1) = R1;
-    Sp(0) = stg_block_throwto_info;
-    BLOCK_BUT_FIRST(stg_block_throwto_finally);
+    push (stg_block_throwto_info, tso, exception) {
+       BLOCK_BUT_FIRST(stg_block_throwto_finally);
+    }
 }
 
 #ifdef mingw32_HOST_OS
-INFO_TABLE_RET( stg_block_async, RET_SMALL, W_ unused )
+INFO_TABLE_RET ( stg_block_async, RET_SMALL, W_ info_ptr, W_ ares )
+    return ()
 {
-    W_ ares;
     W_ len, errC;
 
-    ares = Sp(1);
     len = TO_W_(StgAsyncIOResult_len(ares));
     errC = TO_W_(StgAsyncIOResult_errCode(ares));
-    foreign "C" free(ares "ptr");
-    Sp_adj(2);
-    RET_NN(len, errC);
+    ccall free(ares "ptr");
+    return (len, errC);
 }
 
 stg_block_async
@@ -693,14 +599,11 @@ stg_block_async
 /* Used by threadDelay implementation; it would be desirable to get rid of
  * this free()'ing void return continuation.
  */
-INFO_TABLE_RET( stg_block_async_void, RET_SMALL, W_ ares )
+INFO_TABLE_RET ( stg_block_async_void, RET_SMALL, W_ info_ptr, W_ ares )
+    return ()
 {
-    W_ ares;
-
-    ares = Sp(1);
-    foreign "C" free(ares "ptr");
-    Sp_adj(2);
-    jump %ENTRY_CODE(Sp(0));
+    ccall free(ares "ptr");
+    return ();
 }
 
 stg_block_async_void
@@ -712,14 +615,15 @@ stg_block_async_void
 
 #endif
 
+
 /* -----------------------------------------------------------------------------
    STM-specific waiting
    -------------------------------------------------------------------------- */
 
 stg_block_stmwait_finally
 {
-    foreign "C" stmWaitUnlock(MyCapability() "ptr", R3 "ptr");
-    jump StgReturn;
+    ccall stmWaitUnlock(MyCapability() "ptr", R3 "ptr");
+    jump StgReturn [R1];
 }
 
 stg_block_stmwait
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index 83973e8c9b..2eb2d0789f 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -503,7 +503,7 @@ do_return:
     // 	  |   XXXX_info   |
     // 	  +---------------+
     //
-    // where XXXX_info is one of the stg_gc_unbx_r1_info family.
+    // where XXXX_info is one of the stg_ret_*_info family.
     //
     // We're only interested in the case when the real return address
     // is a BCO; otherwise we'll return to the scheduler.
@@ -512,12 +512,12 @@ do_return_unboxed:
     { 
 	int offset;
 	
-	ASSERT( Sp[0] == (W_)&stg_gc_unbx_r1_info
-		|| Sp[0] == (W_)&stg_gc_unpt_r1_info
-		|| Sp[0] == (W_)&stg_gc_f1_info
-		|| Sp[0] == (W_)&stg_gc_d1_info
-		|| Sp[0] == (W_)&stg_gc_l1_info
-		|| Sp[0] == (W_)&stg_gc_void_info // VoidRep
+        ASSERT(    Sp[0] == (W_)&stg_ret_v_info
+                || Sp[0] == (W_)&stg_ret_p_info
+                || Sp[0] == (W_)&stg_ret_n_info
+                || Sp[0] == (W_)&stg_ret_f_info
+                || Sp[0] == (W_)&stg_ret_d_info
+                || Sp[0] == (W_)&stg_ret_l_info
 	    );
 
 	// get the offset of the stg_ctoi_ret_XXX itbl
@@ -1336,27 +1336,27 @@ run_BCO:
 
 	case bci_RETURN_P:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_unpt_r1_info;
+            Sp[0] = (W_)&stg_ret_p_info;
 	    goto do_return_unboxed;
 	case bci_RETURN_N:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_unbx_r1_info;
+            Sp[0] = (W_)&stg_ret_n_info;
 	    goto do_return_unboxed;
 	case bci_RETURN_F:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_f1_info;
+            Sp[0] = (W_)&stg_ret_f_info;
 	    goto do_return_unboxed;
 	case bci_RETURN_D:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_d1_info;
+            Sp[0] = (W_)&stg_ret_d_info;
 	    goto do_return_unboxed;
 	case bci_RETURN_L:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_l1_info;
+            Sp[0] = (W_)&stg_ret_l_info;
 	    goto do_return_unboxed;
 	case bci_RETURN_V:
 	    Sp--;
-	    Sp[0] = (W_)&stg_gc_void_info;
+            Sp[0] = (W_)&stg_ret_v_info;
 	    goto do_return_unboxed;
 
 	case bci_SWIZZLE: {
@@ -1372,9 +1372,6 @@ run_BCO:
 	    int o_itbl                = BCO_GET_LARGE_ARG;
 	    int interruptible         = BCO_NEXT;
 	    void(*marshall_fn)(void*) = (void (*)(void*))BCO_LIT(o_itbl);
-	    int ret_dyn_size = 
-		RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE
-		+ sizeofW(StgRetDyn);
 
             /* the stack looks like this:
                
@@ -1405,6 +1402,7 @@ run_BCO:
             nat nargs = cif->nargs;
             nat ret_size;
             nat i;
+            int j;
             StgPtr p;
             W_ ret[2];                  // max needed
 	    W_ *arguments[stk_offset];  // max needed
@@ -1446,17 +1444,19 @@ run_BCO:
 	    //
 	    // We know how many (non-ptr) words there are before the
 	    // next valid stack frame: it is the stk_offset arg to the
-	    // CCALL instruction.   So we build a RET_DYN stack frame
-	    // on the stack frame to describe this chunk of stack.
-	    //
-	    Sp -= ret_dyn_size;
-	    ((StgRetDyn *)Sp)->liveness = R1_PTR | N_NONPTRS(stk_offset);
-	    ((StgRetDyn *)Sp)->info = (StgInfoTable *)&stg_gc_gen_info;
+	    // CCALL instruction.   So we overwrite this area of the
+            // stack with empty stack frames (stg_ret_v_info);
+            //
+            for (j = 0; j < stk_offset; j++) {
+                Sp[j] = (W_)&stg_ret_v_info; /* an empty stack frame */
+            }
 
             // save obj (pointer to the current BCO), since this
-            // might move during the call.  We use the R1 slot in the
-            // RET_DYN frame for this, hence R1_PTR above.
-            ((StgRetDyn *)Sp)->payload[0] = (StgClosure *)obj;
+            // might move during the call.  We push an stg_ret_p frame
+            // for this.
+            Sp -= 2;
+            Sp[1] = (W_)obj;
+            Sp[0] = (W_)&stg_ret_p_info;
 
 	    SAVE_STACK_POINTERS;
 	    tok = suspendThread(&cap->r, interruptible ? rtsTrue : rtsFalse);
@@ -1464,11 +1464,11 @@ run_BCO:
 	    // We already made a copy of the arguments above.
             ffi_call(cif, fn, ret, argptrs);
 
-	    // And restart the thread again, popping the RET_DYN frame.
+            // And restart the thread again, popping the stg_ret_p frame.
 	    cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - STG_FIELD_OFFSET(Capability,r)));
 	    LOAD_STACK_POINTERS;
 
-            if (Sp[0] != (W_)&stg_gc_gen_info) {
+            if (Sp[0] != (W_)&stg_ret_p_info) {
                 // the stack is not how we left it.  This probably
                 // means that an exception got raised on exit from the
                 // foreign call, so we should just continue with
@@ -1476,16 +1476,16 @@ run_BCO:
                 RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
             }
 
-            // Re-load the pointer to the BCO from the RET_DYN frame,
+            // Re-load the pointer to the BCO from the stg_ret_p frame,
             // it might have moved during the call.  Also reload the
             // pointers to the components of the BCO.
-            obj        = ((StgRetDyn *)Sp)->payload[0];
+            obj        = (P_)Sp[1];
             bco        = (StgBCO*)obj;
             instrs     = (StgWord16*)(bco->instrs->payload);
             literals   = (StgWord*)(&bco->literals->payload[0]);
             ptrs       = (StgPtr*)(&bco->ptrs->payload[0]);
 
-	    Sp += ret_dyn_size;
+            Sp += 2; // pop the stg_ret_p frame
 	    
 	    // Save the Haskell thread's current value of errno
 	    cap->r.rCurrentTSO->saved_errno = errno;
diff --git a/rts/LdvProfile.c b/rts/LdvProfile.c
index 7bc032e05d..8ccafef9e2 100644
--- a/rts/LdvProfile.c
+++ b/rts/LdvProfile.c
@@ -133,7 +133,6 @@ processHeapClosureForDead( StgClosure *c )
     case CATCH_FRAME:
     case UNDERFLOW_FRAME:
     case STOP_FRAME:
-    case RET_DYN:
     case RET_BCO:
     case RET_SMALL:
     case RET_BIG:
diff --git a/rts/Linker.c b/rts/Linker.c
index cf60c528d3..64d60f23d0 100644
--- a/rts/Linker.c
+++ b/rts/Linker.c
@@ -1011,301 +1011,300 @@ typedef struct _RtsSymbolVal {
 #endif
 
 
-#define RTS_SYMBOLS                                     \
-      Maybe_Stable_Names                                \
-      RTS_TICKY_SYMBOLS                                 \
-      SymI_HasProto(StgReturn)                          \
-      SymI_HasProto(stg_enter_info)                     \
-      SymI_HasProto(stg_gc_void_info)                   \
-      SymI_HasProto(__stg_gc_enter_1)                   \
-      SymI_HasProto(stg_gc_noregs)                      \
-      SymI_HasProto(stg_gc_unpt_r1_info)                \
-      SymI_HasProto(stg_gc_unpt_r1)                     \
-      SymI_HasProto(stg_gc_unbx_r1_info)                \
-      SymI_HasProto(stg_gc_unbx_r1)                     \
-      SymI_HasProto(stg_gc_f1_info)                     \
-      SymI_HasProto(stg_gc_f1)                          \
-      SymI_HasProto(stg_gc_d1_info)                     \
-      SymI_HasProto(stg_gc_d1)                          \
-      SymI_HasProto(stg_gc_l1_info)                     \
-      SymI_HasProto(stg_gc_l1)                          \
-      SymI_HasProto(__stg_gc_fun)                       \
-      SymI_HasProto(stg_gc_fun_info)                    \
-      SymI_HasProto(stg_gc_gen)                         \
-      SymI_HasProto(stg_gc_gen_info)                    \
-      SymI_HasProto(stg_gc_gen_hp)                      \
-      SymI_HasProto(stg_gc_ut)                          \
-      SymI_HasProto(stg_gen_yield)                      \
-      SymI_HasProto(stg_yield_noregs)                   \
-      SymI_HasProto(stg_yield_to_interpreter)           \
-      SymI_HasProto(stg_gen_block)                      \
-      SymI_HasProto(stg_block_noregs)                   \
-      SymI_HasProto(stg_block_1)                        \
-      SymI_HasProto(stg_block_takemvar)                 \
-      SymI_HasProto(stg_block_putmvar)                  \
-      MAIN_CAP_SYM                                      \
-      SymI_HasProto(MallocFailHook)                     \
-      SymI_HasProto(OnExitHook)                         \
-      SymI_HasProto(OutOfHeapHook)                      \
-      SymI_HasProto(StackOverflowHook)                  \
-      SymI_HasProto(addDLL)                             \
-      SymI_HasProto(__int_encodeDouble)                 \
-      SymI_HasProto(__word_encodeDouble)                \
-      SymI_HasProto(__2Int_encodeDouble)                \
-      SymI_HasProto(__int_encodeFloat)                  \
-      SymI_HasProto(__word_encodeFloat)                 \
-      SymI_HasProto(stg_atomicallyzh)                   \
-      SymI_HasProto(barf)                               \
-      SymI_HasProto(debugBelch)                         \
-      SymI_HasProto(errorBelch)                         \
-      SymI_HasProto(sysErrorBelch)                      \
-      SymI_HasProto(stg_getMaskingStatezh)              \
-      SymI_HasProto(stg_maskAsyncExceptionszh)          \
-      SymI_HasProto(stg_maskUninterruptiblezh)          \
-      SymI_HasProto(stg_catchzh)                        \
-      SymI_HasProto(stg_catchRetryzh)                   \
-      SymI_HasProto(stg_catchSTMzh)                     \
-      SymI_HasProto(stg_checkzh)                        \
-      SymI_HasProto(closure_flags)                      \
-      SymI_HasProto(cmp_thread)                         \
-      SymI_HasProto(createAdjustor)                     \
-      SymI_HasProto(stg_decodeDoublezu2Intzh)           \
-      SymI_HasProto(stg_decodeFloatzuIntzh)             \
-      SymI_HasProto(defaultsHook)                       \
-      SymI_HasProto(stg_delayzh)                        \
-      SymI_HasProto(stg_deRefWeakzh)                    \
-      SymI_HasProto(stg_deRefStablePtrzh)               \
-      SymI_HasProto(dirty_MUT_VAR)                      \
-      SymI_HasProto(stg_forkzh)                         \
-      SymI_HasProto(stg_forkOnzh)                       \
-      SymI_HasProto(forkProcess)                        \
-      SymI_HasProto(forkOS_createThread)                \
-      SymI_HasProto(freeHaskellFunctionPtr)             \
+#define RTS_SYMBOLS                                                     \
+      Maybe_Stable_Names                                                \
+      RTS_TICKY_SYMBOLS                                                 \
+      SymI_HasProto(StgReturn)                                          \
+      SymI_HasProto(stg_gc_noregs)                                      \
+      SymI_HasProto(stg_ret_v_info)                                     \
+      SymI_HasProto(stg_ret_p_info)                                     \
+      SymI_HasProto(stg_ret_n_info)                                     \
+      SymI_HasProto(stg_ret_f_info)                                     \
+      SymI_HasProto(stg_ret_d_info)                                     \
+      SymI_HasProto(stg_ret_l_info)                                     \
+      SymI_HasProto(stg_gc_prim_p)                                      \
+      SymI_HasProto(stg_gc_prim_pp)                                     \
+      SymI_HasProto(stg_gc_prim_n)                                      \
+      SymI_HasProto(stg_enter_info)                                     \
+      SymI_HasProto(__stg_gc_enter_1)                                   \
+      SymI_HasProto(stg_gc_unpt_r1)                                     \
+      SymI_HasProto(stg_gc_unbx_r1)                                     \
+      SymI_HasProto(stg_gc_f1)                                          \
+      SymI_HasProto(stg_gc_d1)                                          \
+      SymI_HasProto(stg_gc_l1)                                          \
+      SymI_HasProto(stg_gc_pp)                                          \
+      SymI_HasProto(stg_gc_ppp)                                         \
+      SymI_HasProto(stg_gc_pppp)                                        \
+      SymI_HasProto(__stg_gc_fun)                                       \
+      SymI_HasProto(stg_gc_fun_info)                                    \
+      SymI_HasProto(stg_yield_noregs)                                   \
+      SymI_HasProto(stg_yield_to_interpreter)                           \
+      SymI_HasProto(stg_block_noregs)                                   \
+      SymI_HasProto(stg_block_takemvar)                                 \
+      SymI_HasProto(stg_block_putmvar)                                  \
+      MAIN_CAP_SYM                                                      \
+      SymI_HasProto(MallocFailHook)                                     \
+      SymI_HasProto(OnExitHook)                                         \
+      SymI_HasProto(OutOfHeapHook)                                      \
+      SymI_HasProto(StackOverflowHook)                                  \
+      SymI_HasProto(addDLL)                                             \
+      SymI_HasProto(__int_encodeDouble)                                 \
+      SymI_HasProto(__word_encodeDouble)                                \
+      SymI_HasProto(__2Int_encodeDouble)                                \
+      SymI_HasProto(__int_encodeFloat)                                  \
+      SymI_HasProto(__word_encodeFloat)                                 \
+      SymI_HasProto(stg_atomicallyzh)                                   \
+      SymI_HasProto(barf)                                               \
+      SymI_HasProto(debugBelch)                                         \
+      SymI_HasProto(errorBelch)                                         \
+      SymI_HasProto(sysErrorBelch)                                      \
+      SymI_HasProto(stg_getMaskingStatezh)                              \
+      SymI_HasProto(stg_maskAsyncExceptionszh)                          \
+      SymI_HasProto(stg_maskUninterruptiblezh)                          \
+      SymI_HasProto(stg_catchzh)                                        \
+      SymI_HasProto(stg_catchRetryzh)                                   \
+      SymI_HasProto(stg_catchSTMzh)                                     \
+      SymI_HasProto(stg_checkzh)                                        \
+      SymI_HasProto(closure_flags)                                      \
+      SymI_HasProto(cmp_thread)                                         \
+      SymI_HasProto(createAdjustor)                                     \
+      SymI_HasProto(stg_decodeDoublezu2Intzh)                           \
+      SymI_HasProto(stg_decodeFloatzuIntzh)                             \
+      SymI_HasProto(defaultsHook)                                       \
+      SymI_HasProto(stg_delayzh)                                        \
+      SymI_HasProto(stg_deRefWeakzh)                                    \
+      SymI_HasProto(stg_deRefStablePtrzh)                               \
+      SymI_HasProto(dirty_MUT_VAR)                                      \
+      SymI_HasProto(stg_forkzh)                                         \
+      SymI_HasProto(stg_forkOnzh)                                       \
+      SymI_HasProto(forkProcess)                                        \
+      SymI_HasProto(forkOS_createThread)                                \
+      SymI_HasProto(freeHaskellFunctionPtr)                             \
       SymI_HasProto(getOrSetGHCConcSignalSignalHandlerStore)            \
       SymI_HasProto(getOrSetGHCConcWindowsPendingDelaysStore)           \
       SymI_HasProto(getOrSetGHCConcWindowsIOManagerThreadStore)         \
       SymI_HasProto(getOrSetGHCConcWindowsProddingStore)                \
       SymI_HasProto(getOrSetSystemEventThreadEventManagerStore)         \
       SymI_HasProto(getOrSetSystemEventThreadIOManagerThreadStore)      \
-      SymI_HasProto(getGCStats)                         \
-      SymI_HasProto(getGCStatsEnabled)                  \
-      SymI_HasProto(genSymZh)                           \
-      SymI_HasProto(genericRaise)                       \
-      SymI_HasProto(getProgArgv)                        \
-      SymI_HasProto(getFullProgArgv)                    \
-      SymI_HasProto(getStablePtr)                       \
-      SymI_HasProto(hs_init)                            \
-      SymI_HasProto(hs_exit)                            \
-      SymI_HasProto(hs_set_argv)                        \
-      SymI_HasProto(hs_add_root)                        \
-      SymI_HasProto(hs_perform_gc)                      \
-      SymI_HasProto(hs_free_stable_ptr)                 \
-      SymI_HasProto(hs_free_fun_ptr)                    \
-      SymI_HasProto(hs_hpc_rootModule)                  \
-      SymI_HasProto(hs_hpc_module)                      \
-      SymI_HasProto(initLinker)                         \
-      SymI_HasProto(stg_unpackClosurezh)                \
-      SymI_HasProto(stg_getApStackValzh)                \
-      SymI_HasProto(stg_getSparkzh)                     \
-      SymI_HasProto(stg_numSparkszh)                    \
-      SymI_HasProto(stg_isCurrentThreadBoundzh)         \
-      SymI_HasProto(stg_isEmptyMVarzh)                  \
-      SymI_HasProto(stg_killThreadzh)                   \
-      SymI_HasProto(loadArchive)                        \
-      SymI_HasProto(loadObj)                            \
-      SymI_HasProto(insertStableSymbol)                 \
-      SymI_HasProto(insertSymbol)                       \
-      SymI_HasProto(lookupSymbol)                       \
-      SymI_HasProto(stg_makeStablePtrzh)                \
-      SymI_HasProto(stg_mkApUpd0zh)                     \
-      SymI_HasProto(stg_myThreadIdzh)                   \
-      SymI_HasProto(stg_labelThreadzh)                  \
-      SymI_HasProto(stg_newArrayzh)                     \
-      SymI_HasProto(stg_newArrayArrayzh)                     \
-      SymI_HasProto(stg_newBCOzh)                       \
-      SymI_HasProto(stg_newByteArrayzh)                 \
-      SymI_HasProto_redirect(newCAF, newDynCAF)         \
-      SymI_HasProto(stg_newMVarzh)                      \
-      SymI_HasProto(stg_newMutVarzh)                    \
-      SymI_HasProto(stg_newTVarzh)                      \
-      SymI_HasProto(stg_noDuplicatezh)                  \
-      SymI_HasProto(stg_atomicModifyMutVarzh)           \
-      SymI_HasProto(stg_casMutVarzh)                    \
-      SymI_HasProto(stg_newPinnedByteArrayzh)           \
-      SymI_HasProto(stg_newAlignedPinnedByteArrayzh)    \
-      SymI_HasProto(newSpark)                           \
-      SymI_HasProto(performGC)                          \
-      SymI_HasProto(performMajorGC)                     \
-      SymI_HasProto(prog_argc)                          \
-      SymI_HasProto(prog_argv)                          \
-      SymI_HasProto(stg_putMVarzh)                      \
-      SymI_HasProto(stg_raisezh)                        \
-      SymI_HasProto(stg_raiseIOzh)                      \
-      SymI_HasProto(stg_readTVarzh)                     \
-      SymI_HasProto(stg_readTVarIOzh)                   \
-      SymI_HasProto(resumeThread)                       \
-      SymI_HasProto(setNumCapabilities)                 \
-      SymI_HasProto(getNumberOfProcessors)              \
-      SymI_HasProto(resolveObjs)                        \
-      SymI_HasProto(stg_retryzh)                        \
-      SymI_HasProto(rts_apply)                          \
-      SymI_HasProto(rts_checkSchedStatus)               \
-      SymI_HasProto(rts_eval)                           \
-      SymI_HasProto(rts_evalIO)                         \
-      SymI_HasProto(rts_evalLazyIO)                     \
-      SymI_HasProto(rts_evalStableIO)                   \
-      SymI_HasProto(rts_eval_)                          \
-      SymI_HasProto(rts_getBool)                        \
-      SymI_HasProto(rts_getChar)                        \
-      SymI_HasProto(rts_getDouble)                      \
-      SymI_HasProto(rts_getFloat)                       \
-      SymI_HasProto(rts_getInt)                         \
-      SymI_HasProto(rts_getInt8)                        \
-      SymI_HasProto(rts_getInt16)                       \
-      SymI_HasProto(rts_getInt32)                       \
-      SymI_HasProto(rts_getInt64)                       \
-      SymI_HasProto(rts_getPtr)                         \
-      SymI_HasProto(rts_getFunPtr)                      \
-      SymI_HasProto(rts_getStablePtr)                   \
-      SymI_HasProto(rts_getThreadId)                    \
-      SymI_HasProto(rts_getWord)                        \
-      SymI_HasProto(rts_getWord8)                       \
-      SymI_HasProto(rts_getWord16)                      \
-      SymI_HasProto(rts_getWord32)                      \
-      SymI_HasProto(rts_getWord64)                      \
-      SymI_HasProto(rts_lock)                           \
-      SymI_HasProto(rts_mkBool)                         \
-      SymI_HasProto(rts_mkChar)                         \
-      SymI_HasProto(rts_mkDouble)                       \
-      SymI_HasProto(rts_mkFloat)                        \
-      SymI_HasProto(rts_mkInt)                          \
-      SymI_HasProto(rts_mkInt8)                         \
-      SymI_HasProto(rts_mkInt16)                        \
-      SymI_HasProto(rts_mkInt32)                        \
-      SymI_HasProto(rts_mkInt64)                        \
-      SymI_HasProto(rts_mkPtr)                          \
-      SymI_HasProto(rts_mkFunPtr)                       \
-      SymI_HasProto(rts_mkStablePtr)                    \
-      SymI_HasProto(rts_mkString)                       \
-      SymI_HasProto(rts_mkWord)                         \
-      SymI_HasProto(rts_mkWord8)                        \
-      SymI_HasProto(rts_mkWord16)                       \
-      SymI_HasProto(rts_mkWord32)                       \
-      SymI_HasProto(rts_mkWord64)                       \
-      SymI_HasProto(rts_unlock)                         \
-      SymI_HasProto(rts_unsafeGetMyCapability)          \
-      SymI_HasProto(rtsSupportsBoundThreads)            \
-      SymI_HasProto(rts_isProfiled)                     \
-      SymI_HasProto(setProgArgv)                        \
-      SymI_HasProto(startupHaskell)                     \
-      SymI_HasProto(shutdownHaskell)                    \
-      SymI_HasProto(shutdownHaskellAndExit)             \
-      SymI_HasProto(stable_ptr_table)                   \
-      SymI_HasProto(stackOverflow)                      \
-      SymI_HasProto(stg_CAF_BLACKHOLE_info)             \
-      SymI_HasProto(stg_BLACKHOLE_info)                 \
-      SymI_HasProto(__stg_EAGER_BLACKHOLE_info)         \
-      SymI_HasProto(stg_BLOCKING_QUEUE_CLEAN_info)      \
-      SymI_HasProto(stg_BLOCKING_QUEUE_DIRTY_info)      \
-      SymI_HasProto(startTimer)                         \
-      SymI_HasProto(stg_MVAR_CLEAN_info)                \
-      SymI_HasProto(stg_MVAR_DIRTY_info)                \
-      SymI_HasProto(stg_IND_STATIC_info)                \
-      SymI_HasProto(stg_ARR_WORDS_info)                 \
-      SymI_HasProto(stg_MUT_ARR_PTRS_DIRTY_info)        \
-      SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN_info)       \
-      SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN0_info)      \
-      SymI_HasProto(stg_WEAK_info)                      \
-      SymI_HasProto(stg_ap_v_info)                      \
-      SymI_HasProto(stg_ap_f_info)                      \
-      SymI_HasProto(stg_ap_d_info)                      \
-      SymI_HasProto(stg_ap_l_info)                      \
-      SymI_HasProto(stg_ap_n_info)                      \
-      SymI_HasProto(stg_ap_p_info)                      \
-      SymI_HasProto(stg_ap_pv_info)                     \
-      SymI_HasProto(stg_ap_pp_info)                     \
-      SymI_HasProto(stg_ap_ppv_info)                    \
-      SymI_HasProto(stg_ap_ppp_info)                    \
-      SymI_HasProto(stg_ap_pppv_info)                   \
-      SymI_HasProto(stg_ap_pppp_info)                   \
-      SymI_HasProto(stg_ap_ppppp_info)                  \
-      SymI_HasProto(stg_ap_pppppp_info)                 \
-      SymI_HasProto(stg_ap_0_fast)                      \
-      SymI_HasProto(stg_ap_v_fast)                      \
-      SymI_HasProto(stg_ap_f_fast)                      \
-      SymI_HasProto(stg_ap_d_fast)                      \
-      SymI_HasProto(stg_ap_l_fast)                      \
-      SymI_HasProto(stg_ap_n_fast)                      \
-      SymI_HasProto(stg_ap_p_fast)                      \
-      SymI_HasProto(stg_ap_pv_fast)                     \
-      SymI_HasProto(stg_ap_pp_fast)                     \
-      SymI_HasProto(stg_ap_ppv_fast)                    \
-      SymI_HasProto(stg_ap_ppp_fast)                    \
-      SymI_HasProto(stg_ap_pppv_fast)                   \
-      SymI_HasProto(stg_ap_pppp_fast)                   \
-      SymI_HasProto(stg_ap_ppppp_fast)                  \
-      SymI_HasProto(stg_ap_pppppp_fast)                 \
-      SymI_HasProto(stg_ap_1_upd_info)                  \
-      SymI_HasProto(stg_ap_2_upd_info)                  \
-      SymI_HasProto(stg_ap_3_upd_info)                  \
-      SymI_HasProto(stg_ap_4_upd_info)                  \
-      SymI_HasProto(stg_ap_5_upd_info)                  \
-      SymI_HasProto(stg_ap_6_upd_info)                  \
-      SymI_HasProto(stg_ap_7_upd_info)                  \
-      SymI_HasProto(stg_exit)                           \
-      SymI_HasProto(stg_sel_0_upd_info)                 \
-      SymI_HasProto(stg_sel_10_upd_info)                \
-      SymI_HasProto(stg_sel_11_upd_info)                \
-      SymI_HasProto(stg_sel_12_upd_info)                \
-      SymI_HasProto(stg_sel_13_upd_info)                \
-      SymI_HasProto(stg_sel_14_upd_info)                \
-      SymI_HasProto(stg_sel_15_upd_info)                \
-      SymI_HasProto(stg_sel_1_upd_info)                 \
-      SymI_HasProto(stg_sel_2_upd_info)                 \
-      SymI_HasProto(stg_sel_3_upd_info)                 \
-      SymI_HasProto(stg_sel_4_upd_info)                 \
-      SymI_HasProto(stg_sel_5_upd_info)                 \
-      SymI_HasProto(stg_sel_6_upd_info)                 \
-      SymI_HasProto(stg_sel_7_upd_info)                 \
-      SymI_HasProto(stg_sel_8_upd_info)                 \
-      SymI_HasProto(stg_sel_9_upd_info)                 \
-      SymI_HasProto(stg_upd_frame_info)                 \
-      SymI_HasProto(stg_bh_upd_frame_info)              \
-      SymI_HasProto(suspendThread)                      \
-      SymI_HasProto(stg_takeMVarzh)                     \
-      SymI_HasProto(stg_threadStatuszh)                 \
-      SymI_HasProto(stg_tryPutMVarzh)                   \
-      SymI_HasProto(stg_tryTakeMVarzh)                  \
-      SymI_HasProto(stg_unmaskAsyncExceptionszh)        \
-      SymI_HasProto(unloadObj)                          \
-      SymI_HasProto(stg_unsafeThawArrayzh)              \
-      SymI_HasProto(stg_waitReadzh)                     \
-      SymI_HasProto(stg_waitWritezh)                    \
-      SymI_HasProto(stg_writeTVarzh)                    \
-      SymI_HasProto(stg_yieldzh)                        \
-      SymI_NeedsProto(stg_interp_constr_entry)          \
-      SymI_HasProto(stg_arg_bitmaps)                    \
-      SymI_HasProto(large_alloc_lim)                    \
-      SymI_HasProto(g0)                                 \
-      SymI_HasProto(allocate)                           \
-      SymI_HasProto(allocateExec)                       \
-      SymI_HasProto(freeExec)                           \
-      SymI_HasProto(getAllocations)                     \
-      SymI_HasProto(revertCAFs)                         \
-      SymI_HasProto(RtsFlags)                           \
-      SymI_NeedsProto(rts_breakpoint_io_action)         \
-      SymI_NeedsProto(rts_stop_next_breakpoint)         \
-      SymI_NeedsProto(rts_stop_on_exception)            \
-      SymI_HasProto(stopTimer)                          \
-      SymI_HasProto(n_capabilities)                     \
-      SymI_HasProto(stg_traceCcszh)                     \
-      SymI_HasProto(stg_traceEventzh)                   \
-      SymI_HasProto(getMonotonicNSec)                   \
-      SymI_HasProto(lockFile)                           \
-      SymI_HasProto(unlockFile)                         \
-      SymI_HasProto(startProfTimer)                     \
-      SymI_HasProto(stopProfTimer)                      \
-      RTS_USER_SIGNALS_SYMBOLS                          \
+      SymI_HasProto(getGCStats)                                         \
+      SymI_HasProto(getGCStatsEnabled)                                  \
+      SymI_HasProto(genSymZh)                                           \
+      SymI_HasProto(genericRaise)                                       \
+      SymI_HasProto(getProgArgv)                                        \
+      SymI_HasProto(getFullProgArgv)                                    \
+      SymI_HasProto(getStablePtr)                                       \
+      SymI_HasProto(hs_init)                                            \
+      SymI_HasProto(hs_exit)                                            \
+      SymI_HasProto(hs_set_argv)                                        \
+      SymI_HasProto(hs_add_root)                                        \
+      SymI_HasProto(hs_perform_gc)                                      \
+      SymI_HasProto(hs_free_stable_ptr)                                 \
+      SymI_HasProto(hs_free_fun_ptr)                                    \
+      SymI_HasProto(hs_hpc_rootModule)                                  \
+      SymI_HasProto(hs_hpc_module)                                      \
+      SymI_HasProto(initLinker)                                         \
+      SymI_HasProto(stg_unpackClosurezh)                                \
+      SymI_HasProto(stg_getApStackValzh)                                \
+      SymI_HasProto(stg_getSparkzh)                                     \
+      SymI_HasProto(stg_numSparkszh)                                    \
+      SymI_HasProto(stg_isCurrentThreadBoundzh)                         \
+      SymI_HasProto(stg_isEmptyMVarzh)                                  \
+      SymI_HasProto(stg_killThreadzh)                                   \
+      SymI_HasProto(loadArchive)                                        \
+      SymI_HasProto(loadObj)                                            \
+      SymI_HasProto(insertStableSymbol)                                 \
+      SymI_HasProto(insertSymbol)                                       \
+      SymI_HasProto(lookupSymbol)                                       \
+      SymI_HasProto(stg_makeStablePtrzh)                                \
+      SymI_HasProto(stg_mkApUpd0zh)                                     \
+      SymI_HasProto(stg_myThreadIdzh)                                   \
+      SymI_HasProto(stg_labelThreadzh)                                  \
+      SymI_HasProto(stg_newArrayzh)                                     \
+      SymI_HasProto(stg_newArrayArrayzh)                                \
+      SymI_HasProto(stg_newBCOzh)                                       \
+      SymI_HasProto(stg_newByteArrayzh)                                 \
+      SymI_HasProto_redirect(newCAF, newDynCAF)                         \
+      SymI_HasProto(stg_newMVarzh)                                      \
+      SymI_HasProto(stg_newMutVarzh)                                    \
+      SymI_HasProto(stg_newTVarzh)                                      \
+      SymI_HasProto(stg_noDuplicatezh)                                  \
+      SymI_HasProto(stg_atomicModifyMutVarzh)                           \
+      SymI_HasProto(stg_casMutVarzh)                                    \
+      SymI_HasProto(stg_newPinnedByteArrayzh)                           \
+      SymI_HasProto(stg_newAlignedPinnedByteArrayzh)                    \
+      SymI_HasProto(newSpark)                                           \
+      SymI_HasProto(performGC)                                          \
+      SymI_HasProto(performMajorGC)                                     \
+      SymI_HasProto(prog_argc)                                          \
+      SymI_HasProto(prog_argv)                                          \
+      SymI_HasProto(stg_putMVarzh)                                      \
+      SymI_HasProto(stg_raisezh)                                        \
+      SymI_HasProto(stg_raiseIOzh)                                      \
+      SymI_HasProto(stg_readTVarzh)                                     \
+      SymI_HasProto(stg_readTVarIOzh)                                   \
+      SymI_HasProto(resumeThread)                                       \
+      SymI_HasProto(setNumCapabilities)                                 \
+      SymI_HasProto(getNumberOfProcessors)                              \
+      SymI_HasProto(resolveObjs)                                        \
+      SymI_HasProto(stg_retryzh)                                        \
+      SymI_HasProto(rts_apply)                                          \
+      SymI_HasProto(rts_checkSchedStatus)                               \
+      SymI_HasProto(rts_eval)                                           \
+      SymI_HasProto(rts_evalIO)                                         \
+      SymI_HasProto(rts_evalLazyIO)                                     \
+      SymI_HasProto(rts_evalStableIO)                                   \
+      SymI_HasProto(rts_eval_)                                          \
+      SymI_HasProto(rts_getBool)                                        \
+      SymI_HasProto(rts_getChar)                                        \
+      SymI_HasProto(rts_getDouble)                                      \
+      SymI_HasProto(rts_getFloat)                                       \
+      SymI_HasProto(rts_getInt)                                         \
+      SymI_HasProto(rts_getInt8)                                        \
+      SymI_HasProto(rts_getInt16)                                       \
+      SymI_HasProto(rts_getInt32)                                       \
+      SymI_HasProto(rts_getInt64)                                       \
+      SymI_HasProto(rts_getPtr)                                         \
+      SymI_HasProto(rts_getFunPtr)                                      \
+      SymI_HasProto(rts_getStablePtr)                                   \
+      SymI_HasProto(rts_getThreadId)                                    \
+      SymI_HasProto(rts_getWord)                                        \
+      SymI_HasProto(rts_getWord8)                                       \
+      SymI_HasProto(rts_getWord16)                                      \
+      SymI_HasProto(rts_getWord32)                                      \
+      SymI_HasProto(rts_getWord64)                                      \
+      SymI_HasProto(rts_lock)                                           \
+      SymI_HasProto(rts_mkBool)                                         \
+      SymI_HasProto(rts_mkChar)                                         \
+      SymI_HasProto(rts_mkDouble)                                       \
+      SymI_HasProto(rts_mkFloat)                                        \
+      SymI_HasProto(rts_mkInt)                                          \
+      SymI_HasProto(rts_mkInt8)                                         \
+      SymI_HasProto(rts_mkInt16)                                        \
+      SymI_HasProto(rts_mkInt32)                                        \
+      SymI_HasProto(rts_mkInt64)                                        \
+      SymI_HasProto(rts_mkPtr)                                          \
+      SymI_HasProto(rts_mkFunPtr)                                       \
+      SymI_HasProto(rts_mkStablePtr)                                    \
+      SymI_HasProto(rts_mkString)                                       \
+      SymI_HasProto(rts_mkWord)                                         \
+      SymI_HasProto(rts_mkWord8)                                        \
+      SymI_HasProto(rts_mkWord16)                                       \
+      SymI_HasProto(rts_mkWord32)                                       \
+      SymI_HasProto(rts_mkWord64)                                       \
+      SymI_HasProto(rts_unlock)                                         \
+      SymI_HasProto(rts_unsafeGetMyCapability)                          \
+      SymI_HasProto(rtsSupportsBoundThreads)                            \
+      SymI_HasProto(rts_isProfiled)                                     \
+      SymI_HasProto(setProgArgv)                                        \
+      SymI_HasProto(startupHaskell)                                     \
+      SymI_HasProto(shutdownHaskell)                                    \
+      SymI_HasProto(shutdownHaskellAndExit)                             \
+      SymI_HasProto(stable_ptr_table)                                   \
+      SymI_HasProto(stackOverflow)                                      \
+      SymI_HasProto(stg_CAF_BLACKHOLE_info)                             \
+      SymI_HasProto(stg_BLACKHOLE_info)                                 \
+      SymI_HasProto(__stg_EAGER_BLACKHOLE_info)                         \
+      SymI_HasProto(stg_BLOCKING_QUEUE_CLEAN_info)                      \
+      SymI_HasProto(stg_BLOCKING_QUEUE_DIRTY_info)                      \
+      SymI_HasProto(startTimer)                                         \
+      SymI_HasProto(stg_MVAR_CLEAN_info)                                \
+      SymI_HasProto(stg_MVAR_DIRTY_info)                                \
+      SymI_HasProto(stg_IND_STATIC_info)                                \
+      SymI_HasProto(stg_ARR_WORDS_info)                                 \
+      SymI_HasProto(stg_MUT_ARR_PTRS_DIRTY_info)                        \
+      SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN_info)                       \
+      SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN0_info)                      \
+      SymI_HasProto(stg_WEAK_info)                                      \
+      SymI_HasProto(stg_ap_v_info)                                      \
+      SymI_HasProto(stg_ap_f_info)                                      \
+      SymI_HasProto(stg_ap_d_info)                                      \
+      SymI_HasProto(stg_ap_l_info)                                      \
+      SymI_HasProto(stg_ap_n_info)                                      \
+      SymI_HasProto(stg_ap_p_info)                                      \
+      SymI_HasProto(stg_ap_pv_info)                                     \
+      SymI_HasProto(stg_ap_pp_info)                                     \
+      SymI_HasProto(stg_ap_ppv_info)                                    \
+      SymI_HasProto(stg_ap_ppp_info)                                    \
+      SymI_HasProto(stg_ap_pppv_info)                                   \
+      SymI_HasProto(stg_ap_pppp_info)                                   \
+      SymI_HasProto(stg_ap_ppppp_info)                                  \
+      SymI_HasProto(stg_ap_pppppp_info)                                 \
+      SymI_HasProto(stg_ap_0_fast)                                      \
+      SymI_HasProto(stg_ap_v_fast)                                      \
+      SymI_HasProto(stg_ap_f_fast)                                      \
+      SymI_HasProto(stg_ap_d_fast)                                      \
+      SymI_HasProto(stg_ap_l_fast)                                      \
+      SymI_HasProto(stg_ap_n_fast)                                      \
+      SymI_HasProto(stg_ap_p_fast)                                      \
+      SymI_HasProto(stg_ap_pv_fast)                                     \
+      SymI_HasProto(stg_ap_pp_fast)                                     \
+      SymI_HasProto(stg_ap_ppv_fast)                                    \
+      SymI_HasProto(stg_ap_ppp_fast)                                    \
+      SymI_HasProto(stg_ap_pppv_fast)                                   \
+      SymI_HasProto(stg_ap_pppp_fast)                                   \
+      SymI_HasProto(stg_ap_ppppp_fast)                                  \
+      SymI_HasProto(stg_ap_pppppp_fast)                                 \
+      SymI_HasProto(stg_ap_1_upd_info)                                  \
+      SymI_HasProto(stg_ap_2_upd_info)                                  \
+      SymI_HasProto(stg_ap_3_upd_info)                                  \
+      SymI_HasProto(stg_ap_4_upd_info)                                  \
+      SymI_HasProto(stg_ap_5_upd_info)                                  \
+      SymI_HasProto(stg_ap_6_upd_info)                                  \
+      SymI_HasProto(stg_ap_7_upd_info)                                  \
+      SymI_HasProto(stg_exit)                                           \
+      SymI_HasProto(stg_sel_0_upd_info)                                 \
+      SymI_HasProto(stg_sel_10_upd_info)                                \
+      SymI_HasProto(stg_sel_11_upd_info)                                \
+      SymI_HasProto(stg_sel_12_upd_info)                                \
+      SymI_HasProto(stg_sel_13_upd_info)                                \
+      SymI_HasProto(stg_sel_14_upd_info)                                \
+      SymI_HasProto(stg_sel_15_upd_info)                                \
+      SymI_HasProto(stg_sel_1_upd_info)                                 \
+      SymI_HasProto(stg_sel_2_upd_info)                                 \
+      SymI_HasProto(stg_sel_3_upd_info)                                 \
+      SymI_HasProto(stg_sel_4_upd_info)                                 \
+      SymI_HasProto(stg_sel_5_upd_info)                                 \
+      SymI_HasProto(stg_sel_6_upd_info)                                 \
+      SymI_HasProto(stg_sel_7_upd_info)                                 \
+      SymI_HasProto(stg_sel_8_upd_info)                                 \
+      SymI_HasProto(stg_sel_9_upd_info)                                 \
+      SymI_HasProto(stg_upd_frame_info)                                 \
+      SymI_HasProto(stg_bh_upd_frame_info)                              \
+      SymI_HasProto(suspendThread)                                      \
+      SymI_HasProto(stg_takeMVarzh)                                     \
+      SymI_HasProto(stg_threadStatuszh)                                 \
+      SymI_HasProto(stg_tryPutMVarzh)                                   \
+      SymI_HasProto(stg_tryTakeMVarzh)                                  \
+      SymI_HasProto(stg_unmaskAsyncExceptionszh)                        \
+      SymI_HasProto(unloadObj)                                          \
+      SymI_HasProto(stg_unsafeThawArrayzh)                              \
+      SymI_HasProto(stg_waitReadzh)                                     \
+      SymI_HasProto(stg_waitWritezh)                                    \
+      SymI_HasProto(stg_writeTVarzh)                                    \
+      SymI_HasProto(stg_yieldzh)                                        \
+      SymI_NeedsProto(stg_interp_constr_entry)                          \
+      SymI_HasProto(stg_arg_bitmaps)                                    \
+      SymI_HasProto(large_alloc_lim)                                    \
+      SymI_HasProto(g0)                                                 \
+      SymI_HasProto(allocate)                                           \
+      SymI_HasProto(allocateExec)                                       \
+      SymI_HasProto(freeExec)                                           \
+      SymI_HasProto(getAllocations)                                     \
+      SymI_HasProto(revertCAFs)                                         \
+      SymI_HasProto(RtsFlags)                                           \
+      SymI_NeedsProto(rts_breakpoint_io_action)                         \
+      SymI_NeedsProto(rts_stop_next_breakpoint)                         \
+      SymI_NeedsProto(rts_stop_on_exception)                            \
+      SymI_HasProto(stopTimer)                                          \
+      SymI_HasProto(n_capabilities)                                     \
+      SymI_HasProto(stg_traceCcszh)                                     \
+      SymI_HasProto(stg_traceEventzh)                                   \
+      SymI_HasProto(getMonotonicNSec)                                   \
+      SymI_HasProto(lockFile)                                           \
+      SymI_HasProto(unlockFile)                                         \
+      SymI_HasProto(startProfTimer)                                     \
+      SymI_HasProto(stopProfTimer)                                      \
+      RTS_USER_SIGNALS_SYMBOLS                                          \
       RTS_INTCHAR_SYMBOLS
 
 
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 9cedabdca8..1a531b2149 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1,6 +1,6 @@
 /* -----------------------------------------------------------------------------
  *
- * (c) The GHC Team, 1998-2011
+ * (c) The GHC Team, 1998-2012
  *
  * Out-of-line primitive operations
  *
@@ -10,14 +10,9 @@
  * this file contains code for most of those with the attribute
  * out_of_line=True.
  *
- * Entry convention: the entry convention for a primop is that all the
- * args are in Stg registers (R1, R2, etc.).  This is to make writing
- * the primops easier.  (see compiler/codeGen/CgCallConv.hs).
- *
- * Return convention: results from a primop are generally returned
- * using the ordinary unboxed tuple return convention.  The C-- parser
- * implements the RET_xxxx() macros to perform unboxed-tuple returns
- * based on the prevailing return convention.
+ * Entry convention: the entry convention for a primop is the
+ * NativeNodeCall convention, and the return convention is
+ * NativeReturn.  (see compiler/cmm/CmmCallConv.hs)
  *
  * This file is written in a subset of C--, extended with various
  * features specific to GHC.  It is compiled by GHC directly.  For the
@@ -44,8 +39,6 @@ import sm_mutex;
 
   Basically just new*Array - the others are all inline macros.
 
-  The size arg is always passed in R1, and the result returned in R1.
-
   The slow entry point is for returning from a heap check, the saved
   size argument must be re-loaded from the stack.
   -------------------------------------------------------------------------- */
@@ -54,29 +47,32 @@ import sm_mutex;
  * round up to the nearest word for the size of the array.
  */
 
-stg_newByteArrayzh
+stg_newByteArrayzh ( W_ n )
 {
-    W_ words, payload_words, n, p;
-    MAYBE_GC(NO_PTRS,stg_newByteArrayzh);
-    n = R1;
+    W_ words, payload_words;
+    gcptr p;
+
+    MAYBE_GC_N(stg_newByteArrayzh, n);
+
     payload_words = ROUNDUP_BYTES_TO_WDS(n);
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-    ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
+    ("ptr" p) = ccall allocate(MyCapability() "ptr",words);
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
-    RET_P(p);
+    return (p);
 }
 
 #define BA_ALIGN 16
 #define BA_MASK  (BA_ALIGN-1)
 
-stg_newPinnedByteArrayzh
+stg_newPinnedByteArrayzh ( W_ n )
 {
-    W_ words, n, bytes, payload_words, p;
+    W_ words, bytes, payload_words;
+    gcptr p;
+
+    MAYBE_GC_N(stg_newPinnedByteArrayzh, n);
 
-    MAYBE_GC(NO_PTRS,stg_newPinnedByteArrayzh);
-    n = R1;
     bytes = n;
     /* payload_words is what we will tell the profiler we had to allocate */
     payload_words = ROUNDUP_BYTES_TO_WDS(bytes);
@@ -89,7 +85,7 @@ stg_newPinnedByteArrayzh
     /* Now we convert to a number of words: */
     words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
+    ("ptr" p) = ccall allocatePinned(MyCapability() "ptr", words);
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     /* Now we need to move p forward so that the payload is aligned
@@ -98,16 +94,15 @@ stg_newPinnedByteArrayzh
 
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
-    RET_P(p);
+    return (p);
 }
 
-stg_newAlignedPinnedByteArrayzh
+stg_newAlignedPinnedByteArrayzh ( W_ n, W_ alignment )
 {
-    W_ words, n, bytes, payload_words, p, alignment;
+    W_ words, bytes, payload_words;
+    gcptr p;
 
-    MAYBE_GC(NO_PTRS,stg_newAlignedPinnedByteArrayzh);
-    n = R1;
-    alignment = R2;
+    again: MAYBE_GC(again);
 
     /* we always supply at least word-aligned memory, so there's no
        need to allow extra space for alignment if the requirement is less
@@ -128,7 +123,7 @@ stg_newAlignedPinnedByteArrayzh
     /* Now we convert to a number of words: */
     words = ROUNDUP_BYTES_TO_WDS(bytes);
 
-    ("ptr" p) = foreign "C" allocatePinned(MyCapability() "ptr", words) [];
+    ("ptr" p) = ccall allocatePinned(MyCapability() "ptr", words);
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
 
     /* Now we need to move p forward so that the payload is aligned
@@ -138,23 +133,22 @@ stg_newAlignedPinnedByteArrayzh
 
     SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
-    RET_P(p);
+    return (p);
 }
 
-stg_newArrayzh
+stg_newArrayzh ( W_ n /* words */, gcptr init )
 {
-    W_ words, n, init, arr, p, size;
-    /* Args: R1 = words, R2 = initialisation value */
+    W_ words, size;
+    gcptr p, arr;
 
-    n = R1;
-    MAYBE_GC(R2_PTR,stg_newArrayzh);
+    again: MAYBE_GC(again);
 
     // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
     // in the array, making sure we round up, and then rounding up to a whole
     // number of words.
     size = n + mutArrPtrsCardWords(n);
     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
-    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
+    ("ptr" arr) = ccall allocate(MyCapability() "ptr",words);
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
@@ -162,7 +156,6 @@ stg_newArrayzh
     StgMutArrPtrs_size(arr) = size;
 
     // Initialise all elements of the the array with the value in R2
-    init = R2;
     p = arr + SIZEOF_StgMutArrPtrs;
   for:
     if (p < arr + WDS(words)) {
@@ -178,10 +171,10 @@ stg_newArrayzh
 	goto for2;
     }
 
-    RET_P(arr);
+    return (arr);
 }
 
-stg_unsafeThawArrayzh
+stg_unsafeThawArrayzh ( gcptr arr )
 {
   // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
   //
@@ -201,31 +194,30 @@ stg_unsafeThawArrayzh
   // we put it on the mutable list more than once, but it would get scavenged
   // multiple times during GC, which would be unnecessarily slow.
   //
-  if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
-	SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
-	recordMutable(R1, R1);
+  if (StgHeader_info(arr) != stg_MUT_ARR_PTRS_FROZEN0_info) {
+        SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
+        recordMutable(arr);
 	// must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
-	RET_P(R1);
+	return (arr);
   } else {
-	SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
-	RET_P(R1);
+	SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
+	return (arr);
   }
 }
 
-stg_newArrayArrayzh
+stg_newArrayArrayzh ( W_ n /* words */ )
 {
-    W_ words, n, arr, p, size;
-    /* Args: R1 = words */
+    W_ words, size;
+    gcptr p, arr;
 
-    n = R1;
-    MAYBE_GC(NO_PTRS,stg_newArrayArrayzh);
+    MAYBE_GC_N(stg_newArrayArrayzh, n);
 
     // the mark area contains one byte for each 2^MUT_ARR_PTRS_CARD_BITS words
     // in the array, making sure we round up, and then rounding up to a whole
     // number of words.
     size = n + mutArrPtrsCardWords(n);
     words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size;
-    ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [];
+    ("ptr" arr) = ccall allocate(MyCapability() "ptr",words);
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
     SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
@@ -248,7 +240,7 @@ stg_newArrayArrayzh
 	goto for2;
     }
 
-    RET_P(arr);
+    return (arr);
 }
 
 
@@ -256,46 +248,39 @@ stg_newArrayArrayzh
    MutVar primitives
    -------------------------------------------------------------------------- */
 
-stg_newMutVarzh
+stg_newMutVarzh ( gcptr init )
 {
     W_ mv;
-    /* Args: R1 = initialisation value */
 
-    ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh);
+    ALLOC_PRIM (SIZEOF_StgMutVar);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
     SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
-    StgMutVar_var(mv) = R1;
+    StgMutVar_var(mv) = init;
     
-    RET_P(mv);
+    return (mv);
 }
 
-stg_casMutVarzh
+stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new )
  /* MutVar# s a -> a -> a -> State# s -> (# State#, Int#, a #) */
 {
-    W_ mv, old, new, h;
-
-    mv  = R1;
-    old = R2;
-    new = R3;
+    gcptr h;
 
-    (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var,
-                          old, new) [];
+    (h) = ccall cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var,
+                          old, new);
     if (h != old) {
-        RET_NP(1,h);
+        return (1,h);
     } else {
         if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
-           foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
+           ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
         }
-        RET_NP(0,h);
+        return (0,h);
     }
 }
 
-
-stg_atomicModifyMutVarzh
+stg_atomicModifyMutVarzh ( gcptr mv, gcptr f )
 {
-    W_ mv, f, z, x, y, r, h;
-    /* Args: R1 :: MutVar#,  R2 :: a -> (a,b) */
+    W_ z, x, y, r, h;
 
     /* If x is the current contents of the MutVar#, then 
        We want to make the new contents point to
@@ -331,10 +316,7 @@ stg_atomicModifyMutVarzh
 
 #define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
 
-   HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, stg_atomicModifyMutVarzh);
-
-   mv = R1;
-   f = R2;
+   HP_CHK_GEN_TICKY(SIZE);
 
    TICK_ALLOC_THUNK_2();
    CCCS_ALLOC(THUNK_2_SIZE);
@@ -361,17 +343,17 @@ stg_atomicModifyMutVarzh
    x = StgMutVar_var(mv);
    StgThunk_payload(z,1) = x;
 #ifdef THREADED_RTS
-   (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) [];
+   (h) = ccall cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y);
    if (h != x) { goto retry; }
 #else
    StgMutVar_var(mv) = y;
 #endif
 
    if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
-     foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") [];
+     ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
    }
 
-   RET_P(r);
+   return (r);
 }
 
 /* -----------------------------------------------------------------------------
@@ -380,15 +362,13 @@ stg_atomicModifyMutVarzh
 
 STRING(stg_weak_msg,"New weak pointer at %p\n")
 
-stg_mkWeakzh
+stg_mkWeakzh ( gcptr key,
+               gcptr value,
+               gcptr finalizer /* or stg_NO_FINALIZER_closure */ )
 {
-  /* R1 = key
-     R2 = value
-     R3 = finalizer (or stg_NO_FINALIZER_closure)
-  */
-  W_ w;
+  gcptr w;
 
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh );
+  ALLOC_PRIM (SIZEOF_StgWeak)
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, CCCS);
@@ -397,9 +377,9 @@ stg_mkWeakzh
   // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
   // something else?
 
-  StgWeak_key(w)        = R1;
-  StgWeak_value(w)      = R2;
-  StgWeak_finalizer(w)  = R3;
+  StgWeak_key(w)        = key;
+  StgWeak_value(w)      = value;
+  StgWeak_finalizer(w)  = finalizer;
   StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure;
 
   ACQUIRE_LOCK(sm_mutex);
@@ -407,49 +387,34 @@ stg_mkWeakzh
   W_[weak_ptr_list] 	= w;
   RELEASE_LOCK(sm_mutex);
 
-  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+  IF_DEBUG(weak, ccall debugBelch(stg_weak_msg,w));
 
-  RET_P(w);
+  return (w);
 }
 
-stg_mkWeakNoFinalizzerzh
+stg_mkWeakNoFinalizzerzh ( gcptr key, gcptr value )
 {
-  /* R1 = key
-     R2 = value
-   */
-  R3 = stg_NO_FINALIZER_closure;
-
-  jump stg_mkWeakzh;
+  jump stg_mkWeakzh (key, value, stg_NO_FINALIZER_closure);
 }
 
-stg_mkWeakForeignEnvzh
+stg_mkWeakForeignEnvzh ( gcptr key,
+                         gcptr val,
+                         W_ fptr,   // finalizer
+                         W_ ptr,
+                         W_ flag,   // has environment (0 or 1)
+                         W_ eptr )
 {
-  /* R1 = key
-     R2 = value
-     R3 = finalizer
-     R4 = pointer
-     R5 = has environment (0 or 1)
-     R6 = environment
-  */
-  W_ w, payload_words, words, p;
-
-  W_ key, val, fptr, ptr, flag, eptr;
-
-  key  = R1;
-  val  = R2;
-  fptr = R3;
-  ptr  = R4;
-  flag = R5;
-  eptr = R6;
-
-  ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh );
+  W_ payload_words, words;
+  gcptr w, p;
+
+  ALLOC_PRIM (SIZEOF_StgWeak);
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
   SET_HDR(w, stg_WEAK_info, CCCS);
 
   payload_words = 4;
   words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
-  ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];
+  ("ptr" p)     = ccall allocate(MyCapability() "ptr", words);
 
   TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
   SET_HDR(p, stg_ARR_WORDS_info, CCCS);
@@ -473,22 +438,18 @@ stg_mkWeakForeignEnvzh
   W_[weak_ptr_list] = w;
   RELEASE_LOCK(sm_mutex);
 
-  IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+  IF_DEBUG(weak, ccall debugBelch(stg_weak_msg,w));
 
-  RET_P(w);
+  return (w);
 }
 
-stg_finalizzeWeakzh
+stg_finalizzeWeakzh ( gcptr w )
 {
-  /* R1 = weak ptr
-   */
-  W_ w, f, arr;
-
-  w = R1;
+  gcptr f, arr;
 
   // already dead?
   if (GET_INFO(w) == stg_DEAD_WEAK_info) {
-      RET_NP(0,stg_NO_FINALIZER_closure);
+      return (0,stg_NO_FINALIZER_closure);
   }
 
   // kill it
@@ -516,26 +477,25 @@ stg_finalizzeWeakzh
   StgDeadWeak_link(w) = StgWeak_link(w);
 
   if (arr != stg_NO_FINALIZER_closure) {
-    foreign "C" runCFinalizer(StgArrWords_payload(arr,0),
+    ccall runCFinalizer(StgArrWords_payload(arr,0),
                               StgArrWords_payload(arr,1),
                               StgArrWords_payload(arr,2),
-                              StgArrWords_payload(arr,3)) [];
+                              StgArrWords_payload(arr,3));
   }
 
   /* return the finalizer */
   if (f == stg_NO_FINALIZER_closure) {
-      RET_NP(0,stg_NO_FINALIZER_closure);
+      return (0,stg_NO_FINALIZER_closure);
   } else {
-      RET_NP(1,f);
+      return (1,f);
   }
 }
 
-stg_deRefWeakzh
+stg_deRefWeakzh ( gcptr w )
 {
-  /* R1 = weak ptr */
-  W_ w, code, val;
+  W_ code;
+  gcptr val;
 
-  w = R1;
   if (GET_INFO(w) == stg_WEAK_info) {
     code = 1;
     val = StgWeak_value(w);
@@ -543,171 +503,144 @@ stg_deRefWeakzh
     code = 0;
     val = w;
   }
-  RET_NP(code,val);
+  return (code,val);
 }
 
 /* -----------------------------------------------------------------------------
    Floating point operations.
    -------------------------------------------------------------------------- */
 
-stg_decodeFloatzuIntzh
+stg_decodeFloatzuIntzh ( F_ arg )
 { 
     W_ p;
-    F_ arg;
     W_ mp_tmp1;
     W_ mp_tmp_w;
 
-    STK_CHK_GEN( WDS(2), NO_PTRS, stg_decodeFloatzuIntzh );
+    STK_CHK_GEN_N (WDS(2));
 
     mp_tmp1  = Sp - WDS(1);
     mp_tmp_w = Sp - WDS(2);
     
-    /* arguments: F1 = Float# */
-    arg = F1;
-    
     /* Perform the operation */
-    foreign "C" __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg) [];
+    ccall __decodeFloat_Int(mp_tmp1 "ptr", mp_tmp_w "ptr", arg);
     
     /* returns: (Int# (mantissa), Int# (exponent)) */
-    RET_NN(W_[mp_tmp1], W_[mp_tmp_w]);
+    return (W_[mp_tmp1], W_[mp_tmp_w]);
 }
 
-stg_decodeDoublezu2Intzh
+stg_decodeDoublezu2Intzh ( D_ arg )
 { 
-    D_ arg;
     W_ p;
     W_ mp_tmp1;
     W_ mp_tmp2;
     W_ mp_result1;
     W_ mp_result2;
 
-    STK_CHK_GEN( WDS(4), NO_PTRS, stg_decodeDoublezu2Intzh );
+    STK_CHK_GEN_N (WDS(4));
 
     mp_tmp1    = Sp - WDS(1);
     mp_tmp2    = Sp - WDS(2);
     mp_result1 = Sp - WDS(3);
     mp_result2 = Sp - WDS(4);
 
-    /* arguments: D1 = Double# */
-    arg = D1;
-
     /* Perform the operation */
-    foreign "C" __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
+    ccall __decodeDouble_2Int(mp_tmp1 "ptr", mp_tmp2 "ptr",
                                     mp_result1 "ptr", mp_result2 "ptr",
-                                    arg) [];
+                                    arg);
 
     /* returns:
        (Int# (mant sign), Word# (mant high), Word# (mant low), Int# (expn)) */
-    RET_NNNN(W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
+    return (W_[mp_tmp1], W_[mp_tmp2], W_[mp_result1], W_[mp_result2]);
 }
 
 /* -----------------------------------------------------------------------------
  * Concurrency primitives
  * -------------------------------------------------------------------------- */
 
-stg_forkzh
+stg_forkzh ( gcptr closure )
 {
-  /* args: R1 = closure to spark */
-
-  MAYBE_GC(R1_PTR, stg_forkzh);
+  MAYBE_GC_P(stg_forkzh, closure);
 
-  W_ closure;
-  W_ threadid;
-  closure = R1;
+  gcptr threadid;
 
-  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
+  ("ptr" threadid) = ccall createIOThread( MyCapability() "ptr", 
 				RtsFlags_GcFlags_initialStkSize(RtsFlags), 
-				closure "ptr") [];
+                                closure "ptr");
 
   /* start blocked if the current thread is blocked */
   StgTSO_flags(threadid) = %lobits16(
      TO_W_(StgTSO_flags(threadid)) | 
      TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
-  foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
+  ccall scheduleThread(MyCapability() "ptr", threadid "ptr");
 
   // context switch soon, but not immediately: we don't want every
   // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
-  RET_P(threadid);
+  return (threadid);
 }
 
-stg_forkOnzh
+stg_forkOnzh ( W_ cpu, gcptr closure )
 {
-  /* args: R1 = cpu, R2 = closure to spark */
+again: MAYBE_GC(again);
 
-  MAYBE_GC(R2_PTR, stg_forkOnzh);
+  gcptr threadid;
 
-  W_ cpu;
-  W_ closure;
-  W_ threadid;
-  cpu = R1;
-  closure = R2;
-
-  ("ptr" threadid) = foreign "C" createIOThread( MyCapability() "ptr", 
+  ("ptr" threadid) = ccall createIOThread( MyCapability() "ptr", 
 				RtsFlags_GcFlags_initialStkSize(RtsFlags), 
-				closure "ptr") [];
+                                closure "ptr");
 
   /* start blocked if the current thread is blocked */
   StgTSO_flags(threadid) = %lobits16(
      TO_W_(StgTSO_flags(threadid)) | 
      TO_W_(StgTSO_flags(CurrentTSO)) & (TSO_BLOCKEX | TSO_INTERRUPTIBLE));
 
-  foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
+  ccall scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr");
 
   // context switch soon, but not immediately: we don't want every
   // forkIO to force a context-switch.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
   
-  RET_P(threadid);
+  return (threadid);
 }
 
-stg_yieldzh
+stg_yieldzh ()
 {
   // when we yield to the scheduler, we have to tell it to put the
   // current thread to the back of the queue by setting the
   // context_switch flag.  If we don't do this, it will run the same
   // thread again.
   Capability_context_switch(MyCapability()) = 1 :: CInt;
-  jump stg_yield_noregs;
+  jump stg_yield_noregs();
 }
 
-stg_myThreadIdzh
+stg_myThreadIdzh ()
 {
-  /* no args. */
-  RET_P(CurrentTSO);
+  return (CurrentTSO);
 }
 
-stg_labelThreadzh
+stg_labelThreadzh ( gcptr threadid, W_ addr )
 {
-  /* args: 
-	R1 = ThreadId#
-	R2 = Addr# */
 #if defined(DEBUG) || defined(TRACING) || defined(DTRACE)
-  foreign "C" labelThread(MyCapability() "ptr", R1 "ptr", R2 "ptr") [];
+  ccall labelThread(MyCapability() "ptr", threadid "ptr", addr "ptr");
 #endif
-  jump %ENTRY_CODE(Sp(0));
+  return ();
 }
 
-stg_isCurrentThreadBoundzh
+stg_isCurrentThreadBoundzh (/* no args */)
 {
-  /* no args */
   W_ r;
-  (r) = foreign "C" isThreadBound(CurrentTSO) [];
-  RET_N(r);
+  (r) = ccall isThreadBound(CurrentTSO);
+  return (r);
 }
 
-stg_threadStatuszh
+stg_threadStatuszh ( gcptr tso )
 {
-    /* args: R1 :: ThreadId# */
-    W_ tso;
     W_ why_blocked;
     W_ what_next;
     W_ ret, cap, locked;
 
-    tso = R1;
-
     what_next   = TO_W_(StgTSO_what_next(tso));
     why_blocked = TO_W_(StgTSO_why_blocked(tso));
     // Note: these two reads are not atomic, so they might end up
@@ -733,214 +666,250 @@ stg_threadStatuszh
         locked = 0;
     }
 
-    RET_NNN(ret,cap,locked);
+    return (ret,cap,locked);
 }
 
 /* -----------------------------------------------------------------------------
  * TVar primitives
  * -------------------------------------------------------------------------- */
 
-#define SP_OFF 0
+// Catch retry frame -----------------------------------------------------------
+
+#define CATCH_RETRY_FRAME_FIELDS(w_,p_,info_ptr,        \
+                                 running_alt_code,      \
+                                 first_code,            \
+                                 alt_code)              \
+  w_ info_ptr,                                          \
+  PROF_HDR_FIELDS(w_)                                   \
+  w_ running_alt_code,                                  \
+  p_ first_code,                                        \
+  p_ alt_code
 
-// Catch retry frame ------------------------------------------------------------
 
 INFO_TABLE_RET(stg_catch_retry_frame, CATCH_RETRY_FRAME,
-#if defined(PROFILING)
-  W_ unused1, W_ unused2,
-#endif
-  W_ unused3, P_ unused4, P_ unused5)
+               CATCH_RETRY_FRAME_FIELDS(W_,P_,
+                                        info_ptr,
+                                        running_alt_code,
+                                        first_code,
+                                        alt_code))
+    return (P_ ret)
 {
-   W_ r, frame, trec, outer;
-
-   frame = Sp;
-   trec = StgTSO_trec(CurrentTSO);
-   outer  = StgTRecHeader_enclosing_trec(trec);
-   (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
-   if (r != 0) {
-     /* Succeeded (either first branch or second branch) */
-     StgTSO_trec(CurrentTSO) = outer;
-     Sp = Sp + SIZEOF_StgCatchRetryFrame;
-     jump %ENTRY_CODE(Sp(SP_OFF));
-   } else {
-     /* Did not commit: re-execute */
-     W_ new_trec;
-     ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
-     StgTSO_trec(CurrentTSO) = new_trec;
-     if (StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
-       R1 = StgCatchRetryFrame_alt_code(frame);
-     } else {
-       R1 = StgCatchRetryFrame_first_code(frame);
-     }
-     jump stg_ap_v_fast;
-   }
-}
+    W_ r;
+    gcptr trec, outer, arg;
 
+    trec = StgTSO_trec(CurrentTSO);
+    outer  = StgTRecHeader_enclosing_trec(trec);
+    (r) = ccall stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr");
+    if (r != 0) {
+        // Succeeded (either first branch or second branch)
+        StgTSO_trec(CurrentTSO) = outer;
+        return (ret);
+    } else {
+        // Did not commit: re-execute
+        P_ new_trec;
+        ("ptr" new_trec) = ccall stmStartTransaction(MyCapability() "ptr",
+                                                           outer "ptr");
+        StgTSO_trec(CurrentTSO) = new_trec;
+        if (running_alt_code != 0) {
+            jump stg_ap_v_fast
+                (CATCH_RETRY_FRAME_FIELDS(,,info_ptr,
+                                          running_alt_code,
+                                          first_code,
+                                          alt_code))
+                (alt_code);
+        } else {
+            jump stg_ap_v_fast
+                (CATCH_RETRY_FRAME_FIELDS(,,info_ptr,
+                                          running_alt_code,
+                                          first_code,
+                                          alt_code))
+                (first_code);
+        }
+    }
+}
 
 // Atomically frame ------------------------------------------------------------
 
+// This must match StgAtomicallyFrame in Closures.h
+#define ATOMICALLY_FRAME_FIELDS(w_,p_,info_ptr,code,next,result)        \
+  w_ info_ptr,                                                          \
+  PROF_HDR_FIELDS(w_)                                                   \
+  p_ code,                                                              \
+  p_ next,                                                              \
+  p_ result
+
+
 INFO_TABLE_RET(stg_atomically_frame, ATOMICALLY_FRAME,
-#if defined(PROFILING)
-  W_ unused1, W_ unused2,
-#endif
-  P_ code, P_ next_invariant_to_check, P_ result)
+               // layout of the frame, and bind the field names
+               ATOMICALLY_FRAME_FIELDS(W_,P_,
+                                       info_ptr,
+                                       code,
+                                       next_invariant,
+                                       frame_result))
+    return (P_ result) // value returned to the frame
 {
-  W_ frame, trec, valid, next_invariant, q, outer;
+  W_ valid;
+  gcptr trec, outer, next_invariant, q;
 
-  frame  = Sp;
   trec   = StgTSO_trec(CurrentTSO);
-  result = R1;
   outer  = StgTRecHeader_enclosing_trec(trec);
 
   if (outer == NO_TREC) {
     /* First time back at the atomically frame -- pick up invariants */
-    ("ptr" q) = foreign "C" stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr") [];
-    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
-    StgAtomicallyFrame_result(frame) = result;
+    ("ptr" next_invariant) =
+        ccall stmGetInvariantsToCheck(MyCapability() "ptr", trec "ptr");
+    frame_result = result;
 
   } else {
     /* Second/subsequent time back at the atomically frame -- abort the
      * tx that's checking the invariant and move on to the next one */
     StgTSO_trec(CurrentTSO) = outer;
-    q = StgAtomicallyFrame_next_invariant_to_check(frame);
-    StgInvariantCheckQueue_my_execution(q) = trec;
-    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+    StgInvariantCheckQueue_my_execution(next_invariant) = trec;
+    ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
     /* Don't free trec -- it's linked from q and will be stashed in the
      * invariant if we eventually commit. */
-    q = StgInvariantCheckQueue_next_queue_entry(q);
-    StgAtomicallyFrame_next_invariant_to_check(frame) = q;
+    next_invariant =
+       StgInvariantCheckQueue_next_queue_entry(next_invariant);
     trec = outer;
   }
 
-  q = StgAtomicallyFrame_next_invariant_to_check(frame);
-
-  if (q != END_INVARIANT_CHECK_QUEUE) {
+  if (next_invariant != END_INVARIANT_CHECK_QUEUE) {
     /* We can't commit yet: another invariant to check */
-    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [];
+    ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", trec "ptr");
     StgTSO_trec(CurrentTSO) = trec;
-
-    next_invariant = StgInvariantCheckQueue_invariant(q);
-    R1 = StgAtomicInvariant_code(next_invariant);
-    jump stg_ap_v_fast;
+    q = StgInvariantCheckQueue_invariant(next_invariant);
+    jump stg_ap_v_fast
+        (ATOMICALLY_FRAME_FIELDS(,,info_ptr,code,next_invariant,frame_result))
+        (StgAtomicInvariant_code(q));
 
   } else {
 
     /* We've got no more invariants to check, try to commit */
-    (valid) = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
+    (valid) = ccall stmCommitTransaction(MyCapability() "ptr", trec "ptr");
     if (valid != 0) {
       /* Transaction was valid: commit succeeded */
       StgTSO_trec(CurrentTSO) = NO_TREC;
-      R1 = StgAtomicallyFrame_result(frame);
-      Sp = Sp + SIZEOF_StgAtomicallyFrame;
-      jump %ENTRY_CODE(Sp(SP_OFF));
+      return (frame_result);
     } else {
       /* Transaction was not valid: try again */
-      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+      ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr");
       StgTSO_trec(CurrentTSO) = trec;
-      StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
-      R1 = StgAtomicallyFrame_code(frame);
-      jump stg_ap_v_fast;
+      next_invariant = END_INVARIANT_CHECK_QUEUE;
+
+      jump stg_ap_v_fast
+          // push the StgAtomicallyFrame again: the code generator is
+          // clever enough to only assign the fields that have changed.
+          (ATOMICALLY_FRAME_FIELDS(,,info_ptr,code,next_invariant,frame_result))
+          (code);
     }
   }
 }
 
+
 INFO_TABLE_RET(stg_atomically_waiting_frame, ATOMICALLY_FRAME,
-#if defined(PROFILING)
-  W_ unused1, W_ unused2,
-#endif
-  P_ code, P_ next_invariant_to_check, P_ result)
+               // layout of the frame, and bind the field names
+               ATOMICALLY_FRAME_FIELDS(W_,P_,
+                                       info_ptr,
+                                       code,
+                                       next_invariant,
+                                       frame_result))
+    return (/* no return values */)
 {
-  W_ frame, trec, valid;
-
-  frame = Sp;
+  W_ trec, valid;
 
   /* The TSO is currently waiting: should we stop waiting? */
-  (valid) = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
+  (valid) = ccall stmReWait(MyCapability() "ptr", CurrentTSO "ptr");
   if (valid != 0) {
-    /* Previous attempt is still valid: no point trying again yet */
-    jump stg_block_noregs;
+      /* Previous attempt is still valid: no point trying again yet */
+      jump stg_block_noregs
+          (ATOMICALLY_FRAME_FIELDS(,,info_ptr,
+                                   code,next_invariant,frame_result))
+          ();
   } else {
     /* Previous attempt is no longer valid: try again */
-    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+    ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr");
     StgTSO_trec(CurrentTSO) = trec;
-    StgHeader_info(frame) = stg_atomically_frame_info;
-    R1 = StgAtomicallyFrame_code(frame);
-    jump stg_ap_v_fast;
+
+    // change the frame header to stg_atomically_frame_info
+    jump stg_ap_v_fast
+        (ATOMICALLY_FRAME_FIELDS(,,stg_atomically_frame_info,
+                                 code,next_invariant,frame_result))
+        (code);
   }
 }
 
-// STM catch frame --------------------------------------------------------------
-
-#define SP_OFF 0
+// STM catch frame -------------------------------------------------------------
 
 /* Catch frames are very similar to update frames, but when entering
  * one we just pop the frame off the stack and perform the correct
  * kind of return to the activation record underneath us on the stack.
  */
 
+#define CATCH_STM_FRAME_FIELDS(w_,p_,info_ptr,code,handler)     \
+  w_ info_ptr,                                                  \
+  PROF_HDR_FIELDS(w_)                                           \
+  p_ code,                                                      \
+  p_ handler
+
 INFO_TABLE_RET(stg_catch_stm_frame, CATCH_STM_FRAME,
-#if defined(PROFILING)
-  W_ unused1, W_ unused2,
-#endif
-  P_ unused3, P_ unused4)
-   {
-      W_ r, frame, trec, outer;
-      frame = Sp;
-      trec = StgTSO_trec(CurrentTSO);
-      outer  = StgTRecHeader_enclosing_trec(trec);
-      (r) = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
-      if (r != 0) {
+               // layout of the frame, and bind the field names
+               CATCH_STM_FRAME_FIELDS(W_,P_,info_ptr,code,handler))
+    return (P_ ret)
+{
+    W_ r, trec, outer;
+
+    trec = StgTSO_trec(CurrentTSO);
+    outer  = StgTRecHeader_enclosing_trec(trec);
+    (r) = ccall stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr");
+    if (r != 0) {
         /* Commit succeeded */
         StgTSO_trec(CurrentTSO) = outer;
-        Sp = Sp + SIZEOF_StgCatchSTMFrame;
-        jump %ENTRY_CODE(Sp(SP_OFF));
-      } else {
+        return (ret);
+    } else {
         /* Commit failed */
         W_ new_trec;
-        ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+        ("ptr" new_trec) = ccall stmStartTransaction(MyCapability() "ptr", outer "ptr");
         StgTSO_trec(CurrentTSO) = new_trec;
-        R1 = StgCatchSTMFrame_code(frame);
-        jump stg_ap_v_fast;
-      }
-   }
+
+        jump stg_ap_v_fast
+            (CATCH_STM_FRAME_FIELDS(,,info_ptr,code,handler))
+            (code);
+    }
+}
 
 
-// Primop definition ------------------------------------------------------------
+// Primop definition -----------------------------------------------------------
 
-stg_atomicallyzh
+stg_atomicallyzh (P_ stm)
 {
-  W_ frame;
   W_ old_trec;
   W_ new_trec;
-  
+  W_ code, next_invariant, frame_result;
+
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR, stg_atomicallyzh); 
+  MAYBE_GC_P(stg_atomicallyzh, stm);
 
-  /* Args: R1 = m :: STM a */
-  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, stg_atomicallyzh);
+  STK_CHK_GEN();
 
   old_trec = StgTSO_trec(CurrentTSO);
 
   /* Nested transactions are not allowed; raise an exception */
   if (old_trec != NO_TREC) {
-     R1 = base_ControlziExceptionziBase_nestedAtomically_closure;
-     jump stg_raisezh;
+     jump stg_raisezh(base_ControlziExceptionziBase_nestedAtomically_closure);
   }
 
-  /* Set up the atomically frame */
-  Sp = Sp - SIZEOF_StgAtomicallyFrame;
-  frame = Sp;
-
-  SET_HDR(frame,stg_atomically_frame_info, CCCS);
-  StgAtomicallyFrame_code(frame) = R1;
-  StgAtomicallyFrame_result(frame) = NO_TREC;
-  StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
+  code = stm;
+  next_invariant = END_INVARIANT_CHECK_QUEUE;
+  frame_result = NO_TREC;
 
   /* Start the memory transcation */
-  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
+  ("ptr" new_trec) = ccall stmStartTransaction(MyCapability() "ptr", old_trec "ptr");
   StgTSO_trec(CurrentTSO) = new_trec;
 
-  /* Apply R1 to the realworld token */
-  jump stg_ap_v_fast;
+  jump stg_ap_v_fast
+      (ATOMICALLY_FRAME_FIELDS(,,stg_atomically_frame_info,
+                               code,next_invariant,frame_result))
+      (stm);
 }
 
 // A closure representing "atomically x".  This is used when a thread
@@ -948,73 +917,57 @@ stg_atomicallyzh
 // It is somewhat similar to the stg_raise closure.
 //
 INFO_TABLE(stg_atomically,1,0,THUNK_1_0,"atomically","atomically")
+    (P_ thunk)
 {
-  R1 = StgThunk_payload(R1,0);
-  jump stg_atomicallyzh;
+    jump stg_atomicallyzh(StgThunk_payload(thunk,0));
 }
 
 
-stg_catchSTMzh
+stg_catchSTMzh (P_ code    /* :: STM a */,
+                P_ handler /* :: Exception -> STM a */)
 {
-  W_ frame;
-  
-  /* Args: R1 :: STM a */
-  /* Args: R2 :: Exception -> STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, stg_catchSTMzh);
-
-  /* Set up the catch frame */
-  Sp = Sp - SIZEOF_StgCatchSTMFrame;
-  frame = Sp;
-
-  SET_HDR(frame, stg_catch_stm_frame_info, CCCS);
-  StgCatchSTMFrame_handler(frame) = R2;
-  StgCatchSTMFrame_code(frame) = R1;
-
-  /* Start a nested transaction to run the body of the try block in */
-  W_ cur_trec;  
-  W_ new_trec;
-  cur_trec = StgTSO_trec(CurrentTSO);
-  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", cur_trec "ptr");
-  StgTSO_trec(CurrentTSO) = new_trec;
-
-  /* Apply R1 to the realworld token */
-  jump stg_ap_v_fast;
+    STK_CHK_GEN();
+
+    /* Start a nested transaction to run the body of the try block in */
+    W_ cur_trec;
+    W_ new_trec;
+    cur_trec = StgTSO_trec(CurrentTSO);
+    ("ptr" new_trec) = ccall stmStartTransaction(MyCapability() "ptr",
+                                                 cur_trec "ptr");
+    StgTSO_trec(CurrentTSO) = new_trec;
+
+    jump stg_ap_v_fast
+        (CATCH_STM_FRAME_FIELDS(,,stg_catch_stm_frame_info, code, handler))
+        (code);
 }
 
 
-stg_catchRetryzh
+stg_catchRetryzh (P_ first_code, /* :: STM a */
+                  P_ alt_code    /* :: STM a */)
 {
-  W_ frame;
   W_ new_trec;
-  W_ trec;
 
   // stmStartTransaction may allocate
-  MAYBE_GC (R1_PTR & R2_PTR, stg_catchRetryzh); 
+  MAYBE_GC_PP (stg_catchRetryzh, first_code, alt_code);
 
-  /* Args: R1 :: STM a */
-  /* Args: R2 :: STM a */
-  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, stg_catchRetryzh);
+  STK_CHK_GEN();
 
   /* Start a nested transaction within which to run the first code */
-  trec = StgTSO_trec(CurrentTSO);
-  ("ptr" new_trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
+  ("ptr" new_trec) = ccall stmStartTransaction(MyCapability() "ptr",
+                                               StgTSO_trec(CurrentTSO) "ptr");
   StgTSO_trec(CurrentTSO) = new_trec;
 
-  /* Set up the catch-retry frame */
-  Sp = Sp - SIZEOF_StgCatchRetryFrame;
-  frame = Sp;
-  
-  SET_HDR(frame, stg_catch_retry_frame_info, CCCS);
-  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
-  StgCatchRetryFrame_first_code(frame) = R1;
-  StgCatchRetryFrame_alt_code(frame) = R2;
-
-  /* Apply R1 to the realworld token */
-  jump stg_ap_v_fast;
+  // push the CATCH_RETRY stack frame, and apply first_code to realWorld#
+  jump stg_ap_v_fast
+      (CATCH_RETRY_FRAME_FIELDS(,, stg_catch_retry_frame_info,
+                                0, /* not running_alt_code */
+                                first_code,
+                                alt_code))
+      (first_code);
 }
 
 
-stg_retryzh
+stg_retryzh /* no arg list: explicit stack layout */
 {
   W_ frame_type;
   W_ frame;
@@ -1022,12 +975,14 @@ stg_retryzh
   W_ outer;
   W_ r;
 
-  MAYBE_GC (NO_PTRS, stg_retryzh); // STM operations may allocate
+  // STM operations may allocate
+  MAYBE_GC_ (stg_retryzh); // NB. not MAYBE_GC(), we cannot make a
+                           // function call in an explicit-stack proc
 
   // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
 retry_pop_stack:
   SAVE_THREAD_STATE();
-  (frame_type) = foreign "C" findRetryFrameHelper(MyCapability(), CurrentTSO "ptr") [];
+  (frame_type) = ccall findRetryFrameHelper(MyCapability(), CurrentTSO "ptr");
   LOAD_THREAD_STATE();
   frame = Sp;
   trec = StgTSO_trec(CurrentTSO);
@@ -1037,15 +992,15 @@ retry_pop_stack:
     // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
     ASSERT(outer != NO_TREC);
     // Abort the transaction attempting the current branch
-    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
-    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
-    if (!StgCatchRetryFrame_running_alt_code(frame) != 0::I32) {
+    ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+    ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
+    if (!StgCatchRetryFrame_running_alt_code(frame) != 0) {
       // Retry in the first branch: try the alternative
-      ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+      ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", outer "ptr");
       StgTSO_trec(CurrentTSO) = trec;
       StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
       R1 = StgCatchRetryFrame_alt_code(frame);
-      jump stg_ap_v_fast;
+      jump stg_ap_v_fast [R1];
     } else {
       // Retry in the alternative code: propagate the retry
       StgTSO_trec(CurrentTSO) = outer;
@@ -1060,108 +1015,93 @@ retry_pop_stack:
     // We called retry while checking invariants, so abort the current
     // invariant check (merging its TVar accesses into the parents read
     // set so we'll wait on them)
-    foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
-    foreign "C" stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr") [];
+    ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+    ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
     trec = outer;
     StgTSO_trec(CurrentTSO) = trec;
     outer  = StgTRecHeader_enclosing_trec(trec);
   }
   ASSERT(outer == NO_TREC);
 
-  (r) = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
+  (r) = ccall stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr");
   if (r != 0) {
     // Transaction was valid: stmWait put us on the TVars' queues, we now block
     StgHeader_info(frame) = stg_atomically_waiting_frame_info;
     Sp = frame;
-    // Fix up the stack in the unregisterised case: the return convention is different.
     R3 = trec; // passing to stmWaitUnblock()
-    jump stg_block_stmwait;
+    jump stg_block_stmwait [R3];
   } else {
     // Transaction was not valid: retry immediately
-    ("ptr" trec) = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+    ("ptr" trec) = ccall stmStartTransaction(MyCapability() "ptr", outer "ptr");
     StgTSO_trec(CurrentTSO) = trec;
-    R1 = StgAtomicallyFrame_code(frame);
     Sp = frame;
-    jump stg_ap_v_fast;
+    R1 = StgAtomicallyFrame_code(frame);
+    jump stg_ap_v_fast [R1];
   }
 }
 
-
-stg_checkzh
+stg_checkzh (P_ closure /* STM a */)
 {
-  W_ trec, closure;
-
-  /* Args: R1 = invariant closure */
-  MAYBE_GC (R1_PTR, stg_checkzh); 
+    W_ trec;
 
-  trec = StgTSO_trec(CurrentTSO);
-  closure = R1;
-  foreign "C" stmAddInvariantToCheck(MyCapability() "ptr", 
-                                     trec "ptr",
-                                     closure "ptr") [];
+    MAYBE_GC_P (stg_checkzh, closure);
 
-  jump %ENTRY_CODE(Sp(0));
+    trec = StgTSO_trec(CurrentTSO);
+    ccall stmAddInvariantToCheck(MyCapability() "ptr",
+                                 trec "ptr",
+                                 closure "ptr");
+    return ();
 }
 
 
-stg_newTVarzh
+stg_newTVarzh (P_ init)
 {
-  W_ tv;
-  W_ new_value;
+    W_ tv;
 
-  /* Args: R1 = initialisation value */
-
-  MAYBE_GC (R1_PTR, stg_newTVarzh); 
-  new_value = R1;
-  ("ptr" tv) = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
-  RET_P(tv);
+    MAYBE_GC_P (stg_newTVarzh, init);
+    ("ptr" tv) = ccall stmNewTVar(MyCapability() "ptr", init "ptr");
+    return (tv);
 }
 
 
-stg_readTVarzh
+stg_readTVarzh (P_ tvar)
 {
   W_ trec;
-  W_ tvar;
   W_ result;
 
-  /* Args: R1 = TVar closure */
+  // Call to stmReadTVar may allocate
+  MAYBE_GC_P (stg_readTVarzh, tvar);
 
-  MAYBE_GC (R1_PTR, stg_readTVarzh); // Call to stmReadTVar may allocate
   trec = StgTSO_trec(CurrentTSO);
-  tvar = R1;
-  ("ptr" result) = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
-
-  RET_P(result);
+  ("ptr" result) = ccall stmReadTVar(MyCapability() "ptr", trec "ptr",
+                                     tvar "ptr");
+  return (result);
 }
 
-stg_readTVarIOzh
+stg_readTVarIOzh ( P_ tvar /* :: TVar a */ )
 {
     W_ result;
 
 again:
-    result = StgTVar_current_value(R1);
+    result = StgTVar_current_value(tvar);
     if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
         goto again;
     }
-    RET_P(result);
+    return (result);
 }
 
-stg_writeTVarzh
+stg_writeTVarzh (P_ tvar,     /* :: TVar a */
+                 P_ new_value /* :: a      */)
 {
-  W_ trec;
-  W_ tvar;
-  W_ new_value;
-  
-  /* Args: R1 = TVar closure */
-  /*       R2 = New value    */
+    W_ trec;
 
-  MAYBE_GC (R1_PTR & R2_PTR, stg_writeTVarzh); // Call to stmWriteTVar may allocate
-  trec = StgTSO_trec(CurrentTSO);
-  tvar = R1;
-  new_value = R2;
-  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];
+    // Call to stmWriteTVar may allocate
+    MAYBE_GC_PP (stg_writeTVarzh, tvar, new_value);
 
-  jump %ENTRY_CODE(Sp(0));
+    trec = StgTSO_trec(CurrentTSO);
+    ccall stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr",
+                       new_value "ptr");
+    return ();
 }
 
 
@@ -1197,23 +1137,20 @@ stg_writeTVarzh
  *
  * -------------------------------------------------------------------------- */
 
-stg_isEmptyMVarzh
+stg_isEmptyMVarzh ( P_ mvar /* :: MVar a */ )
 {
-    /* args: R1 = MVar closure */
-
-    if (StgMVar_value(R1) == stg_END_TSO_QUEUE_closure) {
-	RET_N(1);
+    if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
+	return (1);
     } else {
-	RET_N(0);
+	return (0);
     }
 }
 
-stg_newMVarzh
+stg_newMVarzh ()
 {
-    /* args: none */
     W_ mvar;
 
-    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh );
+    ALLOC_PRIM (SIZEOF_StgMVar);
   
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
     SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
@@ -1221,7 +1158,7 @@ stg_newMVarzh
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
-    RET_P(mvar);
+    return (mvar);
 }
 
 
@@ -1229,7 +1166,7 @@ stg_newMVarzh
     W_ sp;                                      \
     sp = StgStack_sp(stack);                    \
     W_[sp + WDS(1)] = value;                    \
-    W_[sp + WDS(0)] = stg_gc_unpt_r1_info;
+    W_[sp + WDS(0)] = stg_ret_p_info;
 
 #define PerformPut(stack,lval)                  \
     W_ sp;                                      \
@@ -1237,21 +1174,19 @@ stg_newMVarzh
     StgStack_sp(stack) = sp;                    \
     lval = W_[sp - WDS(1)];
 
-stg_takeMVarzh
-{
-    W_ mvar, val, info, tso, q;
 
-    /* args: R1 = MVar closure */
-    mvar = R1;
+stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
+{
+    W_ val, info, tso, q;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = ccall lockClosure(mvar "ptr");
 #else
     info = GET_INFO(mvar);
 #endif
         
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
     }
 
     /* If the MVar is empty, put ourselves on its blocking queue,
@@ -1259,16 +1194,13 @@ stg_takeMVarzh
      */
     if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
         
-        // Note [mvar-heap-check] We want to do the heap check in the
-        // branch here, to avoid the conditional in the common case.
-        // However, we've already locked the MVar above, so we better
-        // be careful to unlock it again if the the heap check fails.
-        // Unfortunately we don't have an easy way to inject any code
-        // into the heap check generated by the code generator, so we
-        // have to do it in stg_gc_gen (see HeapStackCheck.cmm).
-        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR, stg_takeMVarzh);
-        TICK_ALLOC_PRIM(SIZEOF_StgMVarTSOQueue, 0, 0);
-        CCCS_ALLOC(SIZEOF_StgMVarTSOQueue);
+        // We want to put the heap check down here in the slow path,
+        // but be careful to unlock the closure before returning to
+        // the RTS if the check fails.
+        ALLOC_PRIM_WITH_CUSTOM_FAILURE
+            (SIZEOF_StgMVarTSOQueue,
+             unlockClosure(mvar, stg_MVAR_DIRTY_info);
+             GC_PRIM_P(stg_takeMVarzh, mvar));
 
         q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
 
@@ -1280,16 +1212,15 @@ stg_takeMVarzh
 	    StgMVar_head(mvar) = q;
 	} else {
             StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
-            foreign "C" recordClosureMutated(MyCapability() "ptr",
-                                             StgMVar_tail(mvar)) [];
+            ccall recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar));
 	}
 	StgTSO__link(CurrentTSO)       = q;
 	StgTSO_block_info(CurrentTSO)  = mvar;
 	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
 	StgMVar_tail(mvar)             = q;
 	
-        R1 = mvar;
-	jump stg_block_takemvar;
+        jump stg_block_takemvar(mvar);
     }
     
     /* we got the value... */
@@ -1301,14 +1232,14 @@ loop:
         /* No further putMVars, MVar is now empty */
         StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
-        RET_P(val);
+        return (val);
     }
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
         q = StgInd_indirectee(q);
         goto loop;
     }
-    
+  
     // There are putMVar(s) waiting... wake up the first thread on the queue
     
     tso = StgMVarTSOQueue_tso(q);
@@ -1330,22 +1261,18 @@ loop:
     
     // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
 
-    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+    ccall tryWakeupThread(MyCapability() "ptr", tso);
     
     unlockClosure(mvar, stg_MVAR_DIRTY_info);
-    RET_P(val);
+    return (val);
 }
 
-
-stg_tryTakeMVarzh
+stg_tryTakeMVarzh ( P_ mvar /* :: MVar a */ )
 {
-    W_ mvar, val, info, tso, q;
-
-    /* args: R1 = MVar closure */
-    mvar = R1;
+    W_ val, info, tso, q;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = ccall lockClosure(mvar "ptr");
 #else
     info = GET_INFO(mvar);
 #endif
@@ -1360,11 +1287,11 @@ stg_tryTakeMVarzh
 	/* HACK: we need a pointer to pass back, 
 	 * so we abuse NO_FINALIZER_closure
 	 */
-	RET_NP(0, stg_NO_FINALIZER_closure);
+	return (0, stg_NO_FINALIZER_closure);
     }
     
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") [];
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
     }
 
     /* we got the value... */
@@ -1376,7 +1303,7 @@ loop:
         /* No further putMVars, MVar is now empty */
         StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
-        RET_NP(1, val);
+        return (1, val);
     }
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
@@ -1405,37 +1332,36 @@ loop:
     
     // no need to mark the TSO dirty, we have only written END_TSO_QUEUE.
 
-    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+    ccall tryWakeupThread(MyCapability() "ptr", tso);
     
     unlockClosure(mvar, stg_MVAR_DIRTY_info);
-    RET_NP(1,val);
+    return (1,val);
 }
 
-
-stg_putMVarzh
+stg_putMVarzh ( P_ mvar, /* :: MVar a */
+                P_ val,  /* :: a */ )
 {
-    W_ mvar, val, info, tso, q;
-
-    /* args: R1 = MVar, R2 = value */
-    mvar = R1;
-    val  = R2;
+    W_ info, tso, q;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = ccall lockClosure(mvar "ptr");
 #else
     info = GET_INFO(mvar);
 #endif
 
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
     }
 
     if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
 
-        // see Note [mvar-heap-check] above
-        HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR & R2_PTR, stg_putMVarzh);
-        TICK_ALLOC_PRIM(SIZEOF_StgMVarTSOQueue, 0, 0);
-        CCCS_ALLOC(SIZEOF_StgMVarTSOQueue);
+        // We want to put the heap check down here in the slow path,
+        // but be careful to unlock the closure before returning to
+        // the RTS if the check fails.
+        ALLOC_PRIM_WITH_CUSTOM_FAILURE
+            (SIZEOF_StgMVarTSOQueue,
+             unlockClosure(mvar, stg_MVAR_DIRTY_info);
+             GC_PRIM_P(stg_putMVarzh, mvar));
 
         q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
 
@@ -1447,17 +1373,15 @@ stg_putMVarzh
 	    StgMVar_head(mvar) = q;
 	} else {
             StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q;
-            foreign "C" recordClosureMutated(MyCapability() "ptr",
-                                             StgMVar_tail(mvar)) [];
+            ccall recordClosureMutated(MyCapability() "ptr",
+                                             StgMVar_tail(mvar));
 	}
 	StgTSO__link(CurrentTSO)       = q;
 	StgTSO_block_info(CurrentTSO)  = mvar;
 	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
 	StgMVar_tail(mvar)             = q;
 
-        R1 = mvar;
-        R2 = val;
-	jump stg_block_putmvar;
+        jump stg_block_putmvar(mvar,val);
     }
   
     q = StgMVar_head(mvar);
@@ -1466,7 +1390,7 @@ loop:
 	/* No further takes, the MVar is now full. */
 	StgMVar_value(mvar) = val;
     	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-	jump %ENTRY_CODE(Sp(0));
+        return ();
     }
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
@@ -1494,26 +1418,23 @@ loop:
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
     if (TO_W_(StgStack_dirty(stack)) == 0) {
-        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
+        ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
     
-    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+    ccall tryWakeupThread(MyCapability() "ptr", tso);
 
     unlockClosure(mvar, stg_MVAR_DIRTY_info);
-    jump %ENTRY_CODE(Sp(0));
+    return ();
 }
 
 
-stg_tryPutMVarzh
+stg_tryPutMVarzh ( P_ mvar, /* :: MVar a */
+                   P_ val,  /* :: a */ )
 {
-    W_ mvar, val, info, tso, q;
-
-    /* args: R1 = MVar, R2 = value */
-    mvar = R1;
-    val  = R2;
+    W_ info, tso, q;
 
 #if defined(THREADED_RTS)
-    ("ptr" info) = foreign "C" lockClosure(mvar "ptr") [];
+    ("ptr" info) = ccall lockClosure(mvar "ptr");
 #else
     info = GET_INFO(mvar);
 #endif
@@ -1522,11 +1443,11 @@ stg_tryPutMVarzh
 #if defined(THREADED_RTS)
     	unlockClosure(mvar, info);
 #endif
-	RET_N(0);
+	return (0);
     }
   
     if (info == stg_MVAR_CLEAN_info) {
-        foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
     }
 
     q = StgMVar_head(mvar);
@@ -1535,7 +1456,7 @@ loop:
 	/* No further takes, the MVar is now full. */
 	StgMVar_value(mvar) = val;
     	unlockClosure(mvar, stg_MVAR_DIRTY_info);
-        RET_N(1);
+        return (1);
     }
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
@@ -1563,13 +1484,13 @@ loop:
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
     
     if (TO_W_(StgStack_dirty(stack)) == 0) {
-        foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") [];
+        ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
     
-    foreign "C" tryWakeupThread(MyCapability() "ptr", tso) [];
+    ccall tryWakeupThread(MyCapability() "ptr", tso);
 
     unlockClosure(mvar, stg_MVAR_DIRTY_info);
-    RET_N(1);
+    return (1);
 }
 
 
@@ -1577,13 +1498,13 @@ loop:
    Stable pointer primitives
    -------------------------------------------------------------------------  */
 
-stg_makeStableNamezh
+stg_makeStableNamezh ( P_ obj )
 {
     W_ index, sn_obj;
 
-    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, stg_makeStableNamezh );
+    ALLOC_PRIM_P (SIZEOF_StgStableName, stg_makeStableNamezh, obj);
   
-    (index) = foreign "C" lookupStableName(R1 "ptr") [];
+    (index) = ccall lookupStableName(obj "ptr");
 
     /* Is there already a StableName for this heap object?
      *  stable_ptr_table is a pointer to an array of snEntry structs.
@@ -1597,56 +1518,48 @@ stg_makeStableNamezh
 	sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
     }
     
-    RET_P(sn_obj);
+    return (sn_obj);
 }
 
-
-stg_makeStablePtrzh
+stg_makeStablePtrzh ( P_ obj )
 {
-    /* Args: R1 = a */
     W_ sp;
-    MAYBE_GC(R1_PTR, stg_makeStablePtrzh);
-    ("ptr" sp) = foreign "C" getStablePtr(R1 "ptr") [];
-    RET_N(sp);
+
+    ("ptr" sp) = ccall getStablePtr(obj "ptr");
+    return (sp);
 }
 
-stg_deRefStablePtrzh
+stg_deRefStablePtrzh ( P_ sp )
 {
-    /* Args: R1 = the stable ptr */
-    W_ r, sp;
-    sp = R1;
+    W_ r;
     r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
-    RET_P(r);
+    return (r);
 }
 
 /* -----------------------------------------------------------------------------
    Bytecode object primitives
    -------------------------------------------------------------------------  */
 
-stg_newBCOzh
+stg_newBCOzh ( P_ instrs,
+               P_ literals,
+               P_ ptrs,
+               W_ arity,
+               P_ bitmap_arr )
 {
-    /* R1 = instrs
-       R2 = literals
-       R3 = ptrs
-       R4 = arity
-       R5 = bitmap array
-    */
-    W_ bco, bitmap_arr, bytes, words;
-    
-    bitmap_arr = R5;
+    W_ bco, bytes, words;
 
     words = BYTES_TO_WDS(SIZEOF_StgBCO) + BYTE_ARR_WDS(bitmap_arr);
     bytes = WDS(words);
 
-    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );
+    ALLOC_PRIM (bytes);
 
     bco = Hp - bytes + WDS(1);
     SET_HDR(bco, stg_BCO_info, CCCS);
     
-    StgBCO_instrs(bco)     = R1;
-    StgBCO_literals(bco)   = R2;
-    StgBCO_ptrs(bco)       = R3;
-    StgBCO_arity(bco)      = HALF_W_(R4);
+    StgBCO_instrs(bco)     = instrs;
+    StgBCO_literals(bco)   = literals;
+    StgBCO_ptrs(bco)       = ptrs;
+    StgBCO_arity(bco)      = HALF_W_(arity);
     StgBCO_size(bco)       = HALF_W_(words);
     
     // Copy the arity/bitmap info into the BCO
@@ -1659,23 +1572,20 @@ for:
 	goto for;
     }
     
-    RET_P(bco);
+    return (bco);
 }
 
-
-stg_mkApUpd0zh
+stg_mkApUpd0zh ( P_ bco )
 {
-    // R1 = the BCO# for the AP
-    //	
     W_ ap;
 
     // This function is *only* used to wrap zero-arity BCOs in an
     // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
     // saturated and always points directly to a FUN or BCO.
-    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
-	   StgBCO_arity(R1) == HALF_W_(0));
+    ASSERT(%INFO_TYPE(%GET_STD_INFO(bco)) == HALF_W_(BCO) &&
+           StgBCO_arity(bco) == HALF_W_(0));
 
-    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, stg_mkApUpd0zh);
+    HP_CHK_P(SIZEOF_StgAP, stg_mkApUpd0zh, bco);
     TICK_ALLOC_UP_THK(0, 0);
     CCCS_ALLOC(SIZEOF_StgAP);
 
@@ -1683,18 +1593,17 @@ stg_mkApUpd0zh
     SET_HDR(ap, stg_AP_info, CCCS);
     
     StgAP_n_args(ap) = HALF_W_(0);
-    StgAP_fun(ap) = R1;
+    StgAP_fun(ap) = bco;
     
-    RET_P(ap);
+    return (ap);
 }
 
-stg_unpackClosurezh
+stg_unpackClosurezh ( P_ closure )
 {
-/* args: R1 = closure to analyze */
 // TODO: Consider the absence of ptrs or nonptrs as a special case ?
 
     W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr;
-    info  = %GET_STD_INFO(UNTAG(R1));
+    info  = %GET_STD_INFO(UNTAG(closure));
 
     // Some closures have non-standard layout, so we omit those here.
     W_ type;
@@ -1723,10 +1632,10 @@ out:
     ptrs_arr_cards = mutArrPtrsCardWords(ptrs);
     ptrs_arr_sz  = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards);
 
-    ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh);
+    ALLOC_PRIM_P (ptrs_arr_sz + nptrs_arr_sz, stg_unpackClosurezh, closure);
 
     W_ clos;
-    clos = UNTAG(R1);
+    clos = UNTAG(closure);
 
     ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
     nptrs_arr = Hp - nptrs_arr_sz + WDS(1);
@@ -1755,7 +1664,7 @@ for2:
     	 p = p + 1;
 	 goto for2;
     }
-    RET_NPP(info, ptrs_arr, nptrs_arr);
+    return (info, ptrs_arr, nptrs_arr);
 }
 
 /* -----------------------------------------------------------------------------
@@ -1770,47 +1679,45 @@ for2:
     if (W_[blocked_queue_hd] == END_TSO_QUEUE) {	\
       W_[blocked_queue_hd] = tso;			\
     } else {						\
-      foreign "C" setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso) []; \
+      ccall setTSOLink(MyCapability() "ptr", W_[blocked_queue_tl] "ptr", tso); \
     }							\
     W_[blocked_queue_tl] = tso;
 
-stg_waitReadzh
+stg_waitReadzh ( W_ fd )
 {
-    /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitRead# on threaded RTS") never returns;
+    ccall barf("waitRead# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
-    StgTSO_block_info(CurrentTSO) = R1;
+    StgTSO_block_info(CurrentTSO) = fd;
     // No locking - we're not going to use this interface in the
     // threaded RTS anyway.
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_noregs;
+    jump stg_block_noregs();
 #endif
 }
 
-stg_waitWritezh
+stg_waitWritezh ( W_ fd )
 {
-    /* args: R1 */
 #ifdef THREADED_RTS
-    foreign "C" barf("waitWrite# on threaded RTS") never returns;
+    ccall barf("waitWrite# on threaded RTS") never returns;
 #else
 
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
-    StgTSO_block_info(CurrentTSO) = R1;
+    StgTSO_block_info(CurrentTSO) = fd;
     // No locking - we're not going to use this interface in the
     // threaded RTS anyway.
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_noregs;
+    jump stg_block_noregs();
 #endif
 }
 
 
 STRING(stg_delayzh_malloc_str, "stg_delayzh")
-stg_delayzh
+stg_delayzh ( W_ us_delay )
 {
 #ifdef mingw32_HOST_OS
     W_ ares;
@@ -1820,19 +1727,18 @@ stg_delayzh
 #endif
 
 #ifdef THREADED_RTS
-    foreign "C" barf("delay# on threaded RTS") never returns;
+    ccall barf("delay# on threaded RTS") never returns;
 #else
 
-    /* args: R1 (microsecond delay amount) */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;
 
 #ifdef mingw32_HOST_OS
 
     /* could probably allocate this on the heap instead */
-    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+    ("ptr" ares) = ccall stgMallocBytes(SIZEOF_StgAsyncIOResult,
 					    stg_delayzh_malloc_str);
-    (reqID) = foreign "C" addDelayRequest(R1);
+    (reqID) = ccall addDelayRequest(us_delay);
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
@@ -1844,12 +1750,12 @@ stg_delayzh
      */
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_async_void;
+    jump stg_block_async_void();
 
 #else
 
 
-    (target) = foreign "C" getDelayTarget(R1) [R1];
+    (target) = ccall getDelayTarget(us_delay);
 
     StgTSO_block_info(CurrentTSO) = target;
 
@@ -1867,9 +1773,9 @@ while:
     if (prev == NULL) {
 	W_[sleeping_queue] = CurrentTSO;
     } else {
-        foreign "C" setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO) [];
+        ccall setTSOLink(MyCapability() "ptr", prev "ptr", CurrentTSO);
     }
-    jump stg_block_noregs;
+    jump stg_block_noregs();
 #endif
 #endif /* !THREADED_RTS */
 }
@@ -1877,86 +1783,80 @@ while:
 
 #ifdef mingw32_HOST_OS
 STRING(stg_asyncReadzh_malloc_str, "stg_asyncReadzh")
-stg_asyncReadzh
+stg_asyncReadzh ( W_ fd, W_ is_sock, W_ len, W_ buf )
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncRead# on threaded RTS") never returns;
+    ccall barf("asyncRead# on threaded RTS") never returns;
 #else
 
-    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
 
     /* could probably allocate this on the heap instead */
-    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-					    stg_asyncReadzh_malloc_str)
-			[R1,R2,R3,R4];
-    (reqID) = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
+    ("ptr" ares) = ccall stgMallocBytes(SIZEOF_StgAsyncIOResult,
+                                        stg_asyncReadzh_malloc_str);
+    (reqID) = ccall addIORequest(fd, 0/*FALSE*/,is_sock,len,buf "ptr");
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
     StgTSO_block_info(CurrentTSO)  = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_async;
+    jump stg_block_async();
 #endif
 }
 
 STRING(stg_asyncWritezh_malloc_str, "stg_asyncWritezh")
-stg_asyncWritezh
+stg_asyncWritezh ( W_ fd, W_ is_sock, W_ len, W_ buf )
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncWrite# on threaded RTS") never returns;
+    ccall barf("asyncWrite# on threaded RTS") never returns;
 #else
 
-    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
 
-    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-					    stg_asyncWritezh_malloc_str)
-			[R1,R2,R3,R4];
-    (reqID) = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
+    ("ptr" ares) = ccall stgMallocBytes(SIZEOF_StgAsyncIOResult,
+                                        stg_asyncWritezh_malloc_str);
+    (reqID) = ccall addIORequest(fd, 1/*TRUE*/,is_sock,len,buf "ptr");
 
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
     StgTSO_block_info(CurrentTSO)  = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_async;
+    jump stg_block_async();
 #endif
 }
 
 STRING(stg_asyncDoProczh_malloc_str, "stg_asyncDoProczh")
-stg_asyncDoProczh
+stg_asyncDoProczh ( W_ proc, W_ param )
 {
     W_ ares;
     CInt reqID;
 
 #ifdef THREADED_RTS
-    foreign "C" barf("asyncDoProc# on threaded RTS") never returns;
+    ccall barf("asyncDoProc# on threaded RTS") never returns;
 #else
 
-    /* args: R1 = proc, R2 = param */
     ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
     StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
 
     /* could probably allocate this on the heap instead */
-    ("ptr" ares) = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
-					    stg_asyncDoProczh_malloc_str) 
-				[R1,R2];
-    (reqID) = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
+    ("ptr" ares) = ccall stgMallocBytes(SIZEOF_StgAsyncIOResult,
+                                        stg_asyncDoProczh_malloc_str);
+    (reqID) = ccall addDoProcRequest(proc "ptr",param "ptr");
     StgAsyncIOResult_reqID(ares)   = reqID;
     StgAsyncIOResult_len(ares)     = 0;
     StgAsyncIOResult_errCode(ares) = 0;
     StgTSO_block_info(CurrentTSO) = ares;
     APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
-    jump stg_block_async;
+    jump stg_block_async();
 #endif
 }
 #endif
@@ -2012,15 +1912,16 @@ stg_asyncDoProczh
  * only manifests occasionally (once very 10 runs or so).
  * -------------------------------------------------------------------------- */
 
-INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
+INFO_TABLE_RET(stg_noDuplicate, RET_SMALL, W_ info_ptr)
+    return (/* no return values */)
 {
-    Sp_adj(1);
-    jump stg_noDuplicatezh;
+    jump stg_noDuplicatezh();
 }
 
-stg_noDuplicatezh
+stg_noDuplicatezh /* no arg list: explicit stack layout */
 {
-    STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
+    STK_CHK(WDS(1), stg_noDuplicatezh);
+
     // leave noDuplicate frame in case the current
     // computation is suspended and restarted (see above).
     Sp_adj(-1);
@@ -2028,10 +1929,10 @@ stg_noDuplicatezh
 
     SAVE_THREAD_STATE();
     ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
-    foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
+    ccall threadPaused (MyCapability() "ptr", CurrentTSO "ptr");
     
     if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
-        jump stg_threadFinished;
+        jump stg_threadFinished [];
     } else {
         LOAD_THREAD_STATE();
         ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
@@ -2039,7 +1940,7 @@ stg_noDuplicatezh
         if (Sp(0) == stg_noDuplicate_info) {
             Sp_adj(1);
         }
-        jump %ENTRY_CODE(Sp(0));
+        jump %ENTRY_CODE(Sp(0)) [];
     }
 }
 
@@ -2047,75 +1948,62 @@ stg_noDuplicatezh
    Misc. primitives
    -------------------------------------------------------------------------- */
 
-stg_getApStackValzh
+stg_getApStackValzh ( P_ ap_stack, W_ offset )
 {
-   W_ ap_stack, offset, val, ok;
-
-   /* args: R1 = AP_STACK, R2 = offset */
-   ap_stack = R1;
-   offset   = R2;
-
    if (%INFO_PTR(ap_stack) == stg_AP_STACK_info) {
-        ok = 1;
-        val = StgAP_STACK_payload(ap_stack,offset); 
+       return (1,StgAP_STACK_payload(ap_stack,offset));
    } else {
-        ok = 0;
-        val = R1;
+       return (0,ap_stack);
    }
-   RET_NP(ok,val);
 }
 
 // Write the cost center stack of the first argument on stderr; return
 // the second.  Possibly only makes sense for already evaluated
 // things?
-stg_traceCcszh
+stg_traceCcszh ( P_ obj, P_ ret )
 {
     W_ ccs;
 
 #ifdef PROFILING
-    ccs = StgHeader_ccs(UNTAG(R1));
-    foreign "C" fprintCCS_stderr(ccs "ptr") [R2];
+    ccs = StgHeader_ccs(UNTAG(obj));
+    ccall fprintCCS_stderr(ccs "ptr");
 #endif
 
-    R1 = R2;
-    ENTER();
+    jump stg_ap_0_fast(ret);
 }
 
-stg_getSparkzh
+stg_getSparkzh ()
 {
    W_ spark;
 
 #ifndef THREADED_RTS
-   RET_NP(0,ghczmprim_GHCziTypes_False_closure);
+   return (0,ghczmprim_GHCziTypes_False_closure);
 #else
-   (spark) = foreign "C" findSpark(MyCapability());
+   (spark) = ccall findSpark(MyCapability());
    if (spark != 0) {
-      RET_NP(1,spark);
+      return (1,spark);
    } else {
-      RET_NP(0,ghczmprim_GHCziTypes_False_closure);
+      return (0,ghczmprim_GHCziTypes_False_closure);
    }
 #endif
 }
 
-stg_numSparkszh
+stg_numSparkszh ()
 {
   W_ n;
 #ifdef THREADED_RTS
-  (n) = foreign "C" dequeElements(Capability_sparks(MyCapability()));
+  (n) = ccall dequeElements(Capability_sparks(MyCapability()));
 #else
   n = 0;
 #endif
-  RET_N(n);
+  return (n);
 }
 
-stg_traceEventzh
+stg_traceEventzh ( W_ msg )
 {
-   W_ msg;
-   msg = R1;
-
 #if defined(TRACING) || defined(DEBUG)
 
-   foreign "C" traceUserMsg(MyCapability() "ptr", msg "ptr") [];
+   ccall traceUserMsg(MyCapability() "ptr", msg "ptr");
 
 #elif defined(DTRACE)
 
@@ -2125,7 +2013,7 @@ stg_traceEventzh
    // RtsProbes.h, but that header file includes unistd.h, which doesn't
    // work in Cmm
 #if !defined(solaris2_TARGET_OS)
-   (enabled) = foreign "C" __dtrace_isenabled$HaskellEvent$user__msg$v1() [];
+   (enabled) = ccall __dtrace_isenabled$HaskellEvent$user__msg$v1();
 #else
    // Solaris' DTrace can't handle the
    //     __dtrace_isenabled$HaskellEvent$user__msg$v1
@@ -2139,9 +2027,10 @@ stg_traceEventzh
    enabled = 1;
 #endif
    if (enabled != 0) {
-     foreign "C" dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr") [];
+     ccall dtraceUserMsgWrapper(MyCapability() "ptr", msg "ptr");
    }
 
 #endif
-   jump %ENTRY_CODE(Sp(0));
+   return ();
 }
+
diff --git a/rts/Printer.c b/rts/Printer.c
index fb00401f59..4f9f83db52 100644
--- a/rts/Printer.c
+++ b/rts/Printer.c
@@ -251,7 +251,6 @@ printClosure( StgClosure *obj )
     case RET_BCO:
     case RET_SMALL:
     case RET_BIG:
-    case RET_DYN:
     case RET_FUN:
     */
 
@@ -478,38 +477,7 @@ printStackChunk( StgPtr sp, StgPtr spBottom )
             printObj((StgClosure*)sp);
 	    continue;
 
-	case RET_DYN:
-	{ 
-	    StgRetDyn* r;
-	    StgPtr p;
-	    StgWord dyn;
-	    nat size;
-
-	    r = (StgRetDyn *)sp;
-	    dyn = r->liveness;
-	    debugBelch("RET_DYN (%p)\n", r);
-
-	    p = (P_)(r->payload);
-	    printSmallBitmap(spBottom, sp,
-			     RET_DYN_LIVENESS(r->liveness), 
-			     RET_DYN_BITMAP_SIZE);
-	    p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
-
-	    for (size = RET_DYN_NONPTRS(dyn); size > 0; size--) {
-		debugBelch("   stk[%ld] (%p) = ", (long)(spBottom-p), p);
-		debugBelch("Word# %ld\n", (long)*p);
-		p++;
-	    }
-	
-	    for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
-		debugBelch("   stk[%ld] (%p) = ", (long)(spBottom-p), p);
-		printPtr(p);
-		p++;
-	    }
-	    continue;
-	}
-
-	case RET_SMALL:
+        case RET_SMALL:
 	    debugBelch("RET_SMALL (%p)\n", info);
 	    bitmap = info->layout.bitmap;
 	    printSmallBitmap(spBottom, sp+1, 
@@ -1112,7 +1080,6 @@ char *closure_type_names[] = {
  [RET_BCO]               = "RET_BCO",
  [RET_SMALL]             = "RET_SMALL",
  [RET_BIG]               = "RET_BIG",
- [RET_DYN]               = "RET_DYN",
  [RET_FUN]               = "RET_FUN",
  [UPDATE_FRAME]          = "UPDATE_FRAME",
  [CATCH_FRAME]           = "CATCH_FRAME",
diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c
index 47d88068bf..f5669cb8ec 100644
--- a/rts/RaiseAsync.c
+++ b/rts/RaiseAsync.c
@@ -959,7 +959,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 // transactions, but I don't fully understand the
                 // interaction with STM invariants.
                 stack->sp[1] = (W_)&stg_NO_TREC_closure;
-                stack->sp[0] = (W_)&stg_gc_unpt_r1_info;
+                stack->sp[0] = (W_)&stg_ret_p_info;
                 tso->what_next = ThreadRunGHC;
                 goto done;
 	    }
diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c
index c07dff76e4..5f9164b77b 100644
--- a/rts/RetainerProfile.c
+++ b/rts/RetainerProfile.c
@@ -605,7 +605,6 @@ push( StgClosure *c, retainer c_child_r, StgClosure **first_child )
     case CATCH_FRAME:
     case UNDERFLOW_FRAME:
     case STOP_FRAME:
-    case RET_DYN:
     case RET_BCO:
     case RET_SMALL:
     case RET_BIG:
@@ -931,8 +930,7 @@ pop( StgClosure **c, StgClosure **cp, retainer *r )
         case IND_STATIC:
 	case CONSTR_NOCAF_STATIC:
 	    // stack objects
-	case RET_DYN:
-	case UPDATE_FRAME:
+        case UPDATE_FRAME:
 	case CATCH_FRAME:
         case UNDERFLOW_FRAME:
         case STOP_FRAME:
@@ -1087,7 +1085,6 @@ isRetainer( StgClosure *c )
     case CATCH_FRAME:
     case UNDERFLOW_FRAME:
     case STOP_FRAME:
-    case RET_DYN:
     case RET_BCO:
     case RET_SMALL:
     case RET_BIG:
@@ -1349,29 +1346,7 @@ retainStack( StgClosure *c, retainer c_child_r,
 	    // and don't forget to follow the SRT 
 	    goto follow_srt;
 
-	    // Dynamic bitmap: the mask is stored on the stack 
-	case RET_DYN: {
-	    StgWord dyn;
-	    dyn = ((StgRetDyn *)p)->liveness;
-
-	    // traverse the bitmap first
-	    bitmap = RET_DYN_LIVENESS(dyn);
-	    p      = (P_)&((StgRetDyn *)p)->payload[0];
-	    size   = RET_DYN_BITMAP_SIZE;
-	    p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
-	    
-	    // skip over the non-ptr words
-	    p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
-	    
-	    // follow the ptr words
-	    for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
-		retainClosure((StgClosure *)*p, c, c_child_r);
-		p++;
-	    }
-	    continue;
-	}
-
-	case RET_FUN: {
+        case RET_FUN: {
 	    StgRetFun *ret_fun = (StgRetFun *)p;
 	    StgFunInfoTable *fun_info;
 	    
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index b99126187a..2985982d64 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -22,30 +22,36 @@ import LeaveCriticalSection;
    Stack underflow
    ------------------------------------------------------------------------- */
 
-INFO_TABLE_RET (stg_stack_underflow_frame, UNDERFLOW_FRAME, P_ unused)
+INFO_TABLE_RET (stg_stack_underflow_frame, UNDERFLOW_FRAME,
+                W_ info_ptr, P_ unused)
+    /* no args => explicit stack */
 {
     W_ new_tso;
     W_ ret_off;
 
+    SAVE_STGREGS
+
     SAVE_THREAD_STATE();
     ("ptr" ret_off) = foreign "C" threadStackUnderflow(MyCapability(),
                                                        CurrentTSO);
     LOAD_THREAD_STATE();
 
-    jump %ENTRY_CODE(Sp(ret_off));
+    RESTORE_STGREGS
+
+    jump %ENTRY_CODE(Sp(ret_off)) [*]; // NB. all registers live!
 }
 
 /* ----------------------------------------------------------------------------
    Restore a saved cost centre
    ------------------------------------------------------------------------- */
 
-INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ cccs)
+INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ info_ptr, W_ cccs)
 {
 #if defined(PROFILING)
     CCCS = Sp(1);
 #endif
     Sp_adj(2);
-    jump %ENTRY_CODE(Sp(0));
+    jump %ENTRY_CODE(Sp(0)) [*]; // NB. all registers live!
 }
 
 /* ----------------------------------------------------------------------------
@@ -53,10 +59,9 @@ INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ cccs)
    ------------------------------------------------------------------------- */
 
 /* 9 bits of return code for constructors created by the interpreter. */
-stg_interp_constr_entry
+stg_interp_constr_entry (P_ ret)
 {
-    /* R1 points at the constructor */
-    jump %ENTRY_CODE(Sp(0));
+    return (ret);
 }
 
 /* Some info tables to be used when compiled code returns a value to
@@ -94,76 +99,83 @@ stg_interp_constr_entry
 */
 
 INFO_TABLE_RET( stg_ctoi_R1p, RET_BCO)
+    /* explicit stack */
 {
     Sp_adj(-2);
     Sp(1) = R1;
     Sp(0) = stg_enter_info;
-    jump stg_yield_to_interpreter;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is a pointer, but unlifted, in R1 ...
  */
 INFO_TABLE_RET( stg_ctoi_R1unpt, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-2);
     Sp(1) = R1;
-    Sp(0) = stg_gc_unpt_r1_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_p_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is a non-pointer in R1 ...
  */
 INFO_TABLE_RET( stg_ctoi_R1n, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-2);
     Sp(1) = R1;
-    Sp(0) = stg_gc_unbx_r1_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_p_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is in F1
  */
 INFO_TABLE_RET( stg_ctoi_F1, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-2);
     F_[Sp + WDS(1)] = F1;
-    Sp(0) = stg_gc_f1_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_f_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is in D1
  */
 INFO_TABLE_RET( stg_ctoi_D1, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-1) - SIZEOF_DOUBLE;
     D_[Sp + WDS(1)] = D1;
-    Sp(0) = stg_gc_d1_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_d_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is in L1
  */
 INFO_TABLE_RET( stg_ctoi_L1, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-1) - 8;
     L_[Sp + WDS(1)] = L1;
-    Sp(0) = stg_gc_l1_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_l_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
  * When the returned value is a void
  */
 INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
+    /* explicit stack */
 {
     Sp_adj(-1);
-    Sp(0) = stg_gc_void_info;
-    jump stg_yield_to_interpreter;
+    Sp(0) = stg_ret_v_info;
+    jump stg_yield_to_interpreter [];
 }
 
 /*
@@ -172,9 +184,10 @@ INFO_TABLE_RET( stg_ctoi_V, RET_BCO )
  * stack.
  */
 INFO_TABLE_RET( stg_apply_interp, RET_BCO )
+    /* explicit stack */
 {
     /* Just in case we end up in here... (we shouldn't) */
-    jump stg_yield_to_interpreter;
+    jump stg_yield_to_interpreter [];
 }
 
 /* ----------------------------------------------------------------------------
@@ -182,12 +195,13 @@ INFO_TABLE_RET( stg_apply_interp, RET_BCO )
    ------------------------------------------------------------------------- */
 
 INFO_TABLE_FUN( stg_BCO, 4, 0, BCO, "BCO", "BCO", ARG_BCO )
+    /* explicit stack */
 {
   /* entering a BCO means "apply it", same as a function */
   Sp_adj(-2);
   Sp(1) = R1;
   Sp(0) = stg_apply_interp_info;
-  jump stg_yield_to_interpreter;
+  jump stg_yield_to_interpreter [];
 }
 
 /* ----------------------------------------------------------------------------
@@ -201,30 +215,48 @@ INFO_TABLE_FUN( stg_BCO, 4, 0, BCO, "BCO", "BCO", ARG_BCO )
    ------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
+#if 0
+/*
+  This version in high-level cmm generates slightly less good code
+  than the low-level version below it. (ToDo)
+*/
+    (P_ node)
+{
+    TICK_ENT_DYN_IND(); /* tick */
+    node = UNTAG(StgInd_indirectee(node));
+    TICK_ENT_VIA_NODE();
+    jump %GET_ENTRY(node) (node);
+}
+#else
+    /* explicit stack */
 {
     TICK_ENT_DYN_IND(); /* tick */
     R1 = UNTAG(StgInd_indirectee(R1));
     TICK_ENT_VIA_NODE();
-    jump %GET_ENTRY(R1);
+    jump %GET_ENTRY(R1) [R1];
 }
+#endif
 
 INFO_TABLE(stg_IND_direct,1,0,IND,"IND","IND")
+    (P_ node)
 {
     TICK_ENT_DYN_IND(); /* tick */
-    R1 = StgInd_indirectee(R1);
+    node = StgInd_indirectee(node);
     TICK_ENT_VIA_NODE();
-    jump %ENTRY_CODE(Sp(0));
+    jump %ENTRY_CODE(Sp(0)) (node);
 }
 
 INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
+    /* explicit stack */
 {
     TICK_ENT_STATIC_IND(); /* tick */
     R1 = UNTAG(StgInd_indirectee(R1));
     TICK_ENT_VIA_NODE();
-    jump %GET_ENTRY(R1);
+    jump %GET_ENTRY(R1) [R1];
 }
 
 INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM")
+    /* explicit stack */
 {
     /* Don't add INDs to granularity cost */
 
@@ -259,7 +291,7 @@ INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM")
     TICK_ENT_VIA_NODE();
 #endif
 
-    jump %GET_ENTRY(R1);
+    jump %GET_ENTRY(R1) [R1];
 }
 
 /* ----------------------------------------------------------------------------
@@ -272,16 +304,17 @@ INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM")
    ------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
+    (P_ node)
 {
-    W_ r, p, info, bq, msg, owner, bd;
+    W_ r, info, owner, bd;
+    P_ p, bq, msg;
 
     TICK_ENT_DYN_IND(); /* tick */
 
 retry:
-    p = StgInd_indirectee(R1);
+    p = StgInd_indirectee(node);
     if (GETTAG(p) != 0) {
-        R1 = p;
-        jump %ENTRY_CODE(Sp(0));
+        return (p);
     }
 
     info = StgHeader_info(p);
@@ -296,33 +329,33 @@ retry:
         info == stg_BLOCKING_QUEUE_CLEAN_info ||
         info == stg_BLOCKING_QUEUE_DIRTY_info)
     {
-        ("ptr" msg) = foreign "C" allocate(MyCapability() "ptr",
-                                           BYTES_TO_WDS(SIZEOF_MessageBlackHole)) [R1];
+        ("ptr" msg) = ccall allocate(MyCapability() "ptr",
+                                     BYTES_TO_WDS(SIZEOF_MessageBlackHole));
 
         SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM);
         MessageBlackHole_tso(msg) = CurrentTSO;
-        MessageBlackHole_bh(msg) = R1;
+        MessageBlackHole_bh(msg) = node;
 
-        (r) = foreign "C" messageBlackHole(MyCapability() "ptr", msg "ptr") [R1];
+        (r) = ccall messageBlackHole(MyCapability() "ptr", msg "ptr");
 
         if (r == 0) {
             goto retry;
         } else {
             StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
             StgTSO_block_info(CurrentTSO) = msg;
-            jump stg_block_blackhole;
+            jump stg_block_blackhole(node);
         }
     }
     else
     {
-        R1 = p;
-        ENTER();
+        ENTER(p);
     }
 }
 
 INFO_TABLE(__stg_EAGER_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
+    (P_ node)
 {
-    jump ENTRY_LBL(stg_BLACKHOLE);
+    jump ENTRY_LBL(stg_BLACKHOLE) (node);
 }
 
 // CAF_BLACKHOLE is allocated when entering a CAF.  The reason it is
@@ -332,8 +365,9 @@ INFO_TABLE(__stg_EAGER_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
 // evaluation by another thread (a BLACKHOLE).  See threadPaused().
 //
 INFO_TABLE(stg_CAF_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
+    (P_ node)
 {
-    jump ENTRY_LBL(stg_BLACKHOLE);
+    jump ENTRY_LBL(stg_BLACKHOLE) (node);
 }
 
 INFO_TABLE(stg_BLOCKING_QUEUE_CLEAN,4,0,BLOCKING_QUEUE,"BLOCKING_QUEUE","BLOCKING_QUEUE")
@@ -349,6 +383,7 @@ INFO_TABLE(stg_BLOCKING_QUEUE_DIRTY,4,0,BLOCKING_QUEUE,"BLOCKING_QUEUE","BLOCKIN
    ------------------------------------------------------------------------- */
 
 INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
+    (P_ node)
 {
 #if defined(THREADED_RTS)
     W_ info, i;
@@ -356,18 +391,18 @@ INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
     i = 0;
 loop:
     // spin until the WHITEHOLE is updated
-    info = StgHeader_info(R1);
+    info = StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
         i = i + 1;
         if (i == SPIN_COUNT) {
             i = 0;
-            foreign "C" yieldThread() [R1];
+            ccall yieldThread();
         }
         goto loop;
     }
-    jump %ENTRY_CODE(info);
+    jump %ENTRY_CODE(info) (node);
 #else
-    foreign "C" barf("WHITEHOLE object entered!") never returns;
+    ccall barf("WHITEHOLE object entered!") never returns;
 #endif
 }
 
@@ -556,8 +591,9 @@ INFO_TABLE(stg_MUT_VAR_DIRTY, 1, 0, MUT_VAR_DIRTY, "MUT_VAR_DIRTY", "MUT_VAR_DIR
  * ------------------------------------------------------------------------- */
 
 INFO_TABLE( stg_dummy_ret, 0, 0, CONSTR_NOCAF_STATIC, "DUMMY_RET", "DUMMY_RET")
+    ()
 {
-  jump %ENTRY_CODE(Sp(0));
+    return ();
 }
 CLOSURE(stg_dummy_ret_closure,stg_dummy_ret);
 
diff --git a/rts/StgStartup.cmm b/rts/StgStartup.cmm
index 4aace82deb..6793913464 100644
--- a/rts/StgStartup.cmm
+++ b/rts/StgStartup.cmm
@@ -35,11 +35,9 @@
    -------------------------------------------------------------------------- */
 
 INFO_TABLE_RET(stg_stop_thread, STOP_FRAME,
-#if defined(PROFILING)
-  W_ unused,
-  W_ unused
-#endif
-)
+               W_ info_ptr,
+               PROF_HDR_FIELDS(W_))
+/* no return list: explicit stack layout */
 {
     /* 
        The final exit.
@@ -75,7 +73,7 @@ INFO_TABLE_RET(stg_stop_thread, STOP_FRAME,
     StgRegTable_rRet(BaseReg) = ThreadFinished;
     R1 = BaseReg;
 
-    jump StgReturn;
+    jump StgReturn [R1];
 }
 
 /* -----------------------------------------------------------------------------
@@ -87,46 +85,57 @@ INFO_TABLE_RET(stg_stop_thread, STOP_FRAME,
    the thread's state away nicely.
    -------------------------------------------------------------------------- */
 
-stg_returnToStackTop
+stg_returnToStackTop /* no args: explicit stack layout */
 {
   LOAD_THREAD_STATE();
   CHECK_SENSIBLE_REGS();
-  jump %ENTRY_CODE(Sp(0));
+  jump %ENTRY_CODE(Sp(0)) [];
 }
 
-stg_returnToSched
+stg_returnToSched /* no args: explicit stack layout */
 {
+  W_ r1;
+  r1 = R1; // foreign calls may clobber R1
   SAVE_THREAD_STATE();
   foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
-  jump StgReturn;
+  R1 = r1;
+  jump StgReturn [R1];
 }
 
 // A variant of stg_returnToSched that doesn't call threadPaused() on the
 // current thread.  This is used for switching from compiled execution to the
 // interpreter, where calling threadPaused() on every switch would be too
 // expensive.
-stg_returnToSchedNotPaused
+stg_returnToSchedNotPaused /* no args: explicit stack layout */
 {
   SAVE_THREAD_STATE();
-  jump StgReturn;
+  jump StgReturn [R1];
 }
 
 // A variant of stg_returnToSched, but instead of returning directly to the
 // scheduler, we jump to the code fragment pointed to by R2.  This lets us
 // perform some final actions after making the thread safe, such as unlocking
 // the MVar on which we are about to block in SMP mode.
-stg_returnToSchedButFirst
+stg_returnToSchedButFirst /* no args: explicit stack layout */
 {
+  W_ r1, r2, r3;
+  r1 = R1;
+  r2 = R2;
+  r3 = R3;
   SAVE_THREAD_STATE();
+  // foreign calls may clobber R1/R2/.., so we save them above
   foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
-  jump R2;
+  R1 = r1;
+  R2 = r2;
+  R3 = r3;
+  jump R2 [R1,R3];
 }
 
-stg_threadFinished
+stg_threadFinished /* no args: explicit stack layout */
 {
   StgRegTable_rRet(BaseReg) = ThreadFinished;
   R1 = BaseReg;
-  jump StgReturn;
+  jump StgReturn [R1];
 }  
 
 /* -----------------------------------------------------------------------------
@@ -143,31 +152,30 @@ stg_threadFinished
 
     ------------------------------------------------------------------------- */
 
-INFO_TABLE_RET(stg_forceIO, RET_SMALL)
-
+INFO_TABLE_RET(stg_forceIO, RET_SMALL, P_ info_ptr)
+    return (P_ ret)
 {
-  Sp_adj(1);
-  ENTER();
+    ENTER(ret);
 }
 
 /* -----------------------------------------------------------------------------
    Special STG entry points for module registration.
    -------------------------------------------------------------------------- */
 
-stg_init_finish
+stg_init_finish /* no args: explicit stack layout */
 {
-  jump StgReturn;
+  jump StgReturn [];
 }
 
 /* On entry to stg_init:
  *    init_stack[0] = &stg_init_ret;
  *    init_stack[1] = __stginit_Something;
  */
-stg_init
+stg_init /* no args: explicit stack layout */
 {
   W_ next;
   Sp = W_[BaseReg + OFFSET_StgRegTable_rSp];
   next = W_[Sp];
   Sp_adj(1);
-  jump next;
+  jump next [];
 }
diff --git a/rts/StgStdThunks.cmm b/rts/StgStdThunks.cmm
index 171ab52b96..0b69a9a279 100644
--- a/rts/StgStdThunks.cmm
+++ b/rts/StgStdThunks.cmm
@@ -11,6 +11,7 @@
  * ---------------------------------------------------------------------------*/
 
 #include "Cmm.h"
+#include "Updates.h"
 
 /* -----------------------------------------------------------------------------
    The code for a thunk that simply extracts a field from a
@@ -26,17 +27,15 @@
    matching.
    -------------------------------------------------------------------------- */
 
-#define WITHUPD_FRAME_SIZE  (SIZEOF_StgUpdateFrame + SIZEOF_StgHeader)
-#define NOUPD_FRAME_SIZE    (SIZEOF_StgHeader)
-
 #ifdef PROFILING
-#define SAVE_CCCS(fs)   StgHeader_ccs(Sp-fs) = CCCS
-#define GET_SAVED_CCCS  CCCS = StgHeader_ccs(Sp)
-#define RET_PARAMS      W_ unused1, W_ unused2
+#define RET_FIELDS(w_,info_ptr,ccs)             \
+  w_ info_ptr,                                  \
+  w_ ccs
+#define GET_SAVED_CCCS  CCCS = ccs
 #else
-#define SAVE_CCCS(fs)   /* empty */
+#define RET_FIELDS(w_,info_ptr,ccs)             \
+  w_ info_ptr
 #define GET_SAVED_CCCS  /* empty */
-#define RET_PARAMS
 #endif
 
 /*
@@ -56,42 +55,34 @@
 // When profiling, we cannot shortcut by checking the tag,
 // because LDV profiling relies on entering closures to mark them as
 // "used".
-#define SEL_ENTER(offset)                       \
-      R1 = UNTAG(R1);                           \
-      jump %GET_ENTRY(R1);
+#define NEED_EVAL(__x__) 1
 #else
-#define SEL_ENTER(offset)                               \
-      if (GETTAG(R1) != 0) {                            \
-          jump RET_LBL(stg_sel_ret_##offset##_upd);     \
-      }                                                 \
-      jump %GET_ENTRY(R1);
+#define NEED_EVAL(__x__) GETTAG(__x__) == 0
 #endif
 
 #define SELECTOR_CODE_UPD(offset)                                       \
-  INFO_TABLE_RET(stg_sel_ret_##offset##_upd, RET_SMALL, RET_PARAMS)	\
-  {									\
-      R1 = StgClosure_payload(UNTAG(R1),offset);			\
-      GET_SAVED_CCCS;							\
-      Sp = Sp + SIZEOF_StgHeader;					\
-      ENTER();								\
-  }									\
-									\
   INFO_TABLE_SELECTOR(stg_sel_##offset##_upd, offset, THUNK_SELECTOR, "stg_sel_upd", "stg_sel_upd") \
-  {									\
-      TICK_ENT_DYN_THK();						\
-      STK_CHK_NP(WITHUPD_FRAME_SIZE);					\
-      UPD_BH_UPDATABLE();						\
-      LDV_ENTER(R1);							\
-      PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);			\
-      ENTER_CCS_THUNK(R1);						\
-      SAVE_CCCS(WITHUPD_FRAME_SIZE);					\
-      W_[Sp-WITHUPD_FRAME_SIZE] = stg_sel_ret_##offset##_upd_info;	\
-      Sp = Sp - WITHUPD_FRAME_SIZE;					\
-      R1 = StgThunk_payload(R1,0);				        \
-      SEL_ENTER(offset);                                                \
+      (P_ node)                                                         \
+  {                                                                     \
+      P_ selectee, field;                                               \
+      TICK_ENT_DYN_THK();                                               \
+      STK_CHK_NP(node);                                                 \
+      UPD_BH_UPDATABLE(node);                                           \
+      LDV_ENTER(node);                                                  \
+      ENTER_CCS_THUNK(node);                                            \
+      selectee = StgThunk_payload(node,0);                              \
+      push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info,node)) {           \
+        if (NEED_EVAL(selectee)) {                                      \
+          (P_ constr) = call %GET_ENTRY(selectee) (selectee);           \
+          selectee = constr;                                            \
+        }                                                               \
+        field = StgClosure_payload(UNTAG(selectee),offset);             \
+        jump stg_ap_0_fast(field);                                      \
+     }                                                                  \
   }
-  /* NOTE: no need to ENTER() here, we know the closure cannot evaluate to a function,
-     because we're going to do a field selection on the result. */
+  /* NOTE: no need to ENTER() here, we know the closure cannot
+     evaluate to a function, because we're going to do a field
+     selection on the result. */
 
 SELECTOR_CODE_UPD(0)
 SELECTOR_CODE_UPD(1)
@@ -110,33 +101,27 @@ SELECTOR_CODE_UPD(13)
 SELECTOR_CODE_UPD(14)
 SELECTOR_CODE_UPD(15)
 
-#define SELECTOR_CODE_NOUPD(offset) \
-  INFO_TABLE_RET(stg_sel_ret_##offset##_noupd, RET_SMALL, RET_PARAMS)	\
-  {									\
-      R1 = StgClosure_payload(UNTAG(R1),offset);			\
-      GET_SAVED_CCCS;							\
-      Sp = Sp + SIZEOF_StgHeader;					\
-      ENTER();								\
-  }									\
-									\
-  INFO_TABLE_SELECTOR(stg_sel_##offset##_noupd, offset, THUNK_SELECTOR, "stg_sel_noupd", "stg_sel_noupd")\
-  {									\
-      TICK_ENT_DYN_THK();						\
-      STK_CHK_NP(NOUPD_FRAME_SIZE);					\
-      UPD_BH_SINGLE_ENTRY();						\
-      LDV_ENTER(R1);							\
-      TICK_UPDF_OMITTED();						\
-      ENTER_CCS_THUNK(R1);						\
-      SAVE_CCCS(NOUPD_FRAME_SIZE);					\
-      W_[Sp-NOUPD_FRAME_SIZE] = stg_sel_ret_##offset##_noupd_info;	\
-      Sp = Sp - NOUPD_FRAME_SIZE;					\
-      R1 = StgThunk_payload(R1,0);					\
-      if (GETTAG(R1) != 0) {                                            \
-          jump RET_LBL(stg_sel_ret_##offset##_noupd);		        \
+
+#define SELECTOR_CODE_NOUPD(offset)                                     \
+  INFO_TABLE_SELECTOR(stg_sel_##offset##_noupd, offset, THUNK_SELECTOR, "stg_sel_noupd", "stg_sel_noupd") \
+      (P_ node)                                                         \
+  {                                                                     \
+      P_ selectee, field;                                               \
+      TICK_ENT_DYN_THK();                                               \
+      STK_CHK_NP(node);                                                 \
+      UPD_BH_UPDATABLE(node);                                           \
+      LDV_ENTER(node);                                                  \
+      ENTER_CCS_THUNK(node);                                            \
+      selectee = StgThunk_payload(node,0);                              \
+      if (NEED_EVAL(selectee)) {                                        \
+          (P_ constr) = call %GET_ENTRY(selectee) (selectee);           \
+          selectee = constr;                                            \
       }                                                                 \
-      jump %GET_ENTRY(R1);						\
+      field = StgClosure_payload(UNTAG(selectee),offset);               \
+      jump stg_ap_0_fast(field);                                        \
   }
 
+
 SELECTOR_CODE_NOUPD(0)
 SELECTOR_CODE_NOUPD(1)
 SELECTOR_CODE_NOUPD(2)
@@ -173,131 +158,120 @@ SELECTOR_CODE_NOUPD(15)
  */
 
 INFO_TABLE(stg_ap_1_upd,1,0,THUNK_1_0,"stg_ap_1_upd_info","stg_ap_1_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(1));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame;
-  jump stg_ap_0_fast;
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_0_fast
+            (StgThunk_payload(node,0));
+    }
 }
 
 INFO_TABLE(stg_ap_2_upd,2,0,THUNK_2_0,"stg_ap_2_upd_info","stg_ap_2_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(2));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(1);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_p();
-  jump RET_LBL(stg_ap_p);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_p_fast
+            (StgThunk_payload(node,0),
+             StgThunk_payload(node,1));
+    }
 }
 
 INFO_TABLE(stg_ap_3_upd,3,0,THUNK,"stg_ap_3_upd_info","stg_ap_3_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(3));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,2);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(2);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_pp();
-  jump RET_LBL(stg_ap_pp);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_pp_fast
+            (StgThunk_payload(node,0),
+             StgThunk_payload(node,1),
+             StgThunk_payload(node,2));
+    }
 }
 
 INFO_TABLE(stg_ap_4_upd,4,0,THUNK,"stg_ap_4_upd_info","stg_ap_4_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(4));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,3);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,2);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(3);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_ppp();
-  jump RET_LBL(stg_ap_ppp);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_ppp_fast
+            (StgThunk_payload(node,0),
+             StgThunk_payload(node,1),
+             StgThunk_payload(node,2),
+             StgThunk_payload(node,3));
+    }
 }
 
 INFO_TABLE(stg_ap_5_upd,5,0,THUNK,"stg_ap_5_upd_info","stg_ap_5_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(5));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,4);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,3);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,2);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(4);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_pppp();
-  jump RET_LBL(stg_ap_pppp);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_pppp_fast
+            (StgThunk_payload(node,0),
+             StgThunk_payload(node,1),
+             StgThunk_payload(node,2),
+             StgThunk_payload(node,3),
+             StgThunk_payload(node,4));
+    }
 }
 
 INFO_TABLE(stg_ap_6_upd,6,0,THUNK,"stg_ap_6_upd_info","stg_ap_6_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(6));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,5);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,4);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,3);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,2);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(5);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_ppppp();
-  jump RET_LBL(stg_ap_ppppp);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+        jump stg_ap_ppppp_fast
+            (StgThunk_payload(node,0),
+             StgThunk_payload(node,1),
+             StgThunk_payload(node,2),
+             StgThunk_payload(node,3),
+             StgThunk_payload(node,4),
+             StgThunk_payload(node,5));
+    }
 }
 
 INFO_TABLE(stg_ap_7_upd,7,0,THUNK,"stg_ap_7_upd_info","stg_ap_7_upd_info")
+    (P_ node)
 {
-  TICK_ENT_DYN_THK();
-  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(7));
-  UPD_BH_UPDATABLE();
-  LDV_ENTER(R1);
-  ENTER_CCS_THUNK(R1);
-  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,6);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,5);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,4);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,3);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,2);
-  W_[Sp-SIZEOF_StgUpdateFrame-WDS(6)] = StgThunk_payload(R1,1);
-  R1 = StgThunk_payload(R1,0);
-  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(6);
-  Sp_adj(-1); // for stg_ap_*_ret
-  TICK_UNKNOWN_CALL();
-  TICK_SLOW_CALL_pppppp();
-  jump RET_LBL(stg_ap_pppppp);
+    TICK_ENT_DYN_THK();
+    STK_CHK_NP(node);
+    UPD_BH_UPDATABLE(node);
+    LDV_ENTER(node);
+    ENTER_CCS_THUNK(node);
+    push (UPDATE_FRAME_FIELDS(,,stg_upd_frame_info, node)) {
+      jump stg_ap_pppppp_fast
+          (StgThunk_payload(node,0),
+           StgThunk_payload(node,1),
+           StgThunk_payload(node,2),
+           StgThunk_payload(node,3),
+           StgThunk_payload(node,4),
+           StgThunk_payload(node,5),
+           StgThunk_payload(node,6));
+    }
 }
diff --git a/rts/Updates.cmm b/rts/Updates.cmm
index 44fbc0e194..2bc21ec332 100644
--- a/rts/Updates.cmm
+++ b/rts/Updates.cmm
@@ -16,85 +16,72 @@
 
 #include "Updates.h"
 
-#if defined(PROFILING)
-#define UPD_FRAME_PARAMS W_ unused1, W_ unused2, P_ unused3
-#else
-#define UPD_FRAME_PARAMS P_ unused1
-#endif
-
-/* The update fragment has been tuned so as to generate good
-   code with gcc, which accounts for some of the strangeness in the
-   way it is written.  
-
-   In particular, the JMP_(ret) bit is passed down and pinned on the
-   end of each branch (there end up being two major branches in the
-   code), since we don't mind duplicating this jump.
-*/
-
-/* on entry to the update code
-   (1) R1 points to the closure being returned
-   (2) Sp points to the update frame
-*/
-
-INFO_TABLE_RET( stg_upd_frame, UPDATE_FRAME, UPD_FRAME_PARAMS)
+/*
+ * The update code is PERFORMANCE CRITICAL, if you make any changes
+ * here make sure you eyeball the assembly and check that the fast
+ * path (update in generation 0) is optimal.
+ *
+ * The return(ret) bit is passed down and pinned on the end of each
+ * branch (there end up being two major branches in the code), since
+ * we don't mind duplicating this jump.
+ */
+INFO_TABLE_RET ( stg_upd_frame, UPDATE_FRAME,
+                 UPDATE_FRAME_FIELDS(W_,P_,info_ptr,updatee) )
+    return (P_ ret) /* the closure being returned */
 {
-    W_ updatee;
-    
-    updatee = StgUpdateFrame_updatee(Sp);
-
-    /* remove the update frame from the stack */
-    Sp = Sp + SIZEOF_StgUpdateFrame;
-    
     /* ToDo: it might be a PAP, so we should check... */
     TICK_UPD_CON_IN_NEW(sizeW_fromITBL(%GET_STD_INFO(updatee)));
-    
-    updateWithIndirection(updatee,
-                          R1,
-                          jump %ENTRY_CODE(Sp(0)) [R1]);
-}
 
+    updateWithIndirection(updatee, ret, return (ret));
+}
 
-INFO_TABLE_RET( stg_marked_upd_frame, UPDATE_FRAME, UPD_FRAME_PARAMS)
+/*
+ * An update frame where the updatee has been replaced by a BLACKHOLE
+ * closure by threadPaused.  We may have threads to wake up, and we
+ * also have to check whether the blackhole has been updated by
+ * another thread in the meantime.
+ */
+INFO_TABLE_RET ( stg_marked_upd_frame, UPDATE_FRAME,
+                 UPDATE_FRAME_FIELDS(W_,P_,info_ptr,updatee) )
+    return (P_ ret) /* the closure being returned */
 {
-    W_ updatee, v, i, tso, link;
+    W_ v, i, tso, link;
 
     // we know the closure is a BLACKHOLE
-    updatee = StgUpdateFrame_updatee(Sp);
     v = StgInd_indirectee(updatee);
 
-    // remove the update frame from the stack
-    Sp = Sp + SIZEOF_StgUpdateFrame;
-    
     if (GETTAG(v) != 0) {
         // updated by someone else: discard our value and use the
         // other one to increase sharing, but check the blocking
         // queues to see if any threads were waiting on this BLACKHOLE.
-        R1 = v;
-        foreign "C" checkBlockingQueues(MyCapability() "ptr",
-                                        CurrentTSO "ptr") [R1];
-        jump %ENTRY_CODE(Sp(0)) [R1];
+        ccall checkBlockingQueues(MyCapability() "ptr", CurrentTSO "ptr");
+        return (v);
     }
 
     // common case: it is still our BLACKHOLE
     if (v == CurrentTSO) {
-        updateWithIndirection(updatee,
-                              R1,
-                              jump %ENTRY_CODE(Sp(0)) [R1]);
+        updateWithIndirection(updatee, ret, return (ret));
     }
 
     // The other cases are all handled by the generic code
-    foreign "C" updateThunk (MyCapability() "ptr", CurrentTSO "ptr", 
-                             updatee "ptr", R1 "ptr") [R1];
+    ccall updateThunk (MyCapability() "ptr", CurrentTSO "ptr",
+                       updatee "ptr", ret "ptr");
 
-    jump %ENTRY_CODE(Sp(0)) [R1];
+    return (ret);
 }
 
-// Special update frame code for CAFs and eager-blackholed thunks: it
-// knows how to update blackholes, but is distinct from
-// stg_marked_upd_frame so that lazy blackholing won't treat it as the
-// high watermark.
-INFO_TABLE_RET (stg_bh_upd_frame, UPDATE_FRAME, UPD_FRAME_PARAMS)
+/*
+ * Special update frame code for CAFs and eager-blackholed thunks: it
+ * knows how to update blackholes, but is distinct from
+ * stg_marked_upd_frame so that lazy blackholing won't treat it as the
+ * high watermark.
+ */
+INFO_TABLE_RET ( stg_bh_upd_frame, UPDATE_FRAME,
+                 UPDATE_FRAME_FIELDS(W_,P_,info_ptr,updatee) )
+    return (P_ ret) /* the closure being returned */
 {
-    jump RET_LBL(stg_marked_upd_frame) [R1];
+    // This all compiles away to a single jump instruction (sigh)
+    jump RET_LBL(stg_marked_upd_frame)
+        ( UPDATE_FRAME_FIELDS(,,info_ptr,updatee) )
+        (ret);
 }
-
diff --git a/rts/Updates.h b/rts/Updates.h
index 954f02afe1..0205e6e763 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -24,29 +24,34 @@
  *  field. So, we call LDV_RECORD_CREATE().
  */
 
-/* We have two versions of this macro (sadly), one for use in C-- code,
+/*
+ * We have two versions of this macro (sadly), one for use in C-- code,
  * and the other for C.
  *
  * The and_then argument is a performance hack so that we can paste in
  * the continuation code directly.  It helps shave a couple of
  * instructions off the common case in the update code, which is
  * worthwhile (the update code is often part of the inner loop).
- * (except that gcc now appears to common up this code again and
- * invert the optimisation.  Grrrr --SDM).
  */
 #ifdef CMINUSMINUS
 
-#define updateWithIndirection(p1, p2, and_then)	\
+#define UPDATE_FRAME_FIELDS(w_,p_,info_ptr,updatee)     \
+                 w_ info_ptr,                           \
+                 PROF_HDR_FIELDS(w_)                    \
+                 p_ updatee
+
+
+#define updateWithIndirection(p1, p2, and_then) \
     W_ bd;							\
 								\
     OVERWRITING_CLOSURE(p1);                                    \
     StgInd_indirectee(p1) = p2;                                 \
-    prim %write_barrier() [];					\
+    prim %write_barrier();                                      \
     SET_INFO(p1, stg_BLACKHOLE_info);                           \
     LDV_RECORD_CREATE(p1);                                      \
     bd = Bdescr(p1);						\
     if (bdescr_gen_no(bd) != 0 :: bits16) {			\
-      recordMutableCap(p1, TO_W_(bdescr_gen_no(bd)), R1);  	\
+      recordMutableCap(p1, TO_W_(bdescr_gen_no(bd)));           \
       TICK_UPD_OLD_IND();					\
       and_then;							\
     } else {							\
diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c
index c97e168433..34111f9206 100644
--- a/rts/sm/Compact.c
+++ b/rts/sm/Compact.c
@@ -301,37 +301,7 @@ thread_stack(StgPtr p, StgPtr stack_end)
 	
 	switch (info->i.type) {
 	    
-	    // Dynamic bitmap: the mask is stored on the stack 
-	case RET_DYN:
-	{
-	    StgWord dyn;
-	    dyn = ((StgRetDyn *)p)->liveness;
-
-	    // traverse the bitmap first
-	    bitmap = RET_DYN_LIVENESS(dyn);
-	    p      = (P_)&((StgRetDyn *)p)->payload[0];
-	    size   = RET_DYN_BITMAP_SIZE;
-	    while (size > 0) {
-		if ((bitmap & 1) == 0) {
-		    thread((StgClosure **)p);
-		}
-		p++;
-		bitmap = bitmap >> 1;
-		size--;
-	    }
-	    
-	    // skip over the non-ptr words
-	    p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
-	    
-	    // follow the ptr words
-	    for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
-		thread((StgClosure **)p);
-		p++;
-	    }
-	    continue;
-	}
-	    
-	    // small bitmap (<= 32 entries, or 64 on a 64-bit machine) 
+            // small bitmap (<= 32 entries, or 64 on a 64-bit machine)
         case CATCH_RETRY_FRAME:
         case CATCH_STM_FRAME:
         case ATOMICALLY_FRAME:
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 8be393b4bc..0ac9e2623a 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -670,7 +670,6 @@ loop:
   case RET_BCO:
   case RET_SMALL:
   case RET_BIG:
-  case RET_DYN:
   case UPDATE_FRAME:
   case UNDERFLOW_FRAME:
   case STOP_FRAME:
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 5c7fb8aa76..6237662720 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -105,32 +105,6 @@ checkStackFrame( StgPtr c )
 
     /* All activation records have 'bitmap' style layout info. */
     switch (info->i.type) {
-    case RET_DYN: /* Dynamic bitmap: the mask is stored on the stack */
-    {
-	StgWord dyn;
-	StgPtr p;
-	StgRetDyn* r;
-	
-	r = (StgRetDyn *)c;
-	dyn = r->liveness;
-	
-	p = (P_)(r->payload);
-	checkSmallBitmap(p,RET_DYN_LIVENESS(r->liveness),RET_DYN_BITMAP_SIZE);
-	p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
-
-	// skip over the non-pointers
-	p += RET_DYN_NONPTRS(dyn);
-	
-	// follow the ptr words
-	for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
-	    checkClosureShallow((StgClosure *)*p);
-	    p++;
-	}
-	
-	return sizeofW(StgRetDyn) + RET_DYN_BITMAP_SIZE +
-	    RET_DYN_NONPTR_REGS_SIZE +
-	    RET_DYN_NONPTRS(dyn) + RET_DYN_PTRS(dyn);
-    }
 
     case UPDATE_FRAME:
       ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgUpdateFrame*)c)->updatee));
@@ -381,7 +355,6 @@ checkClosure( StgClosure* p )
     case RET_BCO:
     case RET_SMALL:
     case RET_BIG:
-    case RET_DYN:
     case UPDATE_FRAME:
     case UNDERFLOW_FRAME:
     case STOP_FRAME:
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index cbdf01b720..668b95da6b 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1685,32 +1685,6 @@ scavenge_stack(StgPtr p, StgPtr stack_end)
 	goto follow_srt;
     }
 
-      // Dynamic bitmap: the mask is stored on the stack, and
-      // there are a number of non-pointers followed by a number
-      // of pointers above the bitmapped area.  (see StgMacros.h,
-      // HEAP_CHK_GEN).
-    case RET_DYN:
-    {
-	StgWord dyn;
-	dyn = ((StgRetDyn *)p)->liveness;
-
-	// traverse the bitmap first
-	bitmap = RET_DYN_LIVENESS(dyn);
-	p      = (P_)&((StgRetDyn *)p)->payload[0];
-	size   = RET_DYN_BITMAP_SIZE;
-	p = scavenge_small_bitmap(p, size, bitmap);
-
-	// skip over the non-ptr words
-	p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
-	
-	// follow the ptr words
-	for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
-	    evacuate((StgClosure **)p);
-	    p++;
-	}
-	continue;
-    }
-
     case RET_FUN:
     {
 	StgRetFun *ret_fun = (StgRetFun *)p;
diff --git a/utils/genapply/GenApply.hs b/utils/genapply/GenApply.hs
index 18a2f85a12..e859184c59 100644
--- a/utils/genapply/GenApply.hs
+++ b/utils/genapply/GenApply.hs
@@ -232,7 +232,7 @@ genMkPAP regstatus macro jump ticker disamb
             if is_fun_case then mb_tag_node arity else empty,
             if overflow_regs
                 then text "jump_SAVE_CCCS" <> parens (text jump) <> semi
-                else text "jump " <> text jump <> semi
+                else text "jump " <> text jump <+> text "[*]" <> semi
             ]) $$
            text "}"
 
@@ -334,7 +334,7 @@ genMkPAP regstatus macro jump ticker disamb
                 then text "R2 = " <> fun_info_label <> semi
                 else empty,
             if is_fun_case then mb_tag_node n_args else empty,
-            text "jump " <> text jump <> semi
+            text "jump " <> text jump <+> text "[*]" <> semi
           ])
 
 -- The LARGER ARITY cases:
@@ -416,7 +416,7 @@ enterFastPathHelper tag regstatus no_load_regs args_in_regs args =
         reg_doc,
         text "  Sp_adj(" <> int sp' <> text ");",
         -- enter, but adjust offset with tag
-        text "  jump " <> text "%GET_ENTRY(R1-" <> int tag <> text ");",
+        text "  jump " <> text "%GET_ENTRY(R1-" <> int tag <> text ") [*];",
         text "}"
        ]
   -- I don't totally understand this code, I copied it from
@@ -478,7 +478,7 @@ genApply regstatus args =
    in
     vcat [
       text "INFO_TABLE_RET(" <> mkApplyName args <> text ", " <>
-        text "RET_SMALL, " <> (cat $ zipWith formalParam args [1..]) <>
+        text "RET_SMALL, W_ info_ptr, " <> (cat $ zipWith formalParam args [1..]) <>
         text ")\n{",
       nest 4 (vcat [
        text "W_ info;",
@@ -701,7 +701,7 @@ genApplyFast regstatus args =
           nest 4 (vcat [
              text "Sp_adj" <> parens (int (-sp_offset)) <> semi,
              saveRegOffs reg_locs,
-             text "jump" <+> fun_ret_label <> semi
+             text "jump" <+> fun_ret_label <+> text "[*]" <> semi
           ]),
           char '}'
         ]),
@@ -739,7 +739,7 @@ genStackApply regstatus args =
    (assign_regs, sp') = loadRegArgs regstatus 0 args
    body = vcat [assign_regs,
                 text "Sp_adj" <> parens (int sp') <> semi,
-                text "jump %GET_ENTRY(UNTAG(R1));"
+                text "jump %GET_ENTRY(UNTAG(R1)) [*];"
                 ]
 
 -- -----------------------------------------------------------------------------
@@ -766,7 +766,7 @@ genStackSave regstatus args =
                 text "Sp(2) = R1;",
                 text "Sp(1) =" <+> int stk_args <> semi,
                 text "Sp(0) = stg_gc_fun_info;",
-                text "jump stg_gc_noregs;"
+                text "jump stg_gc_noregs [];"
                 ]
 
    std_frame_size = 3 -- the std bits of the frame. See StgRetFun in Closures.h,