diff options
author | Sylvain Henry <sylvain@haskus.fr> | 2019-08-13 17:26:32 +0200 |
---|---|---|
committer | Sylvain Henry <sylvain@haskus.fr> | 2019-09-10 00:04:50 +0200 |
commit | 447864a94a1679b5b079e08bb7208a0005381cef (patch) | |
tree | baa469c52620ce7ae02def3e5e6a6f109cc89f40 /compiler/GHC | |
parent | 270fbe8512f04b6107755fa22bdec62205c0a567 (diff) | |
download | haskell-447864a94a1679b5b079e08bb7208a0005381cef.tar.gz |
Module hierarchy: StgToCmm (#13009)
Add StgToCmm module hierarchy. Platform modules that are used in several
other places (NCG, LLVM codegen, Cmm transformations) are put into
GHC.Platform.
Diffstat (limited to 'compiler/GHC')
27 files changed, 11330 insertions, 0 deletions
diff --git a/compiler/GHC/Platform/ARM.hs b/compiler/GHC/Platform/ARM.hs new file mode 100644 index 0000000000..d0c7e5811a --- /dev/null +++ b/compiler/GHC/Platform/ARM.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.ARM where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_arm 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/ARM64.hs b/compiler/GHC/Platform/ARM64.hs new file mode 100644 index 0000000000..ebd66b92c5 --- /dev/null +++ b/compiler/GHC/Platform/ARM64.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.ARM64 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_aarch64 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/NoRegs.hs b/compiler/GHC/Platform/NoRegs.hs new file mode 100644 index 0000000000..e8abf44253 --- /dev/null +++ b/compiler/GHC/Platform/NoRegs.hs @@ -0,0 +1,9 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.NoRegs where + +import GhcPrelude + +#define MACHREGS_NO_REGS 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/PPC.hs b/compiler/GHC/Platform/PPC.hs new file mode 100644 index 0000000000..f405f95438 --- /dev/null +++ b/compiler/GHC/Platform/PPC.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.PPC where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_powerpc 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/Regs.hs b/compiler/GHC/Platform/Regs.hs new file mode 100644 index 0000000000..e7887fbe72 --- /dev/null +++ b/compiler/GHC/Platform/Regs.hs @@ -0,0 +1,107 @@ + +module GHC.Platform.Regs + (callerSaves, activeStgRegs, haveRegBase, globalRegMaybe, freeReg) + where + +import GhcPrelude + +import CmmExpr +import GHC.Platform +import Reg + +import qualified GHC.Platform.ARM as ARM +import qualified GHC.Platform.ARM64 as ARM64 +import qualified GHC.Platform.PPC as PPC +import 
qualified GHC.Platform.SPARC as SPARC +import qualified GHC.Platform.X86 as X86 +import qualified GHC.Platform.X86_64 as X86_64 +import qualified GHC.Platform.NoRegs as NoRegs + +-- | Returns 'True' if this global register is stored in a caller-saves +-- machine register. + +callerSaves :: Platform -> GlobalReg -> Bool +callerSaves platform + | platformUnregisterised platform = NoRegs.callerSaves + | otherwise + = case platformArch platform of + ArchX86 -> X86.callerSaves + ArchX86_64 -> X86_64.callerSaves + ArchSPARC -> SPARC.callerSaves + ArchARM {} -> ARM.callerSaves + ArchARM64 -> ARM64.callerSaves + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.callerSaves + + | otherwise -> NoRegs.callerSaves + +-- | Here is where the STG register map is defined for each target arch. +-- The order matters (for the llvm backend anyway)! We must make sure to +-- maintain the order here with the order used in the LLVM calling conventions. +-- Note that also, this isn't all registers, just the ones that are currently +-- possbily mapped to real registers. 
+activeStgRegs :: Platform -> [GlobalReg] +activeStgRegs platform + | platformUnregisterised platform = NoRegs.activeStgRegs + | otherwise + = case platformArch platform of + ArchX86 -> X86.activeStgRegs + ArchX86_64 -> X86_64.activeStgRegs + ArchSPARC -> SPARC.activeStgRegs + ArchARM {} -> ARM.activeStgRegs + ArchARM64 -> ARM64.activeStgRegs + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.activeStgRegs + + | otherwise -> NoRegs.activeStgRegs + +haveRegBase :: Platform -> Bool +haveRegBase platform + | platformUnregisterised platform = NoRegs.haveRegBase + | otherwise + = case platformArch platform of + ArchX86 -> X86.haveRegBase + ArchX86_64 -> X86_64.haveRegBase + ArchSPARC -> SPARC.haveRegBase + ArchARM {} -> ARM.haveRegBase + ArchARM64 -> ARM64.haveRegBase + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.haveRegBase + + | otherwise -> NoRegs.haveRegBase + +globalRegMaybe :: Platform -> GlobalReg -> Maybe RealReg +globalRegMaybe platform + | platformUnregisterised platform = NoRegs.globalRegMaybe + | otherwise + = case platformArch platform of + ArchX86 -> X86.globalRegMaybe + ArchX86_64 -> X86_64.globalRegMaybe + ArchSPARC -> SPARC.globalRegMaybe + ArchARM {} -> ARM.globalRegMaybe + ArchARM64 -> ARM64.globalRegMaybe + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.globalRegMaybe + + | otherwise -> NoRegs.globalRegMaybe + +freeReg :: Platform -> RegNo -> Bool +freeReg platform + | platformUnregisterised platform = NoRegs.freeReg + | otherwise + = case platformArch platform of + ArchX86 -> X86.freeReg + ArchX86_64 -> X86_64.freeReg + ArchSPARC -> SPARC.freeReg + ArchARM {} -> ARM.freeReg + ArchARM64 -> ARM64.freeReg + arch + | arch `elem` [ArchPPC, ArchPPC_64 ELF_V1, ArchPPC_64 ELF_V2] -> + PPC.freeReg + + | otherwise -> NoRegs.freeReg + diff --git a/compiler/GHC/Platform/SPARC.hs b/compiler/GHC/Platform/SPARC.hs new file mode 100644 index 0000000000..b0cdb27f44 --- 
/dev/null +++ b/compiler/GHC/Platform/SPARC.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.SPARC where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_sparc 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/X86.hs b/compiler/GHC/Platform/X86.hs new file mode 100644 index 0000000000..1570ba9fa0 --- /dev/null +++ b/compiler/GHC/Platform/X86.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.X86 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_i386 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/Platform/X86_64.hs b/compiler/GHC/Platform/X86_64.hs new file mode 100644 index 0000000000..d2d1b15c71 --- /dev/null +++ b/compiler/GHC/Platform/X86_64.hs @@ -0,0 +1,10 @@ +{-# LANGUAGE CPP #-} + +module GHC.Platform.X86_64 where + +import GhcPrelude + +#define MACHREGS_NO_REGS 0 +#define MACHREGS_x86_64 1 +#include "../../../includes/CodeGen.Platform.hs" + diff --git a/compiler/GHC/StgToCmm.hs b/compiler/GHC/StgToCmm.hs new file mode 100644 index 0000000000..c7ee604692 --- /dev/null +++ b/compiler/GHC/StgToCmm.hs @@ -0,0 +1,223 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE DataKinds #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm ( codeGen ) where + +#include "HsVersions.h" + +import GhcPrelude as Prelude + +import GHC.StgToCmm.Prof (initCostCentres, ldvEnter) +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Bind +import GHC.StgToCmm.Con +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Hpc +import GHC.StgToCmm.Ticky + +import Cmm +import CmmUtils +import CLabel + +import StgSyn +import DynFlags +import ErrUtils + +import HscTypes 
+import CostCentre +import Id +import IdInfo +import RepType +import DataCon +import TyCon +import Module +import Outputable +import Stream +import BasicTypes +import VarSet ( isEmptyDVarSet ) + +import OrdList +import MkGraph + +import Data.IORef +import Control.Monad (when,void) +import Util + +codeGen :: DynFlags + -> Module + -> [TyCon] + -> CollectedCCs -- (Local/global) cost-centres needing declaring/registering. + -> [CgStgTopBinding] -- Bindings to convert + -> HpcInfo + -> Stream IO CmmGroup () -- Output as a stream, so codegen can + -- be interleaved with output + +codeGen dflags this_mod data_tycons + cost_centre_info stg_binds hpc_info + = do { -- cg: run the code generator, and yield the resulting CmmGroup + -- Using an IORef to store the state is a bit crude, but otherwise + -- we would need to add a state monad layer. + ; cgref <- liftIO $ newIORef =<< initC + ; let cg :: FCode () -> Stream IO CmmGroup () + cg fcode = do + cmm <- liftIO . withTiming (return dflags) (text "STG -> Cmm") (`seq` ()) $ do + st <- readIORef cgref + let (a,st') = runC dflags this_mod st (getCmm fcode) + + -- NB. stub-out cgs_tops and cgs_stmts. This fixes + -- a big space leak. DO NOT REMOVE! + writeIORef cgref $! st'{ cgs_tops = nilOL, + cgs_stmts = mkNop } + return a + yield cmm + + -- Note [codegen-split-init] the cmm_init block must come + -- FIRST. This is because when -split-objs is on we need to + -- combine this block with its initialisation routines; see + -- Note [pipeline-split-init]. + ; cg (mkModuleInit cost_centre_info this_mod hpc_info) + + ; mapM_ (cg . cgTopBinding dflags) stg_binds + + -- Put datatype_stuff after code_stuff, because the + -- datatype closure table (for enumeration types) to + -- (say) PrelBase_True_closure, which is defined in + -- code_stuff + ; let do_tycon tycon = do + -- Generate a table of static closures for an + -- enumeration type Note that the closure pointers are + -- tagged. 
+ when (isEnumerationTyCon tycon) $ cg (cgEnumerationTyCon tycon) + mapM_ (cg . cgDataCon) (tyConDataCons tycon) + + ; mapM_ do_tycon data_tycons + } + +--------------------------------------------------------------- +-- Top-level bindings +--------------------------------------------------------------- + +{- 'cgTopBinding' is only used for top-level bindings, since they need +to be allocated statically (not in the heap) and need to be labelled. +No unboxed bindings can happen at top level. + +In the code below, the static bindings are accumulated in the +@MkCgState@, and transferred into the ``statics'' slot by @forkStatics@. +This is so that we can write the top level processing in a compositional +style, with the increasing static environment being plumbed as a state +variable. -} + +cgTopBinding :: DynFlags -> CgStgTopBinding -> FCode () +cgTopBinding dflags (StgTopLifted (StgNonRec id rhs)) + = do { let (info, fcode) = cgTopRhs dflags NonRecursive id rhs + ; fcode + ; addBindC info + } + +cgTopBinding dflags (StgTopLifted (StgRec pairs)) + = do { let (bndrs, rhss) = unzip pairs + ; let pairs' = zip bndrs rhss + r = unzipWith (cgTopRhs dflags Recursive) pairs' + (infos, fcodes) = unzip r + ; addBindsC infos + ; sequence_ fcodes + } + +cgTopBinding dflags (StgTopStringLit id str) + = do { let label = mkBytesLabel (idName id) + ; let (lit, decl) = mkByteStringCLit label str + ; emitDecl decl + ; addBindC (litIdInfo dflags id mkLFStringLit lit) + } + +cgTopRhs :: DynFlags -> RecFlag -> Id -> CgStgRhs -> (CgIdInfo, FCode ()) + -- The Id is passed along for setting up a binding... 
+ +cgTopRhs dflags _rec bndr (StgRhsCon _cc con args) + = cgTopRhsCon dflags bndr con (assertNonVoidStgArgs args) + -- con args are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + +cgTopRhs dflags rec bndr (StgRhsClosure fvs cc upd_flag args body) + = ASSERT(isEmptyDVarSet fvs) -- There should be no free variables + cgTopRhsClosure dflags rec bndr cc upd_flag args body + + +--------------------------------------------------------------- +-- Module initialisation code +--------------------------------------------------------------- + +mkModuleInit + :: CollectedCCs -- cost centre info + -> Module + -> HpcInfo + -> FCode () + +mkModuleInit cost_centre_info this_mod hpc_info + = do { initHpc this_mod hpc_info + ; initCostCentres cost_centre_info + } + + +--------------------------------------------------------------- +-- Generating static stuff for algebraic data types +--------------------------------------------------------------- + + +cgEnumerationTyCon :: TyCon -> FCode () +cgEnumerationTyCon tycon + = do dflags <- getDynFlags + emitRODataLits (mkLocalClosureTableLabel (tyConName tycon) NoCafRefs) + [ CmmLabelOff (mkLocalClosureLabel (dataConName con) NoCafRefs) + (tagForCon dflags con) + | con <- tyConDataCons tycon] + + +cgDataCon :: DataCon -> FCode () +-- Generate the entry code, info tables, and (for niladic constructor) +-- the static closure, for a constructor. +cgDataCon data_con + = do { dflags <- getDynFlags + ; let + (tot_wds, -- #ptr_wds + #nonptr_wds + ptr_wds) -- #ptr_wds + = mkVirtConstrSizes dflags arg_reps + + nonptr_wds = tot_wds - ptr_wds + + dyn_info_tbl = + mkDataConInfoTable dflags data_con False ptr_wds nonptr_wds + + -- We're generating info tables, so we don't know and care about + -- what the actual arguments are. Using () here as the place holder. 
+ arg_reps :: [NonVoid PrimRep] + arg_reps = [ NonVoid rep_ty + | ty <- dataConRepArgTys data_con + , rep_ty <- typePrimRep ty + , not (isVoidRep rep_ty) ] + + ; emitClosureAndInfoTable dyn_info_tbl NativeDirectCall [] $ + -- NB: the closure pointer is assumed *untagged* on + -- entry to a constructor. If the pointer is tagged, + -- then we should not be entering it. This assumption + -- is used in ldvEnter and when tagging the pointer to + -- return it. + -- NB 2: We don't set CC when entering data (WDP 94/06) + do { tickyEnterDynCon + ; ldvEnter (CmmReg nodeReg) + ; tickyReturnOldCon (length arg_reps) + ; void $ emitReturn [cmmOffsetB dflags (CmmReg nodeReg) (tagForCon dflags data_con)] + } + -- The case continuation code expects a tagged pointer + } diff --git a/compiler/GHC/StgToCmm/ArgRep.hs b/compiler/GHC/StgToCmm/ArgRep.hs new file mode 100644 index 0000000000..cc2fe8306a --- /dev/null +++ b/compiler/GHC/StgToCmm/ArgRep.hs @@ -0,0 +1,160 @@ +----------------------------------------------------------------------------- +-- +-- Argument representations used in GHC.StgToCmm.Layout. +-- +-- (c) The University of Glasgow 2013 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.ArgRep ( + ArgRep(..), toArgRep, argRepSizeW, + + argRepString, isNonV, idArgRep, + + slowCallPattern, + + ) where + +import GhcPrelude + +import GHC.StgToCmm.Closure ( idPrimRep ) + +import SMRep ( WordOff ) +import Id ( Id ) +import TyCon ( PrimRep(..), primElemRepSizeB ) +import BasicTypes ( RepArity ) +import Constants ( wORD64_SIZE ) +import DynFlags + +import Outputable +import FastString + +-- I extricated this code as this new module in order to avoid a +-- cyclic dependency between GHC.StgToCmm.Layout and GHC.StgToCmm.Ticky. 
+-- +-- NSF 18 Feb 2013 + +------------------------------------------------------------------------- +-- Classifying arguments: ArgRep +------------------------------------------------------------------------- + +-- ArgRep is re-exported by GHC.StgToCmm.Layout, but only for use in the +-- byte-code generator which also needs to know about the +-- classification of arguments. + +data ArgRep = P -- GC Ptr + | N -- Word-sized non-ptr + | L -- 64-bit non-ptr (long) + | V -- Void + | F -- Float + | D -- Double + | V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc. + | V32 -- 32-byte (256-bit) vectors of Float/Double/Int8/Word32/etc. + | V64 -- 64-byte (512-bit) vectors of Float/Double/Int8/Word32/etc. +instance Outputable ArgRep where ppr = text . argRepString + +argRepString :: ArgRep -> String +argRepString P = "P" +argRepString N = "N" +argRepString L = "L" +argRepString V = "V" +argRepString F = "F" +argRepString D = "D" +argRepString V16 = "V16" +argRepString V32 = "V32" +argRepString V64 = "V64" + +toArgRep :: PrimRep -> ArgRep +toArgRep VoidRep = V +toArgRep LiftedRep = P +toArgRep UnliftedRep = P +toArgRep IntRep = N +toArgRep WordRep = N +toArgRep Int8Rep = N -- Gets widened to native word width for calls +toArgRep Word8Rep = N -- Gets widened to native word width for calls +toArgRep Int16Rep = N -- Gets widened to native word width for calls +toArgRep Word16Rep = N -- Gets widened to native word width for calls +toArgRep Int32Rep = N -- Gets widened to native word width for calls +toArgRep Word32Rep = N -- Gets widened to native word width for calls +toArgRep AddrRep = N +toArgRep Int64Rep = L +toArgRep Word64Rep = L +toArgRep FloatRep = F +toArgRep DoubleRep = D +toArgRep (VecRep len elem) = case len*primElemRepSizeB elem of + 16 -> V16 + 32 -> V32 + 64 -> V64 + _ -> error "toArgRep: bad vector primrep" + +isNonV :: ArgRep -> Bool +isNonV V = False +isNonV _ = True + +argRepSizeW :: DynFlags -> ArgRep -> WordOff -- Size in words +argRepSizeW _ 
N = 1 +argRepSizeW _ P = 1 +argRepSizeW _ F = 1 +argRepSizeW dflags L = wORD64_SIZE `quot` wORD_SIZE dflags +argRepSizeW dflags D = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags +argRepSizeW _ V = 0 +argRepSizeW dflags V16 = 16 `quot` wORD_SIZE dflags +argRepSizeW dflags V32 = 32 `quot` wORD_SIZE dflags +argRepSizeW dflags V64 = 64 `quot` wORD_SIZE dflags + +idArgRep :: Id -> ArgRep +idArgRep = toArgRep . idPrimRep + +-- This list of argument patterns should be kept in sync with at least +-- the following: +-- +-- * GHC.StgToCmm.Layout.stdPattern maybe to some degree? +-- +-- * the RTS_RET(stg_ap_*) and RTS_FUN_DECL(stg_ap_*_fast) +-- declarations in includes/stg/MiscClosures.h +-- +-- * the SLOW_CALL_*_ctr declarations in includes/stg/Ticky.h, +-- +-- * the TICK_SLOW_CALL_*() #defines in includes/Cmm.h, +-- +-- * the PR_CTR(SLOW_CALL_*_ctr) calls in rts/Ticky.c, +-- +-- * and the SymI_HasProto(stg_ap_*_{ret,info,fast}) calls and +-- SymI_HasProto(SLOW_CALL_*_ctr) calls in rts/Linker.c +-- +-- There may be more places that I haven't found; I merely igrep'd for +-- pppppp and excluded things that seemed ghci-specific. +-- +-- Also, it seems at the moment that ticky counters with void +-- arguments will never be bumped, but I'm still declaring those +-- counters, defensively. 
+-- +-- NSF 6 Mar 2013 + +slowCallPattern :: [ArgRep] -> (FastString, RepArity) +-- Returns the generic apply function and arity +-- +-- The first batch of cases match (some) specialised entries +-- The last group deals exhaustively with the cases for the first argument +-- (and the zero-argument case) +-- +-- In 99% of cases this function will match *all* the arguments in one batch + +slowCallPattern (P: P: P: P: P: P: _) = (fsLit "stg_ap_pppppp", 6) +slowCallPattern (P: P: P: P: P: _) = (fsLit "stg_ap_ppppp", 5) +slowCallPattern (P: P: P: P: _) = (fsLit "stg_ap_pppp", 4) +slowCallPattern (P: P: P: V: _) = (fsLit "stg_ap_pppv", 4) +slowCallPattern (P: P: P: _) = (fsLit "stg_ap_ppp", 3) +slowCallPattern (P: P: V: _) = (fsLit "stg_ap_ppv", 3) +slowCallPattern (P: P: _) = (fsLit "stg_ap_pp", 2) +slowCallPattern (P: V: _) = (fsLit "stg_ap_pv", 2) +slowCallPattern (P: _) = (fsLit "stg_ap_p", 1) +slowCallPattern (V: _) = (fsLit "stg_ap_v", 1) +slowCallPattern (N: _) = (fsLit "stg_ap_n", 1) +slowCallPattern (F: _) = (fsLit "stg_ap_f", 1) +slowCallPattern (D: _) = (fsLit "stg_ap_d", 1) +slowCallPattern (L: _) = (fsLit "stg_ap_l", 1) +slowCallPattern (V16: _) = (fsLit "stg_ap_v16", 1) +slowCallPattern (V32: _) = (fsLit "stg_ap_v32", 1) +slowCallPattern (V64: _) = (fsLit "stg_ap_v64", 1) +slowCallPattern [] = (fsLit "stg_ap_0", 0) diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs new file mode 100644 index 0000000000..bfe9255783 --- /dev/null +++ b/compiler/GHC/StgToCmm/Bind.hs @@ -0,0 +1,753 @@ +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: bindings +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Bind ( + cgTopRhsClosure, + cgBind, + emitBlackHoleCode, + pushUpdateFrame, emitUpdateFrame + ) where + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Expr +import 
GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Con +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Prof (ldvEnterClosure, enterCostCentreFun, enterCostCentreThunk, + initUpdFrameProf) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Foreign (emitPrimCall) + +import MkGraph +import CoreSyn ( AltCon(..), tickishIsCode ) +import BlockId +import SMRep +import Cmm +import CmmInfo +import CmmUtils +import CLabel +import StgSyn +import CostCentre +import Id +import IdInfo +import Name +import Module +import ListSetOps +import Util +import VarSet +import BasicTypes +import Outputable +import FastString +import DynFlags + +import Control.Monad + +------------------------------------------------------------------------ +-- Top-level bindings +------------------------------------------------------------------------ + +-- For closures bound at top level, allocate in static space. +-- They should have no free variables. + +cgTopRhsClosure :: DynFlags + -> RecFlag -- member of a recursive group? + -> Id + -> CostCentreStack -- Optional cost centre annotation + -> UpdateFlag + -> [Id] -- Args + -> CgStgExpr + -> (CgIdInfo, FCode ()) + +cgTopRhsClosure dflags rec id ccs upd_flag args body = + let closure_label = mkLocalClosureLabel (idName id) (idCafInfo id) + cg_id_info = litIdInfo dflags id lf_info (CmmLabel closure_label) + lf_info = mkClosureLFInfo dflags id TopLevel [] upd_flag args + in (cg_id_info, gen_code dflags lf_info closure_label) + where + -- special case for a indirection (f = g). We create an IND_STATIC + -- closure pointing directly to the indirectee. This is exactly + -- what the CAF will eventually evaluate to anyway, we're just + -- shortcutting the whole process, and generating a lot less code + -- (#7308). Eventually the IND_STATIC closure will be eliminated + -- by assembly '.equiv' directives, where possible (#15155). 
+ -- See note [emit-time elimination of static indirections] in CLabel. + -- + -- Note: we omit the optimisation when this binding is part of a + -- recursive group, because the optimisation would inhibit the black + -- hole detection from working in that case. Test + -- concurrent/should_run/4030 fails, for instance. + -- + gen_code dflags _ closure_label + | StgApp f [] <- body, null args, isNonRec rec + = do + cg_info <- getCgIdInfo f + let closure_rep = mkStaticClosureFields dflags + indStaticInfoTable ccs MayHaveCafRefs + [unLit (idInfoToAmode cg_info)] + emitDataLits closure_label closure_rep + return () + + gen_code dflags lf_info _closure_label + = do { let name = idName id + ; mod_name <- getModuleName + ; let descr = closureDescription dflags mod_name name + closure_info = mkClosureInfo dflags True id lf_info 0 0 descr + + -- We don't generate the static closure here, because we might + -- want to add references to static closures to it later. The + -- static closure is generated by CmmBuildInfoTables.updInfoSRTs, + -- See Note [SRTs], specifically the [FUN] optimisation. 
+ + ; let fv_details :: [(NonVoid Id, ByteOff)] + header = if isLFThunk lf_info then ThunkHeader else StdHeader + (_, _, fv_details) = mkVirtHeapOffsets dflags header [] + -- Don't drop the non-void args until the closure info has been made + ; forkClosureBody (closureCodeBody True id closure_info ccs + (nonVoidIds args) (length args) body fv_details) + + ; return () } + + unLit (CmmLit l) = l + unLit _ = panic "unLit" + +------------------------------------------------------------------------ +-- Non-top-level bindings +------------------------------------------------------------------------ + +cgBind :: CgStgBinding -> FCode () +cgBind (StgNonRec name rhs) + = do { (info, fcode) <- cgRhs name rhs + ; addBindC info + ; init <- fcode + ; emit init } + -- init cannot be used in body, so slightly better to sink it eagerly + +cgBind (StgRec pairs) + = do { r <- sequence $ unzipWith cgRhs pairs + ; let (id_infos, fcodes) = unzip r + ; addBindsC id_infos + ; (inits, body) <- getCodeR $ sequence fcodes + ; emit (catAGraphs inits <*> body) } + +{- Note [cgBind rec] + + Recursive let-bindings are tricky. + Consider the following pseudocode: + + let x = \_ -> ... y ... + y = \_ -> ... z ... + z = \_ -> ... x ... + in ... + + For each binding, we need to allocate a closure, and each closure must + capture the address of the other closures. + We want to generate the following C-- code: + // Initialization Code + x = hp - 24; // heap address of x's closure + y = hp - 40; // heap address of x's closure + z = hp - 64; // heap address of x's closure + // allocate and initialize x + m[hp-8] = ... + m[hp-16] = y // the closure for x captures y + m[hp-24] = x_info; + // allocate and initialize y + m[hp-32] = z; // the closure for y captures z + m[hp-40] = y_info; + // allocate and initialize z + ... 
+ + For each closure, we must generate not only the code to allocate and + initialize the closure itself, but also some initialization Code that + sets a variable holding the closure pointer. + + We could generate a pair of the (init code, body code), but since + the bindings are recursive we also have to initialise the + environment with the CgIdInfo for all the bindings before compiling + anything. So we do this in 3 stages: + + 1. collect all the CgIdInfos and initialise the environment + 2. compile each binding into (init, body) code + 3. emit all the inits, and then all the bodies + + We'd rather not have separate functions to do steps 1 and 2 for + each binding, since in pratice they share a lot of code. So we + have just one function, cgRhs, that returns a pair of the CgIdInfo + for step 1, and a monadic computation to generate the code in step + 2. + + The alternative to separating things in this way is to use a + fixpoint. That's what we used to do, but it introduces a + maintenance nightmare because there is a subtle dependency on not + being too strict everywhere. Doing things this way means that the + FCode monad can be strict, for example. 
+ -} + +cgRhs :: Id + -> CgStgRhs + -> FCode ( + CgIdInfo -- The info for this binding + , FCode CmmAGraph -- A computation which will generate the + -- code for the binding, and return an + -- assignent of the form "x = Hp - n" + -- (see above) + ) + +cgRhs id (StgRhsCon cc con args) + = withNewTickyCounterCon (idName id) $ + buildDynCon id True cc con (assertNonVoidStgArgs args) + -- con args are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + +{- See Note [GC recovery] in compiler/GHC.StgToCmm/Closure.hs -} +cgRhs id (StgRhsClosure fvs cc upd_flag args body) + = do dflags <- getDynFlags + mkRhsClosure dflags id cc (nonVoidIds (dVarSetElems fvs)) upd_flag args body + +------------------------------------------------------------------------ +-- Non-constructor right hand sides +------------------------------------------------------------------------ + +mkRhsClosure :: DynFlags -> Id -> CostCentreStack + -> [NonVoid Id] -- Free vars + -> UpdateFlag + -> [Id] -- Args + -> CgStgExpr + -> FCode (CgIdInfo, FCode CmmAGraph) + +{- mkRhsClosure looks for two special forms of the right-hand side: + a) selector thunks + b) AP thunks + +If neither happens, it just calls mkClosureLFInfo. You might think +that mkClosureLFInfo should do all this, but it seems wrong for the +latter to look at the structure of an expression + +Note [Selectors] +~~~~~~~~~~~~~~~~ +We look at the body of the closure to see if it's a selector---turgid, +but nothing deep. We are looking for a closure of {\em exactly} the +form: + +... = [the_fv] \ u [] -> + case the_fv of + con a_1 ... a_n -> a_i + +Note [Ap thunks] +~~~~~~~~~~~~~~~~ +A more generic AP thunk of the form + + x = [ x_1...x_n ] \.. [] -> x_1 ... x_n + +A set of these is compiled statically into the RTS, so we just use +those. We could extend the idea to thunks where some of the x_i are +global ids (and hence not free variables), but this would entail +generating a larger thunk. 
It might be an option for non-optimising +compilation, though. + +We only generate an Ap thunk if all the free variables are pointers, +for semi-obvious reasons. + +-} + +---------- Note [Selectors] ------------------ +mkRhsClosure dflags bndr _cc + [NonVoid the_fv] -- Just one free var + upd_flag -- Updatable thunk + [] -- A thunk + expr + | let strip = stripStgTicksTopE (not . tickishIsCode) + , StgCase (StgApp scrutinee [{-no args-}]) + _ -- ignore bndr + (AlgAlt _) + [(DataAlt _, params, sel_expr)] <- strip expr + , StgApp selectee [{-no args-}] <- strip sel_expr + , the_fv == scrutinee -- Scrutinee is the only free variable + + , let (_, _, params_w_offsets) = mkVirtConstrOffsets dflags (addIdReps (assertNonVoidIds params)) + -- pattern binders are always non-void, + -- see Note [Post-unarisation invariants] in UnariseStg + , Just the_offset <- assocMaybe params_w_offsets (NonVoid selectee) + + , let offset_into_int = bytesToWordsRoundUp dflags the_offset + - fixedHdrSizeW dflags + , offset_into_int <= mAX_SPEC_SELECTEE_SIZE dflags -- Offset is small enough + = -- NOT TRUE: ASSERT(is_single_constructor) + -- The simplifier may have statically determined that the single alternative + -- is the only possible case and eliminated the others, even if there are + -- other constructors in the datatype. It's still ok to make a selector + -- thunk in this case, because we *know* which constructor the scrutinee + -- will evaluate to. + -- + -- srt is discarded; it must be empty + let lf_info = mkSelectorLFInfo bndr offset_into_int (isUpdatable upd_flag) + in cgRhsStdThunk bndr lf_info [StgVarArg the_fv] + +---------- Note [Ap thunks] ------------------ +mkRhsClosure dflags bndr _cc + fvs + upd_flag + [] -- No args; a thunk + (StgApp fun_id args) + + -- We are looking for an "ApThunk"; see data con ApThunk in GHC.StgToCmm.Closure + -- of form (x1 x2 .... 
xn), where all the xi are locals (not top-level) + -- So the xi will all be free variables + | args `lengthIs` (n_fvs-1) -- This happens only if the fun_id and + -- args are all distinct local variables + -- The "-1" is for fun_id + -- Missed opportunity: (f x x) is not detected + , all (isGcPtrRep . idPrimRep . fromNonVoid) fvs + , isUpdatable upd_flag + , n_fvs <= mAX_SPEC_AP_SIZE dflags + , not (gopt Opt_SccProfilingOn dflags) + -- not when profiling: we don't want to + -- lose information about this particular + -- thunk (e.g. its type) (#949) + , idArity fun_id == unknownArity -- don't spoil a known call + + -- Ha! an Ap thunk + = cgRhsStdThunk bndr lf_info payload + + where + n_fvs = length fvs + lf_info = mkApLFInfo bndr upd_flag n_fvs + -- the payload has to be in the correct order, hence we can't + -- just use the fvs. + payload = StgVarArg fun_id : args + +---------- Default case ------------------ +mkRhsClosure dflags bndr cc fvs upd_flag args body + = do { let lf_info = mkClosureLFInfo dflags bndr NotTopLevel fvs upd_flag args + ; (id_info, reg) <- rhsIdInfo bndr lf_info + ; return (id_info, gen_code lf_info reg) } + where + gen_code lf_info reg + = do { -- LAY OUT THE OBJECT + -- If the binder is itself a free variable, then don't store + -- it in the closure. Instead, just bind it to Node on entry. + -- NB we can be sure that Node will point to it, because we + -- haven't told mkClosureLFInfo about this; so if the binder + -- _was_ a free var of its RHS, mkClosureLFInfo thinks it *is* + -- stored in the closure itself, so it will make sure that + -- Node points to it... 
+ ; let reduced_fvs = filter (NonVoid bndr /=) fvs + + -- MAKE CLOSURE INFO FOR THIS CLOSURE + ; mod_name <- getModuleName + ; dflags <- getDynFlags + ; let name = idName bndr + descr = closureDescription dflags mod_name name + fv_details :: [(NonVoid Id, ByteOff)] + header = if isLFThunk lf_info then ThunkHeader else StdHeader + (tot_wds, ptr_wds, fv_details) + = mkVirtHeapOffsets dflags header (addIdReps reduced_fvs) + closure_info = mkClosureInfo dflags False -- Not static + bndr lf_info tot_wds ptr_wds + descr + + -- BUILD ITS INFO TABLE AND CODE + ; forkClosureBody $ + -- forkClosureBody: (a) ensure that bindings in here are not seen elsewhere + -- (b) ignore Sequel from context; use empty Sequel + -- And compile the body + closureCodeBody False bndr closure_info cc (nonVoidIds args) + (length args) body fv_details + + -- BUILD THE OBJECT +-- ; (use_cc, blame_cc) <- chooseDynCostCentres cc args body + ; let use_cc = cccsExpr; blame_cc = cccsExpr + ; emit (mkComment $ mkFastString "calling allocDynClosure") + ; let toVarArg (NonVoid a, off) = (NonVoid (StgVarArg a), off) + ; let info_tbl = mkCmmInfo closure_info bndr currentCCS + ; hp_plus_n <- allocDynClosure (Just bndr) info_tbl lf_info use_cc blame_cc + (map toVarArg fv_details) + + -- RETURN + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + +------------------------- +cgRhsStdThunk + :: Id + -> LambdaFormInfo + -> [StgArg] -- payload + -> FCode (CgIdInfo, FCode CmmAGraph) + +cgRhsStdThunk bndr lf_info payload + = do { (id_info, reg) <- rhsIdInfo bndr lf_info + ; return (id_info, gen_code reg) + } + where + gen_code reg -- AHA! 
A STANDARD-FORM THUNK + = withNewTickyCounterStdThunk (lfUpdatable lf_info) (idName bndr) $ + do + { -- LAY OUT THE OBJECT + mod_name <- getModuleName + ; dflags <- getDynFlags + ; let header = if isLFThunk lf_info then ThunkHeader else StdHeader + (tot_wds, ptr_wds, payload_w_offsets) + = mkVirtHeapOffsets dflags header + (addArgReps (nonVoidStgArgs payload)) + + descr = closureDescription dflags mod_name (idName bndr) + closure_info = mkClosureInfo dflags False -- Not static + bndr lf_info tot_wds ptr_wds + descr + +-- ; (use_cc, blame_cc) <- chooseDynCostCentres cc [{- no args-}] body + ; let use_cc = cccsExpr; blame_cc = cccsExpr + + + -- BUILD THE OBJECT + ; let info_tbl = mkCmmInfo closure_info bndr currentCCS + ; hp_plus_n <- allocDynClosure (Just bndr) info_tbl lf_info + use_cc blame_cc payload_w_offsets + + -- RETURN + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + + +mkClosureLFInfo :: DynFlags + -> Id -- The binder + -> TopLevelFlag -- True of top level + -> [NonVoid Id] -- Free vars + -> UpdateFlag -- Update flag + -> [Id] -- Args + -> LambdaFormInfo +mkClosureLFInfo dflags bndr top fvs upd_flag args + | null args = + mkLFThunk (idType bndr) top (map fromNonVoid fvs) upd_flag + | otherwise = + mkLFReEntrant top (map fromNonVoid fvs) args (mkArgDescr dflags args) + + +------------------------------------------------------------------------ +-- The code for closures +------------------------------------------------------------------------ + +closureCodeBody :: Bool -- whether this is a top-level binding + -> Id -- the closure's name + -> ClosureInfo -- Lots of information about this closure + -> CostCentreStack -- Optional cost centre attached to closure + -> [NonVoid Id] -- incoming args to the closure + -> Int -- arity, including void args + -> CgStgExpr + -> [(NonVoid Id, ByteOff)] -- the closure's free vars + -> FCode () + +{- There are two main cases for the code for closures. 
+ +* If there are *no arguments*, then the closure is a thunk, and not in + normal form. So it should set up an update frame (if it is + shared). NB: Thunks cannot have a primitive type! + +* If there is *at least one* argument, then this closure is in + normal form, so there is no need to set up an update frame. +-} + +closureCodeBody top_lvl bndr cl_info cc _args arity body fv_details + | arity == 0 -- No args i.e. thunk + = withNewTickyCounterThunk + (isStaticClosure cl_info) + (closureUpdReqd cl_info) + (closureName cl_info) $ + emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl [] $ + \(_, node, _) -> thunkCode cl_info fv_details cc node arity body + where + lf_info = closureLFInfo cl_info + info_tbl = mkCmmInfo cl_info bndr cc + +closureCodeBody top_lvl bndr cl_info cc args arity body fv_details + = -- Note: args may be [], if all args are Void + withNewTickyCounterFun + (closureSingleEntry cl_info) + (closureName cl_info) + args $ do { + + ; let + lf_info = closureLFInfo cl_info + info_tbl = mkCmmInfo cl_info bndr cc + + -- Emit the main entry code + ; emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl args $ + \(_offset, node, arg_regs) -> do + -- Emit slow-entry code (for entering a closure through a PAP) + { mkSlowEntryCode bndr cl_info arg_regs + ; dflags <- getDynFlags + ; let node_points = nodeMustPointToIt dflags lf_info + node' = if node_points then Just node else Nothing + ; loop_header_id <- newBlockId + -- Extend reader monad with information that + -- self-recursive tail calls can be optimized into local + -- jumps. See Note [Self-recursive tail calls] in GHC.StgToCmm.Expr. 
+ ; withSelfLoop (bndr, loop_header_id, arg_regs) $ do + { + -- Main payload + ; entryHeapCheck cl_info node' arity arg_regs $ do + { -- emit LDV code when profiling + when node_points (ldvEnterClosure cl_info (CmmLocal node)) + -- ticky after heap check to avoid double counting + ; tickyEnterFun cl_info + ; enterCostCentreFun cc + (CmmMachOp (mo_wordSub dflags) + [ CmmReg (CmmLocal node) -- See [NodeReg clobbered with loopification] + , mkIntExpr dflags (funTag dflags cl_info) ]) + ; fv_bindings <- mapM bind_fv fv_details + -- Load free vars out of closure *after* + -- heap check, to reduce live vars over check + ; when node_points $ load_fvs node lf_info fv_bindings + ; void $ cgExpr body + }}} + + } + +-- Note [NodeReg clobbered with loopification] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Previously we used to pass nodeReg (aka R1) here. With profiling, upon +-- entering a closure, enterFunCCS was called with R1 passed to it. But since R1 +-- may get clobbered inside the body of a closure, and since a self-recursive +-- tail call does not restore R1, a subsequent call to enterFunCCS received a +-- possibly bogus value from R1. The solution is to not pass nodeReg (aka R1) to +-- enterFunCCS. Instead, we pass node, the callee-saved temporary that stores +-- the original value of R1. This way R1 may get modified but loopification will +-- not care. + +-- A function closure pointer may be tagged, so we +-- must take it into account when accessing the free variables. +bind_fv :: (NonVoid Id, ByteOff) -> FCode (LocalReg, ByteOff) +bind_fv (id, off) = do { reg <- rebindToReg id; return (reg, off) } + +load_fvs :: LocalReg -> LambdaFormInfo -> [(LocalReg, ByteOff)] -> FCode () +load_fvs node lf_info = mapM_ (\ (reg, off) -> + do dflags <- getDynFlags + let tag = lfDynTag dflags lf_info + emit $ mkTaggedObjectLoad dflags reg node off tag) + +----------------------------------------- +-- The "slow entry" code for a function. 
This entry point takes its +-- arguments on the stack. It loads the arguments into registers +-- according to the calling convention, and jumps to the function's +-- normal entry point. The function's closure is assumed to be in +-- R1/node. +-- +-- The slow entry point is used for unknown calls: eg. stg_PAP_entry + +mkSlowEntryCode :: Id -> ClosureInfo -> [LocalReg] -> FCode () +-- If this function doesn't have a specialised ArgDescr, we need +-- to generate the function's arg bitmap and slow-entry code. +-- Here, we emit the slow-entry code. +mkSlowEntryCode bndr cl_info arg_regs -- function closure is already in `Node' + | Just (_, ArgGen _) <- closureFunInfo cl_info + = do dflags <- getDynFlags + let node = idToReg dflags (NonVoid bndr) + slow_lbl = closureSlowEntryLabel cl_info + fast_lbl = closureLocalEntryLabel dflags cl_info + -- mkDirectJump does not clobber `Node' containing function closure + jump = mkJump dflags NativeNodeCall + (mkLblExpr fast_lbl) + (map (CmmReg . CmmLocal) (node : arg_regs)) + (initUpdFrameOff dflags) + tscope <- getTickScope + emitProcWithConvention Slow Nothing slow_lbl + (node : arg_regs) (jump, tscope) + | otherwise = return () + +----------------------------------------- +thunkCode :: ClosureInfo -> [(NonVoid Id, ByteOff)] -> CostCentreStack + -> LocalReg -> Int -> CgStgExpr -> FCode () +thunkCode cl_info fv_details _cc node arity body + = do { dflags <- getDynFlags + ; let node_points = nodeMustPointToIt dflags (closureLFInfo cl_info) + node' = if node_points then Just node else Nothing + ; ldvEnterClosure cl_info (CmmLocal node) -- NB: Node always points when profiling + + -- Heap overflow check + ; entryHeapCheck cl_info node' arity [] $ do + { -- Overwrite with black hole if necessary + -- but *after* the heap-overflow check + ; tickyEnterThunk cl_info + ; when (blackHoleOnEntry cl_info && node_points) + (blackHoleIt node) + + -- Push update frame + ; setupUpdate cl_info node $ + -- We only enter cc after setting up update 
so + -- that cc of enclosing scope will be recorded + -- in update frame CAF/DICT functions will be + -- subsumed by this enclosing cc + do { enterCostCentreThunk (CmmReg nodeReg) + ; let lf_info = closureLFInfo cl_info + ; fv_bindings <- mapM bind_fv fv_details + ; load_fvs node lf_info fv_bindings + ; void $ cgExpr body }}} + + +------------------------------------------------------------------------ +-- Update and black-hole wrappers +------------------------------------------------------------------------ + +blackHoleIt :: LocalReg -> FCode () +-- Only called for closures with no args +-- Node points to the closure +blackHoleIt node_reg + = emitBlackHoleCode (CmmReg (CmmLocal node_reg)) + +emitBlackHoleCode :: CmmExpr -> FCode () +emitBlackHoleCode node = do + dflags <- getDynFlags + + -- Eager blackholing is normally disabled, but can be turned on with + -- -feager-blackholing. When it is on, we replace the info pointer + -- of the thunk with stg_EAGER_BLACKHOLE_info on entry. + + -- If we wanted to do eager blackholing with slop filling, we'd need + -- to do it at the *end* of a basic block, otherwise we overwrite + -- the free variables in the thunk that we still need. We have a + -- patch for this from Andy Cheadle, but not incorporated yet. --SDM + -- [6/2004] + -- + -- Previously, eager blackholing was enabled when ticky-ticky was + -- on. But it didn't work, and it wasn't strictly necessary to bring + -- back minimal ticky-ticky, so now EAGER_BLACKHOLING is + -- unconditionally disabled. -- krc 1/2007 + + -- Note the eager-blackholing check is here rather than in blackHoleOnEntry, + -- because emitBlackHoleCode is called from CmmParse. + + let eager_blackholing = not (gopt Opt_SccProfilingOn dflags) + && gopt Opt_EagerBlackHoling dflags + -- Profiling needs slop filling (to support LDV + -- profiling), so currently eager blackholing doesn't + -- work with profiling. 
+ + when eager_blackholing $ do + emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr + -- See Note [Heap memory barriers] in SMP.h. + emitPrimCall [] MO_WriteBarrier [] + emitStore node (CmmReg (CmmGlobal EagerBlackholeInfo)) + +setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode () + -- Nota Bene: this function does not change Node (even if it's a CAF), + -- so that the cost centre in the original closure can still be + -- extracted by a subsequent enterCostCentre +setupUpdate closure_info node body + | not (lfUpdatable (closureLFInfo closure_info)) + = body + + | not (isStaticClosure closure_info) + = if not (closureUpdReqd closure_info) + then do tickyUpdateFrameOmitted; body + else do + tickyPushUpdateFrame + dflags <- getDynFlags + let + bh = blackHoleOnEntry closure_info && + not (gopt Opt_SccProfilingOn dflags) && + gopt Opt_EagerBlackHoling dflags + + lbl | bh = mkBHUpdInfoLabel + | otherwise = mkUpdInfoLabel + + pushUpdateFrame lbl (CmmReg (CmmLocal node)) body + + | otherwise -- A static closure + = do { tickyUpdateBhCaf closure_info + + ; if closureUpdReqd closure_info + then do -- Blackhole the (updatable) CAF: + { upd_closure <- link_caf node + ; pushUpdateFrame mkBHUpdInfoLabel upd_closure body } + else do {tickyUpdateFrameOmitted; body} + } + +----------------------------------------------------------------------------- +-- Setting up update frames + +-- Push the update frame on the stack in the Entry area, +-- leaving room for the return address that is already +-- at the old end of the area. 
+-- +pushUpdateFrame :: CLabel -> CmmExpr -> FCode () -> FCode () +pushUpdateFrame lbl updatee body + = do + updfr <- getUpdFrameOff + dflags <- getDynFlags + let + hdr = fixedHdrSize dflags + frame = updfr + hdr + sIZEOF_StgUpdateFrame_NoHdr dflags + -- + emitUpdateFrame dflags (CmmStackSlot Old frame) lbl updatee + withUpdFrameOff frame body + +emitUpdateFrame :: DynFlags -> CmmExpr -> CLabel -> CmmExpr -> FCode () +emitUpdateFrame dflags frame lbl updatee = do + let + hdr = fixedHdrSize dflags + off_updatee = hdr + oFFSET_StgUpdateFrame_updatee dflags + -- + emitStore frame (mkLblExpr lbl) + emitStore (cmmOffset dflags frame off_updatee) updatee + initUpdFrameProf frame + +----------------------------------------------------------------------------- +-- Entering a CAF +-- +-- See Note [CAF management] in rts/sm/Storage.c + +link_caf :: LocalReg -- pointer to the closure + -> FCode CmmExpr -- Returns amode for closure to be updated +-- This function returns the address of the black hole, so it can be +-- updated with the new value when available. 
+link_caf node = do + { dflags <- getDynFlags + -- Call the RTS function newCAF, returning the newly-allocated + -- blackhole indirection closure + ; let newCAF_lbl = mkForeignLabel (fsLit "newCAF") Nothing + ForeignLabelInExternalPackage IsFunction + ; bh <- newTemp (bWord dflags) + ; emitRtsCallGen [(bh,AddrHint)] newCAF_lbl + [ (baseExpr, AddrHint), + (CmmReg (CmmLocal node), AddrHint) ] + False + + -- see Note [atomic CAF entry] in rts/sm/Storage.c + ; updfr <- getUpdFrameOff + ; let target = entryCode dflags (closureInfoPtr dflags (CmmReg (CmmLocal node))) + ; emit =<< mkCmmIfThen + (cmmEqWord dflags (CmmReg (CmmLocal bh)) (zeroExpr dflags)) + -- re-enter the CAF + (mkJump dflags NativeNodeCall target [] updfr) + + ; return (CmmReg (CmmLocal bh)) } + +------------------------------------------------------------------------ +-- Profiling +------------------------------------------------------------------------ + +-- For "global" data constructors the description is simply occurrence +-- name of the data constructor itself. Otherwise it is determined by +-- @closureDescription@ from the let binding information. + +closureDescription :: DynFlags + -> Module -- Module + -> Name -- Id of closure binding + -> String + -- Not called for StgRhsCon which have global info tables built in + -- CgConTbls.hs with a description generated from the data constructor +closureDescription dflags mod_name name + = showSDocDump dflags (char '<' <> + (if isExternalName name + then ppr name -- ppr will include the module name prefix + else pprModule mod_name <> char '.' <> ppr name) <> + char '>') + -- showSDocDump, because we want to see the unique on the Name. 
diff --git a/compiler/GHC/StgToCmm/Bind.hs-boot b/compiler/GHC/StgToCmm/Bind.hs-boot new file mode 100644 index 0000000000..d16c34ebd3 --- /dev/null +++ b/compiler/GHC/StgToCmm/Bind.hs-boot @@ -0,0 +1,6 @@ +module GHC.StgToCmm.Bind where + +import GHC.StgToCmm.Monad( FCode ) +import StgSyn( CgStgBinding ) + +cgBind :: CgStgBinding -> FCode () diff --git a/compiler/GHC/StgToCmm/CgUtils.hs b/compiler/GHC/StgToCmm/CgUtils.hs new file mode 100644 index 0000000000..f3dccd9745 --- /dev/null +++ b/compiler/GHC/StgToCmm/CgUtils.hs @@ -0,0 +1,186 @@ +{-# LANGUAGE GADTs #-} + +----------------------------------------------------------------------------- +-- +-- Code generator utilities; mostly monadic +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.CgUtils ( + fixStgRegisters, + baseRegOffset, + get_Regtable_addr_from_offset, + regTableOffset, + get_GlobalReg_addr, + ) where + +import GhcPrelude + +import GHC.Platform.Regs +import Cmm +import Hoopl.Block +import Hoopl.Graph +import CmmUtils +import CLabel +import DynFlags +import Outputable + +-- ----------------------------------------------------------------------------- +-- Information about global registers + +baseRegOffset :: DynFlags -> GlobalReg -> Int + +baseRegOffset dflags (VanillaReg 1 _) = oFFSET_StgRegTable_rR1 dflags +baseRegOffset dflags (VanillaReg 2 _) = oFFSET_StgRegTable_rR2 dflags +baseRegOffset dflags (VanillaReg 3 _) = oFFSET_StgRegTable_rR3 dflags +baseRegOffset dflags (VanillaReg 4 _) = oFFSET_StgRegTable_rR4 dflags +baseRegOffset dflags (VanillaReg 5 _) = oFFSET_StgRegTable_rR5 dflags +baseRegOffset dflags (VanillaReg 6 _) = oFFSET_StgRegTable_rR6 dflags +baseRegOffset dflags (VanillaReg 7 _) = oFFSET_StgRegTable_rR7 dflags +baseRegOffset dflags (VanillaReg 8 _) = oFFSET_StgRegTable_rR8 dflags +baseRegOffset dflags (VanillaReg 9 _) = oFFSET_StgRegTable_rR9 dflags +baseRegOffset dflags (VanillaReg 
10 _) = oFFSET_StgRegTable_rR10 dflags +baseRegOffset _ (VanillaReg n _) = panic ("Registers above R10 are not supported (tried to use R" ++ show n ++ ")") +baseRegOffset dflags (FloatReg 1) = oFFSET_StgRegTable_rF1 dflags +baseRegOffset dflags (FloatReg 2) = oFFSET_StgRegTable_rF2 dflags +baseRegOffset dflags (FloatReg 3) = oFFSET_StgRegTable_rF3 dflags +baseRegOffset dflags (FloatReg 4) = oFFSET_StgRegTable_rF4 dflags +baseRegOffset dflags (FloatReg 5) = oFFSET_StgRegTable_rF5 dflags +baseRegOffset dflags (FloatReg 6) = oFFSET_StgRegTable_rF6 dflags +baseRegOffset _ (FloatReg n) = panic ("Registers above F6 are not supported (tried to use F" ++ show n ++ ")") +baseRegOffset dflags (DoubleReg 1) = oFFSET_StgRegTable_rD1 dflags +baseRegOffset dflags (DoubleReg 2) = oFFSET_StgRegTable_rD2 dflags +baseRegOffset dflags (DoubleReg 3) = oFFSET_StgRegTable_rD3 dflags +baseRegOffset dflags (DoubleReg 4) = oFFSET_StgRegTable_rD4 dflags +baseRegOffset dflags (DoubleReg 5) = oFFSET_StgRegTable_rD5 dflags +baseRegOffset dflags (DoubleReg 6) = oFFSET_StgRegTable_rD6 dflags +baseRegOffset _ (DoubleReg n) = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")") +baseRegOffset dflags (XmmReg 1) = oFFSET_StgRegTable_rXMM1 dflags +baseRegOffset dflags (XmmReg 2) = oFFSET_StgRegTable_rXMM2 dflags +baseRegOffset dflags (XmmReg 3) = oFFSET_StgRegTable_rXMM3 dflags +baseRegOffset dflags (XmmReg 4) = oFFSET_StgRegTable_rXMM4 dflags +baseRegOffset dflags (XmmReg 5) = oFFSET_StgRegTable_rXMM5 dflags +baseRegOffset dflags (XmmReg 6) = oFFSET_StgRegTable_rXMM6 dflags +baseRegOffset _ (XmmReg n) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")") +baseRegOffset dflags (YmmReg 1) = oFFSET_StgRegTable_rYMM1 dflags +baseRegOffset dflags (YmmReg 2) = oFFSET_StgRegTable_rYMM2 dflags +baseRegOffset dflags (YmmReg 3) = oFFSET_StgRegTable_rYMM3 dflags +baseRegOffset dflags (YmmReg 4) = oFFSET_StgRegTable_rYMM4 dflags +baseRegOffset dflags 
(YmmReg 5) = oFFSET_StgRegTable_rYMM5 dflags +baseRegOffset dflags (YmmReg 6) = oFFSET_StgRegTable_rYMM6 dflags +baseRegOffset _ (YmmReg n) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")") +baseRegOffset dflags (ZmmReg 1) = oFFSET_StgRegTable_rZMM1 dflags +baseRegOffset dflags (ZmmReg 2) = oFFSET_StgRegTable_rZMM2 dflags +baseRegOffset dflags (ZmmReg 3) = oFFSET_StgRegTable_rZMM3 dflags +baseRegOffset dflags (ZmmReg 4) = oFFSET_StgRegTable_rZMM4 dflags +baseRegOffset dflags (ZmmReg 5) = oFFSET_StgRegTable_rZMM5 dflags +baseRegOffset dflags (ZmmReg 6) = oFFSET_StgRegTable_rZMM6 dflags +baseRegOffset _ (ZmmReg n) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")") +baseRegOffset dflags Sp = oFFSET_StgRegTable_rSp dflags +baseRegOffset dflags SpLim = oFFSET_StgRegTable_rSpLim dflags +baseRegOffset dflags (LongReg 1) = oFFSET_StgRegTable_rL1 dflags +baseRegOffset _ (LongReg n) = panic ("Registers above L1 are not supported (tried to use L" ++ show n ++ ")") +baseRegOffset dflags Hp = oFFSET_StgRegTable_rHp dflags +baseRegOffset dflags HpLim = oFFSET_StgRegTable_rHpLim dflags +baseRegOffset dflags CCCS = oFFSET_StgRegTable_rCCCS dflags +baseRegOffset dflags CurrentTSO = oFFSET_StgRegTable_rCurrentTSO dflags +baseRegOffset dflags CurrentNursery = oFFSET_StgRegTable_rCurrentNursery dflags +baseRegOffset dflags HpAlloc = oFFSET_StgRegTable_rHpAlloc dflags +baseRegOffset dflags EagerBlackholeInfo = oFFSET_stgEagerBlackholeInfo dflags +baseRegOffset dflags GCEnter1 = oFFSET_stgGCEnter1 dflags +baseRegOffset dflags GCFun = oFFSET_stgGCFun dflags +baseRegOffset _ BaseReg = panic "CgUtils.baseRegOffset:BaseReg" +baseRegOffset _ PicBaseReg = panic "CgUtils.baseRegOffset:PicBaseReg" +baseRegOffset _ MachSp = panic "CgUtils.baseRegOffset:MachSp" +baseRegOffset _ UnwindReturnReg = panic "CgUtils.baseRegOffset:UnwindReturnReg" + + +-- ----------------------------------------------------------------------------- 
+-- +-- STG/Cmm GlobalReg +-- +-- ----------------------------------------------------------------------------- + +-- | We map STG registers onto appropriate CmmExprs. Either they map +-- to real machine registers or stored as offsets from BaseReg. Given +-- a GlobalReg, get_GlobalReg_addr always produces the +-- register table address for it. +get_GlobalReg_addr :: DynFlags -> GlobalReg -> CmmExpr +get_GlobalReg_addr dflags BaseReg = regTableOffset dflags 0 +get_GlobalReg_addr dflags mid + = get_Regtable_addr_from_offset dflags (baseRegOffset dflags mid) + +-- Calculate a literal representing an offset into the register table. +-- Used when we don't have an actual BaseReg to offset from. +regTableOffset :: DynFlags -> Int -> CmmExpr +regTableOffset dflags n = + CmmLit (CmmLabelOff mkMainCapabilityLabel (oFFSET_Capability_r dflags + n)) + +get_Regtable_addr_from_offset :: DynFlags -> Int -> CmmExpr +get_Regtable_addr_from_offset dflags offset = + if haveRegBase (targetPlatform dflags) + then CmmRegOff baseReg offset + else regTableOffset dflags offset + +-- | Fixup global registers so that they assign to locations within the +-- RegTable if they aren't pinned for the current target. 
+fixStgRegisters :: DynFlags -> RawCmmDecl -> RawCmmDecl +fixStgRegisters _ top@(CmmData _ _) = top + +fixStgRegisters dflags (CmmProc info lbl live graph) = + let graph' = modifyGraph (mapGraphBlocks (fixStgRegBlock dflags)) graph + in CmmProc info lbl live graph' + +fixStgRegBlock :: DynFlags -> Block CmmNode e x -> Block CmmNode e x +fixStgRegBlock dflags block = mapBlock (fixStgRegStmt dflags) block + +fixStgRegStmt :: DynFlags -> CmmNode e x -> CmmNode e x +fixStgRegStmt dflags stmt = fixAssign $ mapExpDeep fixExpr stmt + where + platform = targetPlatform dflags + + fixAssign stmt = + case stmt of + CmmAssign (CmmGlobal reg) src + -- MachSp isn't an STG register; it's merely here for tracking unwind + -- information + | reg == MachSp -> stmt + | otherwise -> + let baseAddr = get_GlobalReg_addr dflags reg + in case reg `elem` activeStgRegs (targetPlatform dflags) of + True -> CmmAssign (CmmGlobal reg) src + False -> CmmStore baseAddr src + other_stmt -> other_stmt + + fixExpr expr = case expr of + -- MachSp isn't an STG register; it's merely here for tracking unwind information + CmmReg (CmmGlobal MachSp) -> expr + CmmReg (CmmGlobal reg) -> + -- Replace register leaves with appropriate StixTrees for + -- the given target. MagicIds which map to a reg on this + -- arch are left unchanged. For the rest, BaseReg is taken + -- to mean the address of the reg table in MainCapability, + -- and for all others we generate an indirection to its + -- location in the register table. + case reg `elem` activeStgRegs platform of + True -> expr + False -> + let baseAddr = get_GlobalReg_addr dflags reg + in case reg of + BaseReg -> baseAddr + _other -> CmmLoad baseAddr (globalRegType dflags reg) + + CmmRegOff (CmmGlobal reg) offset -> + -- RegOff leaves are just a shorthand form. If the reg maps + -- to a real reg, we keep the shorthand, otherwise, we just + -- expand it and defer to the above code. 
+ case reg `elem` activeStgRegs platform of + True -> expr + False -> CmmMachOp (MO_Add (wordWidth dflags)) [ + fixExpr (CmmReg (CmmGlobal reg)), + CmmLit (CmmInt (fromIntegral offset) + (wordWidth dflags))] + + other_expr -> other_expr diff --git a/compiler/GHC/StgToCmm/Closure.hs b/compiler/GHC/StgToCmm/Closure.hs new file mode 100644 index 0000000000..b56b06f399 --- /dev/null +++ b/compiler/GHC/StgToCmm/Closure.hs @@ -0,0 +1,1008 @@ +{-# LANGUAGE CPP, RecordWildCards #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: +-- +-- The types LambdaFormInfo +-- ClosureInfo +-- +-- Nothing monadic in here! +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Closure ( + DynTag, tagForCon, isSmallFamily, + + idPrimRep, isVoidRep, isGcPtrRep, addIdReps, addArgReps, + argPrimRep, + + NonVoid(..), fromNonVoid, nonVoidIds, nonVoidStgArgs, + assertNonVoidIds, assertNonVoidStgArgs, + + -- * LambdaFormInfo + LambdaFormInfo, -- Abstract + StandardFormInfo, -- ...ditto... 
+ mkLFThunk, mkLFReEntrant, mkConLFInfo, mkSelectorLFInfo, + mkApLFInfo, mkLFImported, mkLFArgument, mkLFLetNoEscape, + mkLFStringLit, + lfDynTag, + isLFThunk, isLFReEntrant, lfUpdatable, + + -- * Used by other modules + CgLoc(..), SelfLoopInfo, CallMethod(..), + nodeMustPointToIt, isKnownFun, funTag, tagForArity, getCallMethod, + + -- * ClosureInfo + ClosureInfo, + mkClosureInfo, + mkCmmInfo, + + -- ** Inspection + closureLFInfo, closureName, + + -- ** Labels + -- These just need the info table label + closureInfoLabel, staticClosureLabel, + closureSlowEntryLabel, closureLocalEntryLabel, + + -- ** Predicates + -- These are really just functions on LambdaFormInfo + closureUpdReqd, closureSingleEntry, + closureReEntrant, closureFunInfo, + isToplevClosure, + + blackHoleOnEntry, -- Needs LambdaFormInfo and SMRep + isStaticClosure, -- Needs SMRep + + -- * InfoTables + mkDataConInfoTable, + cafBlackHoleInfoTable, + indStaticInfoTable, + staticClosureNeedsLink, + ) where + +#include "../includes/MachDeps.h" + +#include "HsVersions.h" + +import GhcPrelude + +import StgSyn +import SMRep +import Cmm +import PprCmmExpr() -- For Outputable instances + +import CostCentre +import BlockId +import CLabel +import Id +import IdInfo +import DataCon +import Name +import Type +import TyCoRep +import TcType +import TyCon +import RepType +import BasicTypes +import Outputable +import DynFlags +import Util + +import Data.Coerce (coerce) +import qualified Data.ByteString.Char8 as BS8 + +----------------------------------------------------------------------------- +-- Data types and synonyms +----------------------------------------------------------------------------- + +-- These data types are mostly used by other modules, especially +-- GHC.StgToCmm.Monad, but we define them here because some functions in this +-- module need to have access to them as well + +data CgLoc + = CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning + -- Hp, so that it remains valid across calls + + 
| LneLoc BlockId [LocalReg] -- A join point + -- A join point (= let-no-escape) should only + -- be tail-called, and in a saturated way. + -- To tail-call it, assign to these locals, + -- and branch to the block id + +instance Outputable CgLoc where + ppr (CmmLoc e) = text "cmm" <+> ppr e + ppr (LneLoc b rs) = text "lne" <+> ppr b <+> ppr rs + +type SelfLoopInfo = (Id, BlockId, [LocalReg]) + +-- used by ticky profiling +isKnownFun :: LambdaFormInfo -> Bool +isKnownFun LFReEntrant{} = True +isKnownFun LFLetNoEscape = True +isKnownFun _ = False + + +------------------------------------- +-- Non-void types +------------------------------------- +-- We frequently need the invariant that an Id or an argument +-- is of a non-void type. This type is a witness to the invariant. + +newtype NonVoid a = NonVoid a + deriving (Eq, Show) + +fromNonVoid :: NonVoid a -> a +fromNonVoid (NonVoid a) = a + +instance (Outputable a) => Outputable (NonVoid a) where + ppr (NonVoid a) = ppr a + +nonVoidIds :: [Id] -> [NonVoid Id] +nonVoidIds ids = [NonVoid id | id <- ids, not (isVoidTy (idType id))] + +-- | Used in places where some invariant ensures that all these Ids are +-- non-void; e.g. constructor field binders in case expressions. +-- See Note [Post-unarisation invariants] in UnariseStg. +assertNonVoidIds :: [Id] -> [NonVoid Id] +assertNonVoidIds ids = ASSERT(not (any (isVoidTy . idType) ids)) + coerce ids + +nonVoidStgArgs :: [StgArg] -> [NonVoid StgArg] +nonVoidStgArgs args = [NonVoid arg | arg <- args, not (isVoidTy (stgArgType arg))] + +-- | Used in places where some invariant ensures that all these arguments are +-- non-void; e.g. constructor arguments. +-- See Note [Post-unarisation invariants] in UnariseStg. +assertNonVoidStgArgs :: [StgArg] -> [NonVoid StgArg] +assertNonVoidStgArgs args = ASSERT(not (any (isVoidTy . 
stgArgType) args)) + coerce args + + +----------------------------------------------------------------------------- +-- Representations +----------------------------------------------------------------------------- + +-- Why are these here? + +-- | Assumes that there is precisely one 'PrimRep' of the type. This assumption +-- holds after unarise. +-- See Note [Post-unarisation invariants] +idPrimRep :: Id -> PrimRep +idPrimRep id = typePrimRep1 (idType id) + -- See also Note [VoidRep] in RepType + +-- | Assumes that Ids have one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +addIdReps :: [NonVoid Id] -> [NonVoid (PrimRep, Id)] +addIdReps = map (\id -> let id' = fromNonVoid id + in NonVoid (idPrimRep id', id')) + +-- | Assumes that arguments have one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +addArgReps :: [NonVoid StgArg] -> [NonVoid (PrimRep, StgArg)] +addArgReps = map (\arg -> let arg' = fromNonVoid arg + in NonVoid (argPrimRep arg', arg')) + +-- | Assumes that the argument has one PrimRep, which holds after unarisation. +-- See Note [Post-unarisation invariants] +argPrimRep :: StgArg -> PrimRep +argPrimRep arg = typePrimRep1 (stgArgType arg) + + +----------------------------------------------------------------------------- +-- LambdaFormInfo +----------------------------------------------------------------------------- + +-- Information about an identifier, from the code generator's point of +-- view. Every identifier is bound to a LambdaFormInfo in the +-- environment, which gives the code generator enough info to be able to +-- tail call or return that identifier. + +data LambdaFormInfo + = LFReEntrant -- Reentrant closure (a function) + TopLevelFlag -- True if top level + OneShotInfo + !RepArity -- Arity. 
Invariant: always > 0 + !Bool -- True <=> no fvs + ArgDescr -- Argument descriptor (should really be in ClosureInfo) + + | LFThunk -- Thunk (zero arity) + TopLevelFlag + !Bool -- True <=> no free vars + !Bool -- True <=> updatable (i.e., *not* single-entry) + StandardFormInfo + !Bool -- True <=> *might* be a function type + + | LFCon -- A saturated constructor application + DataCon -- The constructor + + | LFUnknown -- Used for function arguments and imported things. + -- We know nothing about this closure. + -- Treat like updatable "LFThunk"... + -- Imported things which we *do* know something about use + -- one of the other LF constructors (eg LFReEntrant for + -- known functions) + !Bool -- True <=> *might* be a function type + -- The False case is good when we want to enter it, + -- because then we know the entry code will do + -- For a function, the entry code is the fast entry point + + | LFUnlifted -- A value of unboxed type; + -- always a value, needs evaluation + + | LFLetNoEscape -- See LetNoEscape module for precise description + + +------------------------- +-- StandardFormInfo tells whether this thunk has one of +-- a small number of standard forms + +data StandardFormInfo + = NonStandardThunk + -- The usual case: not of the standard forms + + | SelectorThunk + -- A SelectorThunk is of form + -- case x of + -- con a1,..,an -> ak + -- and the constructor is from a single-constr type. + WordOff -- 0-origin offset of ak within the "goods" of + -- constructor (Recall that the a1,...,an may be laid + -- out in the heap in a non-obvious order.) + + | ApThunk + -- An ApThunk is of form + -- x1 ... xn + -- The code for the thunk just pushes x2..xn on the stack and enters x1. + -- There are a few of these (for 1 <= n <= MAX_SPEC_AP_SIZE) pre-compiled + -- in the RTS to save space. 
+ RepArity -- Arity, n + + +------------------------------------------------------ +-- Building LambdaFormInfo +------------------------------------------------------ + +mkLFArgument :: Id -> LambdaFormInfo +mkLFArgument id + | isUnliftedType ty = LFUnlifted + | might_be_a_function ty = LFUnknown True + | otherwise = LFUnknown False + where + ty = idType id + +------------- +mkLFLetNoEscape :: LambdaFormInfo +mkLFLetNoEscape = LFLetNoEscape + +------------- +mkLFReEntrant :: TopLevelFlag -- True of top level + -> [Id] -- Free vars + -> [Id] -- Args + -> ArgDescr -- Argument descriptor + -> LambdaFormInfo + +mkLFReEntrant _ _ [] _ + = pprPanic "mkLFReEntrant" empty +mkLFReEntrant top fvs args arg_descr + = LFReEntrant top os_info (length args) (null fvs) arg_descr + where os_info = idOneShotInfo (head args) + +------------- +mkLFThunk :: Type -> TopLevelFlag -> [Id] -> UpdateFlag -> LambdaFormInfo +mkLFThunk thunk_ty top fvs upd_flag + = ASSERT( not (isUpdatable upd_flag) || not (isUnliftedType thunk_ty) ) + LFThunk top (null fvs) + (isUpdatable upd_flag) + NonStandardThunk + (might_be_a_function thunk_ty) + +-------------- +might_be_a_function :: Type -> Bool +-- Return False only if we are *sure* it's a data type +-- Look through newtypes etc as much as poss +might_be_a_function ty + | [LiftedRep] <- typePrimRep ty + , Just tc <- tyConAppTyCon_maybe (unwrapType ty) + , isDataTyCon tc + = False + | otherwise + = True + +------------- +mkConLFInfo :: DataCon -> LambdaFormInfo +mkConLFInfo con = LFCon con + +------------- +mkSelectorLFInfo :: Id -> Int -> Bool -> LambdaFormInfo +mkSelectorLFInfo id offset updatable + = LFThunk NotTopLevel False updatable (SelectorThunk offset) + (might_be_a_function (idType id)) + +------------- +mkApLFInfo :: Id -> UpdateFlag -> Arity -> LambdaFormInfo +mkApLFInfo id upd_flag arity + = LFThunk NotTopLevel (arity == 0) (isUpdatable upd_flag) (ApThunk arity) + (might_be_a_function (idType id)) + +------------- +mkLFImported :: Id -> 
LambdaFormInfo +mkLFImported id + | Just con <- isDataConWorkId_maybe id + , isNullaryRepDataCon con + = LFCon con -- An imported nullary constructor + -- We assume that the constructor is evaluated so that + -- the id really does point directly to the constructor + + | arity > 0 + = LFReEntrant TopLevel noOneShotInfo arity True (panic "arg_descr") + + | otherwise + = mkLFArgument id -- Not sure of exact arity + where + arity = idFunRepArity id + +------------- +mkLFStringLit :: LambdaFormInfo +mkLFStringLit = LFUnlifted + +----------------------------------------------------- +-- Dynamic pointer tagging +----------------------------------------------------- + +type DynTag = Int -- The tag on a *pointer* + -- (from the dynamic-tagging paper) + +-- Note [Data constructor dynamic tags] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- The family size of a data type (the number of constructors +-- or the arity of a function) can be either: +-- * small, if the family size < 2**tag_bits +-- * big, otherwise. +-- +-- Small families can have the constructor tag in the tag bits. +-- Big families only use the tag value 1 to represent evaluatedness. +-- We don't have very many tag bits: for example, we have 2 bits on +-- x86-32 and 3 bits on x86-64. 
+ +isSmallFamily :: DynFlags -> Int -> Bool +isSmallFamily dflags fam_size = fam_size <= mAX_PTR_TAG dflags + +tagForCon :: DynFlags -> DataCon -> DynTag +tagForCon dflags con + | isSmallFamily dflags fam_size = con_tag + | otherwise = 1 + where + con_tag = dataConTag con -- NB: 1-indexed + fam_size = tyConFamilySize (dataConTyCon con) + +tagForArity :: DynFlags -> RepArity -> DynTag +tagForArity dflags arity + | isSmallFamily dflags arity = arity + | otherwise = 0 + +lfDynTag :: DynFlags -> LambdaFormInfo -> DynTag +-- Return the tag in the low order bits of a variable bound +-- to this LambdaForm +lfDynTag dflags (LFCon con) = tagForCon dflags con +lfDynTag dflags (LFReEntrant _ _ arity _ _) = tagForArity dflags arity +lfDynTag _ _other = 0 + + +----------------------------------------------------------------------------- +-- Observing LambdaFormInfo +----------------------------------------------------------------------------- + +------------ +isLFThunk :: LambdaFormInfo -> Bool +isLFThunk (LFThunk {}) = True +isLFThunk _ = False + +isLFReEntrant :: LambdaFormInfo -> Bool +isLFReEntrant (LFReEntrant {}) = True +isLFReEntrant _ = False + +----------------------------------------------------------------------------- +-- Choosing SM reps +----------------------------------------------------------------------------- + +lfClosureType :: LambdaFormInfo -> ClosureTypeInfo +lfClosureType (LFReEntrant _ _ arity _ argd) = Fun arity argd +lfClosureType (LFCon con) = Constr (dataConTagZ con) + (dataConIdentity con) +lfClosureType (LFThunk _ _ _ is_sel _) = thunkClosureType is_sel +lfClosureType _ = panic "lfClosureType" + +thunkClosureType :: StandardFormInfo -> ClosureTypeInfo +thunkClosureType (SelectorThunk off) = ThunkSelector off +thunkClosureType _ = Thunk + +-- We *do* get non-updatable top-level thunks sometimes. eg. f = g +-- gets compiled to a jump to g (if g has non-zero arity), instead of +-- messing around with update frames and PAPs. 
We set the closure type +-- to FUN_STATIC in this case. + +----------------------------------------------------------------------------- +-- nodeMustPointToIt +----------------------------------------------------------------------------- + +nodeMustPointToIt :: DynFlags -> LambdaFormInfo -> Bool +-- If nodeMustPointToIt is true, then the entry convention for +-- this closure has R1 (the "Node" register) pointing to the +-- closure itself --- the "self" argument + +nodeMustPointToIt _ (LFReEntrant top _ _ no_fvs _) + = not no_fvs -- Certainly if it has fvs we need to point to it + || isNotTopLevel top -- See Note [GC recovery] + -- For lex_profiling we also access the cost centre for a + -- non-inherited (i.e. non-top-level) function. + -- The isNotTopLevel test above ensures this is ok. + +nodeMustPointToIt dflags (LFThunk top no_fvs updatable NonStandardThunk _) + = not no_fvs -- Self parameter + || isNotTopLevel top -- Note [GC recovery] + || updatable -- Need to push update frame + || gopt Opt_SccProfilingOn dflags + -- For the non-updatable (single-entry case): + -- + -- True if has fvs (in which case we need access to them, and we + -- should black-hole it) + -- or profiling (in which case we need to recover the cost centre + -- from inside it) ToDo: do we need this even for + -- top-level thunks? If not, + -- isNotTopLevel subsumes this + +nodeMustPointToIt _ (LFThunk {}) -- Node must point to a standard-form thunk + = True + +nodeMustPointToIt _ (LFCon _) = True + + -- Strictly speaking, the above two don't need Node to point + -- to it if the arity = 0. But this is a *really* unlikely + -- situation. If we know it's nil (say) and we are entering + -- it. Eg: let x = [] in x then we will certainly have inlined + -- x, since nil is a simple atom. So we gain little by not + -- having Node point to known zero-arity things. 
On the other + -- hand, we do lose something; Patrick's code for figuring out + -- when something has been updated but not entered relies on + -- having Node point to the result of an update. SLPJ + -- 27/11/92. + +nodeMustPointToIt _ (LFUnknown _) = True +nodeMustPointToIt _ LFUnlifted = False +nodeMustPointToIt _ LFLetNoEscape = False + +{- Note [GC recovery] +~~~~~~~~~~~~~~~~~~~~~ +If we a have a local let-binding (function or thunk) + let f = <body> in ... +AND <body> allocates, then the heap-overflow check needs to know how +to re-start the evaluation. It uses the "self" pointer to do this. +So even if there are no free variables in <body>, we still make +nodeMustPointToIt be True for non-top-level bindings. + +Why do any such bindings exist? After all, let-floating should have +floated them out. Well, a clever optimiser might leave one there to +avoid a space leak, deliberately recomputing a thunk. Also (and this +really does happen occasionally) let-floating may make a function f smaller +so it can be inlined, so now (f True) may generate a local no-fv closure. +This actually happened during bootstrapping GHC itself, with f=mkRdrFunBind +in TcGenDeriv.) -} + +----------------------------------------------------------------------------- +-- getCallMethod +----------------------------------------------------------------------------- + +{- The entry conventions depend on the type of closure being entered, +whether or not it has free variables, and whether we're running +sequentially or in parallel. 
+ +Closure Node Argument Enter +Characteristics Par Req'd Passing Via +--------------------------------------------------------------------------- +Unknown & no & yes & stack & node +Known fun (>1 arg), no fvs & no & no & registers & fast entry (enough args) + & slow entry (otherwise) +Known fun (>1 arg), fvs & no & yes & registers & fast entry (enough args) +0 arg, no fvs \r,\s & no & no & n/a & direct entry +0 arg, no fvs \u & no & yes & n/a & node +0 arg, fvs \r,\s,selector & no & yes & n/a & node +0 arg, fvs \r,\s & no & yes & n/a & direct entry +0 arg, fvs \u & no & yes & n/a & node +Unknown & yes & yes & stack & node +Known fun (>1 arg), no fvs & yes & no & registers & fast entry (enough args) + & slow entry (otherwise) +Known fun (>1 arg), fvs & yes & yes & registers & node +0 arg, fvs \r,\s,selector & yes & yes & n/a & node +0 arg, no fvs \r,\s & yes & no & n/a & direct entry +0 arg, no fvs \u & yes & yes & n/a & node +0 arg, fvs \r,\s & yes & yes & n/a & node +0 arg, fvs \u & yes & yes & n/a & node + +When black-holing, single-entry closures could also be entered via node +(rather than directly) to catch double-entry. -} + +data CallMethod + = EnterIt -- No args, not a function + + | JumpToIt BlockId [LocalReg] -- A join point or a header of a local loop + + | ReturnIt -- It's a value (function, unboxed value, + -- or constructor), so just return it. + + | SlowCall -- Unknown fun, or known fun with + -- too few args. 
+ + | DirectEntry -- Jump directly, with args in regs + CLabel -- The code label + RepArity -- Its arity + +getCallMethod :: DynFlags + -> Name -- Function being applied + -> Id -- Function Id used to chech if it can refer to + -- CAF's and whether the function is tail-calling + -- itself + -> LambdaFormInfo -- Its info + -> RepArity -- Number of available arguments + -> RepArity -- Number of them being void arguments + -> CgLoc -- Passed in from cgIdApp so that we can + -- handle let-no-escape bindings and self-recursive + -- tail calls using the same data constructor, + -- JumpToIt. This saves us one case branch in + -- cgIdApp + -> Maybe SelfLoopInfo -- can we perform a self-recursive tail call? + -> CallMethod + +getCallMethod dflags _ id _ n_args v_args _cg_loc + (Just (self_loop_id, block_id, args)) + | gopt Opt_Loopification dflags + , id == self_loop_id + , args `lengthIs` (n_args - v_args) + -- If these patterns match then we know that: + -- * loopification optimisation is turned on + -- * function is performing a self-recursive call in a tail position + -- * number of non-void parameters of the function matches functions arity. 
+ -- See Note [Self-recursive tail calls] and Note [Void arguments in + -- self-recursive tail calls] in GHC.StgToCmm.Expr for more details + = JumpToIt block_id args + +getCallMethod dflags name id (LFReEntrant _ _ arity _ _) n_args _v_args _cg_loc + _self_loop_info + | n_args == 0 -- No args at all + && not (gopt Opt_SccProfilingOn dflags) + -- See Note [Evaluating functions with profiling] in rts/Apply.cmm + = ASSERT( arity /= 0 ) ReturnIt + | n_args < arity = SlowCall -- Not enough args + | otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity + +getCallMethod _ _name _ LFUnlifted n_args _v_args _cg_loc _self_loop_info + = ASSERT( n_args == 0 ) ReturnIt + +getCallMethod _ _name _ (LFCon _) n_args _v_args _cg_loc _self_loop_info + = ASSERT( n_args == 0 ) ReturnIt + -- n_args=0 because it'd be ill-typed to apply a saturated + -- constructor application to anything + +getCallMethod dflags name id (LFThunk _ _ updatable std_form_info is_fun) + n_args _v_args _cg_loc _self_loop_info + | is_fun -- it *might* be a function, so we must "call" it (which is always safe) + = SlowCall -- We cannot just enter it [in eval/apply, the entry code + -- is the fast-entry code] + + -- Since is_fun is False, we are *definitely* looking at a data value + | updatable || gopt Opt_Ticky dflags -- to catch double entry + {- OLD: || opt_SMP + I decided to remove this, because in SMP mode it doesn't matter + if we enter the same thunk multiple times, so the optimisation + of jumping directly to the entry code is still valid. --SDM + -} + = EnterIt + + -- even a non-updatable selector thunk can be updated by the garbage + -- collector, so we must enter it. 
(#8817) + | SelectorThunk{} <- std_form_info + = EnterIt + + -- We used to have ASSERT( n_args == 0 ), but actually it is + -- possible for the optimiser to generate + -- let bot :: Int = error Int "urk" + -- in (bot `cast` unsafeCoerce Int (Int -> Int)) 3 + -- This happens as a result of the case-of-error transformation + -- So the right thing to do is just to enter the thing + + | otherwise -- Jump direct to code for single-entry thunks + = ASSERT( n_args == 0 ) + DirectEntry (thunkEntryLabel dflags name (idCafInfo id) std_form_info + updatable) 0 + +getCallMethod _ _name _ (LFUnknown True) _n_arg _v_args _cg_locs _self_loop_info + = SlowCall -- might be a function + +getCallMethod _ name _ (LFUnknown False) n_args _v_args _cg_loc _self_loop_info + = ASSERT2( n_args == 0, ppr name <+> ppr n_args ) + EnterIt -- Not a function + +getCallMethod _ _name _ LFLetNoEscape _n_args _v_args (LneLoc blk_id lne_regs) + _self_loop_info + = JumpToIt blk_id lne_regs + +getCallMethod _ _ _ _ _ _ _ _ = panic "Unknown call method" + +----------------------------------------------------------------------------- +-- Data types for closure information +----------------------------------------------------------------------------- + + +{- ClosureInfo: information about a binding + + We make a ClosureInfo for each let binding (both top level and not), + but not bindings for data constructors: for those we build a CmmInfoTable + directly (see mkDataConInfoTable). + + To a first approximation: + ClosureInfo = (LambdaFormInfo, CmmInfoTable) + + A ClosureInfo has enough information + a) to construct the info table itself, and build other things + related to the binding (e.g. slow entry points for a function) + b) to allocate a closure containing that info pointer (i.e. 
+ it knows the info table label) +-} + +data ClosureInfo + = ClosureInfo { + closureName :: !Name, -- The thing bound to this closure + -- we don't really need this field: it's only used in generating + -- code for ticky and profiling, and we could pass the information + -- around separately, but it doesn't do much harm to keep it here. + + closureLFInfo :: !LambdaFormInfo, -- NOTE: not an LFCon + -- this tells us about what the closure contains: it's right-hand-side. + + -- the rest is just an unpacked CmmInfoTable. + closureInfoLabel :: !CLabel, + closureSMRep :: !SMRep, -- representation used by storage mgr + closureProf :: !ProfilingInfo + } + +-- | Convert from 'ClosureInfo' to 'CmmInfoTable'. +mkCmmInfo :: ClosureInfo -> Id -> CostCentreStack -> CmmInfoTable +mkCmmInfo ClosureInfo {..} id ccs + = CmmInfoTable { cit_lbl = closureInfoLabel + , cit_rep = closureSMRep + , cit_prof = closureProf + , cit_srt = Nothing + , cit_clo = if isStaticRep closureSMRep + then Just (id,ccs) + else Nothing } + +-------------------------------------- +-- Building ClosureInfos +-------------------------------------- + +mkClosureInfo :: DynFlags + -> Bool -- Is static + -> Id + -> LambdaFormInfo + -> Int -> Int -- Total and pointer words + -> String -- String descriptor + -> ClosureInfo +mkClosureInfo dflags is_static id lf_info tot_wds ptr_wds val_descr + = ClosureInfo { closureName = name + , closureLFInfo = lf_info + , closureInfoLabel = info_lbl -- These three fields are + , closureSMRep = sm_rep -- (almost) an info table + , closureProf = prof } -- (we don't have an SRT yet) + where + name = idName id + sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds (lfClosureType lf_info) + prof = mkProfilingInfo dflags id val_descr + nonptr_wds = tot_wds - ptr_wds + + info_lbl = mkClosureInfoTableLabel id lf_info + +-------------------------------------- +-- Other functions over ClosureInfo +-------------------------------------- + +-- Eager blackholing is normally disabled, but 
can be turned on with +-- -feager-blackholing. When it is on, we replace the info pointer of +-- the thunk with stg_EAGER_BLACKHOLE_info on entry. + +-- If we wanted to do eager blackholing with slop filling, +-- we'd need to do it at the *end* of a basic block, otherwise +-- we overwrite the free variables in the thunk that we still +-- need. We have a patch for this from Andy Cheadle, but not +-- incorporated yet. --SDM [6/2004] +-- +-- Previously, eager blackholing was enabled when ticky-ticky +-- was on. But it didn't work, and it wasn't strictly necessary +-- to bring back minimal ticky-ticky, so now EAGER_BLACKHOLING +-- is unconditionally disabled. -- krc 1/2007 +-- +-- Static closures are never themselves black-holed. + +blackHoleOnEntry :: ClosureInfo -> Bool +blackHoleOnEntry cl_info + | isStaticRep (closureSMRep cl_info) + = False -- Never black-hole a static closure + + | otherwise + = case closureLFInfo cl_info of + LFReEntrant {} -> False + LFLetNoEscape -> False + LFThunk _ _no_fvs upd _ _ -> upd -- See Note [Black-holing non-updatable thunks] + _other -> panic "blackHoleOnEntry" + +{- Note [Black-holing non-updatable thunks] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We must not black-hole non-updatable (single-entry) thunks otherwise +we run into issues like #10414. Specifically: + + * There is no reason to black-hole a non-updatable thunk: it should + not be competed for by multiple threads + + * It could, conceivably, cause a space leak if we don't black-hole + it, if there was a live but never-followed pointer pointing to it. + Let's hope that doesn't happen. + + * It is dangerous to black-hole a non-updatable thunk because + - is not updated (of course) + - hence, if it is black-holed and another thread tries to evaluate + it, that thread will block forever + This actually happened in #10414. So we do not black-hole + non-updatable thunks. + + * How could two threads evaluate the same non-updatable (single-entry) + thunk? 
See Reid Barton's example below. + + * Only eager blackholing could possibly black-hole a non-updatable + thunk, because lazy black-holing only affects thunks with an + update frame on the stack. + +Here is and example due to Reid Barton (#10414): + x = \u [] concat [[1], []] +with the following definitions, + + concat x = case x of + [] -> [] + (:) x xs -> (++) x (concat xs) + + (++) xs ys = case xs of + [] -> ys + (:) x rest -> (:) x ((++) rest ys) + +Where we use the syntax @\u []@ to denote an updatable thunk and @\s []@ to +denote a single-entry (i.e. non-updatable) thunk. After a thread evaluates @x@ +to WHNF and calls @(++)@ the heap will contain the following thunks, + + x = 1 : y + y = \u [] (++) [] z + z = \s [] concat [] + +Now that the stage is set, consider the follow evaluations by two racing threads +A and B, + + 1. Both threads enter @y@ before either is able to replace it with an + indirection + + 2. Thread A does the case analysis in @(++)@ and consequently enters @z@, + replacing it with a black-hole + + 3. At some later point thread B does the same case analysis and also attempts + to enter @z@. However, it finds that it has been replaced with a black-hole + so it blocks. + + 4. Thread A eventually finishes evaluating @z@ (to @[]@) and updates @y@ + accordingly. It does *not* update @z@, however, as it is single-entry. This + leaves Thread B blocked forever on a black-hole which will never be + updated. + +To avoid this sort of condition we never black-hole non-updatable thunks. 
+-} + +isStaticClosure :: ClosureInfo -> Bool +isStaticClosure cl_info = isStaticRep (closureSMRep cl_info) + +closureUpdReqd :: ClosureInfo -> Bool +closureUpdReqd ClosureInfo{ closureLFInfo = lf_info } = lfUpdatable lf_info + +lfUpdatable :: LambdaFormInfo -> Bool +lfUpdatable (LFThunk _ _ upd _ _) = upd +lfUpdatable _ = False + +closureSingleEntry :: ClosureInfo -> Bool +closureSingleEntry (ClosureInfo { closureLFInfo = LFThunk _ _ upd _ _}) = not upd +closureSingleEntry (ClosureInfo { closureLFInfo = LFReEntrant _ OneShotLam _ _ _}) = True +closureSingleEntry _ = False + +closureReEntrant :: ClosureInfo -> Bool +closureReEntrant (ClosureInfo { closureLFInfo = LFReEntrant {} }) = True +closureReEntrant _ = False + +closureFunInfo :: ClosureInfo -> Maybe (RepArity, ArgDescr) +closureFunInfo (ClosureInfo { closureLFInfo = lf_info }) = lfFunInfo lf_info + +lfFunInfo :: LambdaFormInfo -> Maybe (RepArity, ArgDescr) +lfFunInfo (LFReEntrant _ _ arity _ arg_desc) = Just (arity, arg_desc) +lfFunInfo _ = Nothing + +funTag :: DynFlags -> ClosureInfo -> DynTag +funTag dflags (ClosureInfo { closureLFInfo = lf_info }) + = lfDynTag dflags lf_info + +isToplevClosure :: ClosureInfo -> Bool +isToplevClosure (ClosureInfo { closureLFInfo = lf_info }) + = case lf_info of + LFReEntrant TopLevel _ _ _ _ -> True + LFThunk TopLevel _ _ _ _ -> True + _other -> False + +-------------------------------------- +-- Label generation +-------------------------------------- + +staticClosureLabel :: ClosureInfo -> CLabel +staticClosureLabel = toClosureLbl . closureInfoLabel + +closureSlowEntryLabel :: ClosureInfo -> CLabel +closureSlowEntryLabel = toSlowEntryLbl . closureInfoLabel + +closureLocalEntryLabel :: DynFlags -> ClosureInfo -> CLabel +closureLocalEntryLabel dflags + | tablesNextToCode dflags = toInfoLbl . closureInfoLabel + | otherwise = toEntryLbl . 
closureInfoLabel + +mkClosureInfoTableLabel :: Id -> LambdaFormInfo -> CLabel +mkClosureInfoTableLabel id lf_info + = case lf_info of + LFThunk _ _ upd_flag (SelectorThunk offset) _ + -> mkSelectorInfoLabel upd_flag offset + + LFThunk _ _ upd_flag (ApThunk arity) _ + -> mkApInfoTableLabel upd_flag arity + + LFThunk{} -> std_mk_lbl name cafs + LFReEntrant{} -> std_mk_lbl name cafs + _other -> panic "closureInfoTableLabel" + + where + name = idName id + + std_mk_lbl | is_local = mkLocalInfoTableLabel + | otherwise = mkInfoTableLabel + + cafs = idCafInfo id + is_local = isDataConWorkId id + -- Make the _info pointer for the implicit datacon worker + -- binding local. The reason we can do this is that importing + -- code always either uses the _closure or _con_info. By the + -- invariants in CorePrep anything else gets eta expanded. + + +thunkEntryLabel :: DynFlags -> Name -> CafInfo -> StandardFormInfo -> Bool -> CLabel +-- thunkEntryLabel is a local help function, not exported. It's used from +-- getCallMethod. 
+thunkEntryLabel dflags _thunk_id _ (ApThunk arity) upd_flag + = enterApLabel dflags upd_flag arity +thunkEntryLabel dflags _thunk_id _ (SelectorThunk offset) upd_flag + = enterSelectorLabel dflags upd_flag offset +thunkEntryLabel dflags thunk_id c _ _ + = enterIdLabel dflags thunk_id c + +enterApLabel :: DynFlags -> Bool -> Arity -> CLabel +enterApLabel dflags is_updatable arity + | tablesNextToCode dflags = mkApInfoTableLabel is_updatable arity + | otherwise = mkApEntryLabel is_updatable arity + +enterSelectorLabel :: DynFlags -> Bool -> WordOff -> CLabel +enterSelectorLabel dflags upd_flag offset + | tablesNextToCode dflags = mkSelectorInfoLabel upd_flag offset + | otherwise = mkSelectorEntryLabel upd_flag offset + +enterIdLabel :: DynFlags -> Name -> CafInfo -> CLabel +enterIdLabel dflags id c + | tablesNextToCode dflags = mkInfoTableLabel id c + | otherwise = mkEntryLabel id c + + +-------------------------------------- +-- Profiling +-------------------------------------- + +-- Profiling requires two pieces of information to be determined for +-- each closure's info table --- description and type. + +-- The description is stored directly in the @CClosureInfoTable@ when the +-- info table is built. + +-- The type is determined from the type information stored with the @Id@ +-- in the closure info using @closureTypeDescr@. 
+ +mkProfilingInfo :: DynFlags -> Id -> String -> ProfilingInfo +mkProfilingInfo dflags id val_descr + | not (gopt Opt_SccProfilingOn dflags) = NoProfilingInfo + | otherwise = ProfilingInfo ty_descr_w8 (BS8.pack val_descr) + where + ty_descr_w8 = BS8.pack (getTyDescription (idType id)) + +getTyDescription :: Type -> String +getTyDescription ty + = case (tcSplitSigmaTy ty) of { (_, _, tau_ty) -> + case tau_ty of + TyVarTy _ -> "*" + AppTy fun _ -> getTyDescription fun + TyConApp tycon _ -> getOccString tycon + FunTy {} -> '-' : fun_result tau_ty + ForAllTy _ ty -> getTyDescription ty + LitTy n -> getTyLitDescription n + CastTy ty _ -> getTyDescription ty + CoercionTy co -> pprPanic "getTyDescription" (ppr co) + } + where + fun_result (FunTy { ft_res = res }) = '>' : fun_result res + fun_result other = getTyDescription other + +getTyLitDescription :: TyLit -> String +getTyLitDescription l = + case l of + NumTyLit n -> show n + StrTyLit n -> show n + +-------------------------------------- +-- CmmInfoTable-related things +-------------------------------------- + +mkDataConInfoTable :: DynFlags -> DataCon -> Bool -> Int -> Int -> CmmInfoTable +mkDataConInfoTable dflags data_con is_static ptr_wds nonptr_wds + = CmmInfoTable { cit_lbl = info_lbl + , cit_rep = sm_rep + , cit_prof = prof + , cit_srt = Nothing + , cit_clo = Nothing } + where + name = dataConName data_con + info_lbl = mkConInfoTableLabel name NoCafRefs + sm_rep = mkHeapRep dflags is_static ptr_wds nonptr_wds cl_type + cl_type = Constr (dataConTagZ data_con) (dataConIdentity data_con) + -- We keep the *zero-indexed* tag in the srt_len field + -- of the info table of a data constructor. 
+ + prof | not (gopt Opt_SccProfilingOn dflags) = NoProfilingInfo + | otherwise = ProfilingInfo ty_descr val_descr + + ty_descr = BS8.pack $ occNameString $ getOccName $ dataConTyCon data_con + val_descr = BS8.pack $ occNameString $ getOccName data_con + +-- We need a black-hole closure info to pass to @allocDynClosure@ when we +-- want to allocate the black hole on entry to a CAF. + +cafBlackHoleInfoTable :: CmmInfoTable +cafBlackHoleInfoTable + = CmmInfoTable { cit_lbl = mkCAFBlackHoleInfoTableLabel + , cit_rep = blackHoleRep + , cit_prof = NoProfilingInfo + , cit_srt = Nothing + , cit_clo = Nothing } + +indStaticInfoTable :: CmmInfoTable +indStaticInfoTable + = CmmInfoTable { cit_lbl = mkIndStaticInfoLabel + , cit_rep = indStaticRep + , cit_prof = NoProfilingInfo + , cit_srt = Nothing + , cit_clo = Nothing } + +staticClosureNeedsLink :: Bool -> CmmInfoTable -> Bool +-- A static closure needs a link field to aid the GC when traversing +-- the static closure graph. But it only needs such a field if either +-- a) it has an SRT +-- b) it's a constructor with one or more pointer fields +-- In case (b), the constructor's fields themselves play the role +-- of the SRT. +staticClosureNeedsLink has_srt CmmInfoTable{ cit_rep = smrep } + | isConRep smrep = not (isStaticNoCafCon smrep) + | otherwise = has_srt diff --git a/compiler/GHC/StgToCmm/Con.hs b/compiler/GHC/StgToCmm/Con.hs new file mode 100644 index 0000000000..08508fbecc --- /dev/null +++ b/compiler/GHC/StgToCmm/Con.hs @@ -0,0 +1,285 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C--: code generation for constructors +-- +-- This module provides the support code for StgToCmm to deal with with +-- constructors on the RHSs of let(rec)s. 
+-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Con ( + cgTopRhsCon, buildDynCon, bindConArgs + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import StgSyn +import CoreSyn ( AltCon(..) ) + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Env +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import CmmExpr +import CmmUtils +import CLabel +import MkGraph +import SMRep +import CostCentre +import Module +import DataCon +import DynFlags +import FastString +import Id +import RepType (countConRepArgs) +import Literal +import PrelInfo +import Outputable +import GHC.Platform +import Util +import MonadUtils (mapMaybeM) + +import Control.Monad +import Data.Char + + + +--------------------------------------------------------------- +-- Top-level constructors +--------------------------------------------------------------- + +cgTopRhsCon :: DynFlags + -> Id -- Name of thing bound to this RHS + -> DataCon -- Id + -> [NonVoid StgArg] -- Args + -> (CgIdInfo, FCode ()) +cgTopRhsCon dflags id con args = + let id_info = litIdInfo dflags id (mkConLFInfo con) (CmmLabel closure_label) + in (id_info, gen_code) + where + name = idName id + caffy = idCafInfo id -- any stgArgHasCafRefs args + closure_label = mkClosureLabel name caffy + + gen_code = + do { this_mod <- getModuleName + ; when (platformOS (targetPlatform dflags) == OSMinGW32) $ + -- Windows DLLs have a problem with static cross-DLL refs. 
+ MASSERT( not (isDllConApp dflags this_mod con (map fromNonVoid args)) ) + ; ASSERT( args `lengthIs` countConRepArgs con ) return () + + -- LAY IT OUT + ; let + (tot_wds, -- #ptr_wds + #nonptr_wds + ptr_wds, -- #ptr_wds + nv_args_w_offsets) = + mkVirtHeapOffsetsWithPadding dflags StdHeader (addArgReps args) + + mk_payload (Padding len _) = return (CmmInt 0 (widthFromBytes len)) + mk_payload (FieldOff arg _) = do + amode <- getArgAmode arg + case amode of + CmmLit lit -> return lit + _ -> panic "GHC.StgToCmm.Con.cgTopRhsCon" + + nonptr_wds = tot_wds - ptr_wds + + -- we're not really going to emit an info table, so having + -- to make a CmmInfoTable is a bit overkill, but mkStaticClosureFields + -- needs to poke around inside it. + info_tbl = mkDataConInfoTable dflags con True ptr_wds nonptr_wds + + + ; payload <- mapM mk_payload nv_args_w_offsets + -- NB1: nv_args_w_offsets is sorted into ptrs then non-ptrs + -- NB2: all the amodes should be Lits! + -- TODO (osa): Why? + + ; let closure_rep = mkStaticClosureFields + dflags + info_tbl + dontCareCCS -- Because it's static data + caffy -- Has CAF refs + payload + + -- BUILD THE OBJECT + ; emitDataLits closure_label closure_rep + + ; return () } + + +--------------------------------------------------------------- +-- Lay out and allocate non-top-level constructors +--------------------------------------------------------------- + +buildDynCon :: Id -- Name of the thing to which this constr will + -- be bound + -> Bool -- is it genuinely bound to that name, or just + -- for profiling? 
+ -> CostCentreStack -- Where to grab cost centre from; + -- current CCS if currentOrSubsumedCCS + -> DataCon -- The data constructor + -> [NonVoid StgArg] -- Its args + -> FCode (CgIdInfo, FCode CmmAGraph) + -- Return details about how to find it and initialization code +buildDynCon binder actually_bound cc con args + = do dflags <- getDynFlags + buildDynCon' dflags (targetPlatform dflags) binder actually_bound cc con args + + +buildDynCon' :: DynFlags + -> Platform + -> Id -> Bool + -> CostCentreStack + -> DataCon + -> [NonVoid StgArg] + -> FCode (CgIdInfo, FCode CmmAGraph) + +{- We used to pass a boolean indicating whether all the +args were of size zero, so we could use a static +constructor; but I concluded that it just isn't worth it. +Now I/O uses unboxed tuples there just aren't any constructors +with all size-zero args. + +The reason for having a separate argument, rather than looking at +the addr modes of the args is that we may be in a "knot", and +premature looking at the args will cause the compiler to black-hole! +-} + + +-------- buildDynCon': Nullary constructors -------------- +-- First we deal with the case of zero-arity constructors. They +-- will probably be unfolded, so we don't expect to see this case much, +-- if at all, but it does no harm, and sets the scene for characters. +-- +-- In the case of zero-arity constructors, or, more accurately, those +-- which have exclusively size-zero (VoidRep) args, we generate no code +-- at all. + +buildDynCon' dflags _ binder _ _cc con [] + | isNullaryRepDataCon con + = return (litIdInfo dflags binder (mkConLFInfo con) + (CmmLabel (mkClosureLabel (dataConName con) (idCafInfo binder))), + return mkNop) + +-------- buildDynCon': Charlike and Intlike constructors ----------- +{- The following three paragraphs about @Char@-like and @Int@-like +closures are obsolete, but I don't understand the details well enough +to properly word them, sorry. 
I've changed the treatment of @Char@s to +be analogous to @Int@s: only a subset is preallocated, because @Char@ +has now 31 bits. Only literals are handled here. -- Qrczak + +Now for @Char@-like closures. We generate an assignment of the +address of the closure to a temporary. It would be possible simply to +generate no code, and record the addressing mode in the environment, +but we'd have to be careful if the argument wasn't a constant --- so +for simplicity we just always assign to a temporary. + +Last special case: @Int@-like closures. We only special-case the +situation in which the argument is a literal in the range +@mIN_INTLIKE@..@mAX_INTLILKE@. NB: for @Char@-like closures we can +work with any old argument, but for @Int@-like ones the argument has +to be a literal. Reason: @Char@ like closures have an argument type +which is guaranteed in range. + +Because of this, we use can safely return an addressing mode. + +We don't support this optimisation when compiling into Windows DLLs yet +because they don't support cross package data references well. +-} + +buildDynCon' dflags platform binder _ _cc con [arg] + | maybeIntLikeCon con + , platformOS platform /= OSMinGW32 || not (positionIndependent dflags) + , NonVoid (StgLitArg (LitNumber LitNumInt val _)) <- arg + , val <= fromIntegral (mAX_INTLIKE dflags) -- Comparisons at type Integer! + , val >= fromIntegral (mIN_INTLIKE dflags) -- ...ditto... 
+ = do { let intlike_lbl = mkCmmClosureLabel rtsUnitId (fsLit "stg_INTLIKE") + val_int = fromIntegral val :: Int + offsetW = (val_int - mIN_INTLIKE dflags) * (fixedHdrSizeW dflags + 1) + -- INTLIKE closures consist of a header and one word payload + intlike_amode = cmmLabelOffW dflags intlike_lbl offsetW + ; return ( litIdInfo dflags binder (mkConLFInfo con) intlike_amode + , return mkNop) } + +buildDynCon' dflags platform binder _ _cc con [arg] + | maybeCharLikeCon con + , platformOS platform /= OSMinGW32 || not (positionIndependent dflags) + , NonVoid (StgLitArg (LitChar val)) <- arg + , let val_int = ord val :: Int + , val_int <= mAX_CHARLIKE dflags + , val_int >= mIN_CHARLIKE dflags + = do { let charlike_lbl = mkCmmClosureLabel rtsUnitId (fsLit "stg_CHARLIKE") + offsetW = (val_int - mIN_CHARLIKE dflags) * (fixedHdrSizeW dflags + 1) + -- CHARLIKE closures consist of a header and one word payload + charlike_amode = cmmLabelOffW dflags charlike_lbl offsetW + ; return ( litIdInfo dflags binder (mkConLFInfo con) charlike_amode + , return mkNop) } + +-------- buildDynCon': the general case ----------- +buildDynCon' dflags _ binder actually_bound ccs con args + = do { (id_info, reg) <- rhsIdInfo binder lf_info + ; return (id_info, gen_code reg) + } + where + lf_info = mkConLFInfo con + + gen_code reg + = do { let (tot_wds, ptr_wds, args_w_offsets) + = mkVirtConstrOffsets dflags (addArgReps args) + nonptr_wds = tot_wds - ptr_wds + info_tbl = mkDataConInfoTable dflags con False + ptr_wds nonptr_wds + ; let ticky_name | actually_bound = Just binder + | otherwise = Nothing + + ; hp_plus_n <- allocDynClosure ticky_name info_tbl lf_info + use_cc blame_cc args_w_offsets + ; return (mkRhsInit dflags reg lf_info hp_plus_n) } + where + use_cc -- cost-centre to stick in the object + | isCurrentCCS ccs = cccsExpr + | otherwise = panic "buildDynCon: non-current CCS not implemented" + + blame_cc = use_cc -- cost-centre on which to blame the alloc (same) + + 
+--------------------------------------------------------------- +-- Binding constructor arguments +--------------------------------------------------------------- + +bindConArgs :: AltCon -> LocalReg -> [NonVoid Id] -> FCode [LocalReg] +-- bindConArgs is called from cgAlt of a case +-- (bindConArgs con args) augments the environment with bindings for the +-- binders args, assuming that we have just returned from a 'case' which +-- found a con +bindConArgs (DataAlt con) base args + = ASSERT(not (isUnboxedTupleCon con)) + do dflags <- getDynFlags + let (_, _, args_w_offsets) = mkVirtConstrOffsets dflags (addIdReps args) + tag = tagForCon dflags con + + -- The binding below forces the masking out of the tag bits + -- when accessing the constructor field. + bind_arg :: (NonVoid Id, ByteOff) -> FCode (Maybe LocalReg) + bind_arg (arg@(NonVoid b), offset) + | isDeadBinder b -- See Note [Dead-binder optimisation] in GHC.StgToCmm.Expr + = return Nothing + | otherwise + = do { emit $ mkTaggedObjectLoad dflags (idToReg dflags arg) + base offset tag + ; Just <$> bindArgToReg arg } + + mapMaybeM bind_arg args_w_offsets + +bindConArgs _other_con _base args + = ASSERT( null args ) return [] diff --git a/compiler/GHC/StgToCmm/Env.hs b/compiler/GHC/StgToCmm/Env.hs new file mode 100644 index 0000000000..e32c6a1ecb --- /dev/null +++ b/compiler/GHC/StgToCmm/Env.hs @@ -0,0 +1,208 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: the binding environment +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- +module GHC.StgToCmm.Env ( + CgIdInfo, + + litIdInfo, lneIdInfo, rhsIdInfo, mkRhsInit, + idInfoToAmode, + + addBindC, addBindsC, + + bindArgsToRegs, bindToReg, rebindToReg, + bindArgToReg, idToReg, + getArgAmode, getNonVoidArgAmodes, + getCgIdInfo, + maybeLetNoEscape, + ) where + +#include "HsVersions.h" + +import 
GhcPrelude + +import TyCon +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import CLabel + +import BlockId +import CmmExpr +import CmmUtils +import DynFlags +import Id +import MkGraph +import Name +import Outputable +import StgSyn +import Type +import TysPrim +import UniqFM +import Util +import VarEnv + +------------------------------------- +-- Manipulating CgIdInfo +------------------------------------- + +mkCgIdInfo :: Id -> LambdaFormInfo -> CmmExpr -> CgIdInfo +mkCgIdInfo id lf expr + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = CmmLoc expr } + +litIdInfo :: DynFlags -> Id -> LambdaFormInfo -> CmmLit -> CgIdInfo +litIdInfo dflags id lf lit + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = CmmLoc (addDynTag dflags (CmmLit lit) tag) } + where + tag = lfDynTag dflags lf + +lneIdInfo :: DynFlags -> Id -> [NonVoid Id] -> CgIdInfo +lneIdInfo dflags id regs + = CgIdInfo { cg_id = id, cg_lf = lf + , cg_loc = LneLoc blk_id (map (idToReg dflags) regs) } + where + lf = mkLFLetNoEscape + blk_id = mkBlockId (idUnique id) + + +rhsIdInfo :: Id -> LambdaFormInfo -> FCode (CgIdInfo, LocalReg) +rhsIdInfo id lf_info + = do dflags <- getDynFlags + reg <- newTemp (gcWord dflags) + return (mkCgIdInfo id lf_info (CmmReg (CmmLocal reg)), reg) + +mkRhsInit :: DynFlags -> LocalReg -> LambdaFormInfo -> CmmExpr -> CmmAGraph +mkRhsInit dflags reg lf_info expr + = mkAssign (CmmLocal reg) (addDynTag dflags expr (lfDynTag dflags lf_info)) + +idInfoToAmode :: CgIdInfo -> CmmExpr +-- Returns a CmmExpr for the *tagged* pointer +idInfoToAmode (CgIdInfo { cg_loc = CmmLoc e }) = e +idInfoToAmode cg_info + = pprPanic "idInfoToAmode" (ppr (cg_id cg_info)) -- LneLoc + +addDynTag :: DynFlags -> CmmExpr -> DynTag -> CmmExpr +-- A tag adds a byte offset to the pointer +addDynTag dflags expr tag = cmmOffsetB dflags expr tag + +maybeLetNoEscape :: CgIdInfo -> Maybe (BlockId, [LocalReg]) +maybeLetNoEscape (CgIdInfo { cg_loc = LneLoc blk_id args}) = Just (blk_id, 
args) +maybeLetNoEscape _other = Nothing + + + +--------------------------------------------------------- +-- The binding environment +-- +-- There are three basic routines, for adding (addBindC), +-- modifying(modifyBindC) and looking up (getCgIdInfo) bindings. +--------------------------------------------------------- + +addBindC :: CgIdInfo -> FCode () +addBindC stuff_to_bind = do + binds <- getBinds + setBinds $ extendVarEnv binds (cg_id stuff_to_bind) stuff_to_bind + +addBindsC :: [CgIdInfo] -> FCode () +addBindsC new_bindings = do + binds <- getBinds + let new_binds = foldl' (\ binds info -> extendVarEnv binds (cg_id info) info) + binds + new_bindings + setBinds new_binds + +getCgIdInfo :: Id -> FCode CgIdInfo +getCgIdInfo id + = do { dflags <- getDynFlags + ; local_binds <- getBinds -- Try local bindings first + ; case lookupVarEnv local_binds id of { + Just info -> return info ; + Nothing -> do { + + -- Should be imported; make up a CgIdInfo for it + let name = idName id + ; if isExternalName name then + let ext_lbl + | isUnliftedType (idType id) = + -- An unlifted external Id must refer to a top-level + -- string literal. See Note [Bytes label] in CLabel. 
+ ASSERT( idType id `eqType` addrPrimTy ) + mkBytesLabel name + | otherwise = mkClosureLabel name $ idCafInfo id + in return $ + litIdInfo dflags id (mkLFImported id) (CmmLabel ext_lbl) + else + cgLookupPanic id -- Bug + }}} + +cgLookupPanic :: Id -> FCode a +cgLookupPanic id + = do local_binds <- getBinds + pprPanic "GHC.StgToCmm.Env: variable not found" + (vcat [ppr id, + text "local binds for:", + pprUFM local_binds $ \infos -> + vcat [ ppr (cg_id info) | info <- infos ] + ]) + + +-------------------- +getArgAmode :: NonVoid StgArg -> FCode CmmExpr +getArgAmode (NonVoid (StgVarArg var)) = idInfoToAmode <$> getCgIdInfo var +getArgAmode (NonVoid (StgLitArg lit)) = CmmLit <$> cgLit lit + +getNonVoidArgAmodes :: [StgArg] -> FCode [CmmExpr] +-- NB: Filters out void args, +-- so the result list may be shorter than the argument list +getNonVoidArgAmodes [] = return [] +getNonVoidArgAmodes (arg:args) + | isVoidRep (argPrimRep arg) = getNonVoidArgAmodes args + | otherwise = do { amode <- getArgAmode (NonVoid arg) + ; amodes <- getNonVoidArgAmodes args + ; return ( amode : amodes ) } + + +------------------------------------------------------------------------ +-- Interface functions for binding and re-binding names +------------------------------------------------------------------------ + +bindToReg :: NonVoid Id -> LambdaFormInfo -> FCode LocalReg +-- Bind an Id to a fresh LocalReg +bindToReg nvid@(NonVoid id) lf_info + = do dflags <- getDynFlags + let reg = idToReg dflags nvid + addBindC (mkCgIdInfo id lf_info (CmmReg (CmmLocal reg))) + return reg + +rebindToReg :: NonVoid Id -> FCode LocalReg +-- Like bindToReg, but the Id is already in scope, so +-- get its LF info from the envt +rebindToReg nvid@(NonVoid id) + = do { info <- getCgIdInfo id + ; bindToReg nvid (cg_lf info) } + +bindArgToReg :: NonVoid Id -> FCode LocalReg +bindArgToReg nvid@(NonVoid id) = bindToReg nvid (mkLFArgument id) + +bindArgsToRegs :: [NonVoid Id] -> FCode [LocalReg] +bindArgsToRegs args = 
mapM bindArgToReg args + +idToReg :: DynFlags -> NonVoid Id -> LocalReg +-- Make a register from an Id, typically a function argument, +-- free variable, or case binder +-- +-- We re-use the Unique from the Id to make it easier to see what is going on +-- +-- By now the Ids should be uniquely named; else one would worry +-- about accidental collision +idToReg dflags (NonVoid id) + = LocalReg (idUnique id) + (primRepCmmType dflags (idPrimRep id)) diff --git a/compiler/GHC/StgToCmm/Expr.hs b/compiler/GHC/StgToCmm/Expr.hs new file mode 100644 index 0000000000..59cd246441 --- /dev/null +++ b/compiler/GHC/StgToCmm/Expr.hs @@ -0,0 +1,992 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Stg to C-- code generation: expressions +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Expr ( cgExpr ) where + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import {-# SOURCE #-} GHC.StgToCmm.Bind ( cgBind ) + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Env +import GHC.StgToCmm.Con +import GHC.StgToCmm.Prof (saveCurrentCostCentre, restoreCurrentCostCentre, emitSetCCC) +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Prim +import GHC.StgToCmm.Hpc +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Closure + +import StgSyn + +import MkGraph +import BlockId +import Cmm +import CmmInfo +import CoreSyn +import DataCon +import ForeignCall +import Id +import PrimOp +import TyCon +import Type ( isUnliftedType ) +import RepType ( isVoidTy, countConRepArgs, primRepSlot ) +import CostCentre ( CostCentreStack, currentCCS ) +import Maybes +import Util +import FastString +import Outputable + +import Control.Monad (unless,void) +import Control.Arrow (first) +import Data.Function ( on ) + +------------------------------------------------------------------------ +-- 
cgExpr: the main function +------------------------------------------------------------------------ + +cgExpr :: CgStgExpr -> FCode ReturnKind + +cgExpr (StgApp fun args) = cgIdApp fun args + +-- seq# a s ==> a +-- See Note [seq# magic] in PrelRules +cgExpr (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _res_ty) = + cgIdApp a [] + +-- dataToTag# :: a -> Int# +-- See Note [dataToTag#] in primops.txt.pp +cgExpr (StgOpApp (StgPrimOp DataToTagOp) [StgVarArg a] _res_ty) = do + dflags <- getDynFlags + emitComment (mkFastString "dataToTag#") + tmp <- newTemp (bWord dflags) + _ <- withSequel (AssignTo [tmp] False) (cgIdApp a []) + -- TODO: For small types look at the tag bits instead of reading info table + emitReturn [getConstrTag dflags (cmmUntag dflags (CmmReg (CmmLocal tmp)))] + +cgExpr (StgOpApp op args ty) = cgOpApp op args ty +cgExpr (StgConApp con args _)= cgConApp con args +cgExpr (StgTick t e) = cgTick t >> cgExpr e +cgExpr (StgLit lit) = do cmm_lit <- cgLit lit + emitReturn [CmmLit cmm_lit] + +cgExpr (StgLet _ binds expr) = do { cgBind binds; cgExpr expr } +cgExpr (StgLetNoEscape _ binds expr) = + do { u <- newUnique + ; let join_id = mkBlockId u + ; cgLneBinds join_id binds + ; r <- cgExpr expr + ; emitLabel join_id + ; return r } + +cgExpr (StgCase expr bndr alt_type alts) = + cgCase expr bndr alt_type alts + +cgExpr (StgLam {}) = panic "cgExpr: StgLam" + +------------------------------------------------------------------------ +-- Let no escape +------------------------------------------------------------------------ + +{- Generating code for a let-no-escape binding, aka join point is very +very similar to what we do for a case expression. The duality is +between + let-no-escape x = b + in e +and + case e of ... 
-> b + +That is, the RHS of 'x' (ie 'b') will execute *later*, just like +the alternative of the case; it needs to be compiled in an environment +in which all volatile bindings are forgotten, and the free vars are +bound only to stable things like stack locations.. The 'e' part will +execute *next*, just like the scrutinee of a case. -} + +------------------------- +cgLneBinds :: BlockId -> CgStgBinding -> FCode () +cgLneBinds join_id (StgNonRec bndr rhs) + = do { local_cc <- saveCurrentCostCentre + -- See Note [Saving the current cost centre] + ; (info, fcode) <- cgLetNoEscapeRhs join_id local_cc bndr rhs + ; fcode + ; addBindC info } + +cgLneBinds join_id (StgRec pairs) + = do { local_cc <- saveCurrentCostCentre + ; r <- sequence $ unzipWith (cgLetNoEscapeRhs join_id local_cc) pairs + ; let (infos, fcodes) = unzip r + ; addBindsC infos + ; sequence_ fcodes + } + +------------------------- +cgLetNoEscapeRhs + :: BlockId -- join point for successor of let-no-escape + -> Maybe LocalReg -- Saved cost centre + -> Id + -> CgStgRhs + -> FCode (CgIdInfo, FCode ()) + +cgLetNoEscapeRhs join_id local_cc bndr rhs = + do { (info, rhs_code) <- cgLetNoEscapeRhsBody local_cc bndr rhs + ; let (bid, _) = expectJust "cgLetNoEscapeRhs" $ maybeLetNoEscape info + ; let code = do { (_, body) <- getCodeScoped rhs_code + ; emitOutOfLine bid (first (<*> mkBranch join_id) body) } + ; return (info, code) + } + +cgLetNoEscapeRhsBody + :: Maybe LocalReg -- Saved cost centre + -> Id + -> CgStgRhs + -> FCode (CgIdInfo, FCode ()) +cgLetNoEscapeRhsBody local_cc bndr (StgRhsClosure _ cc _upd args body) + = cgLetNoEscapeClosure bndr local_cc cc (nonVoidIds args) body +cgLetNoEscapeRhsBody local_cc bndr (StgRhsCon cc con args) + = cgLetNoEscapeClosure bndr local_cc cc [] + (StgConApp con args (pprPanic "cgLetNoEscapeRhsBody" $ + text "StgRhsCon doesn't have type args")) + -- For a constructor RHS we want to generate a single chunk of + -- code which can be jumped to from many places, which will + -- 
return the constructor. It's easy; just behave as if it + -- was an StgRhsClosure with a ConApp inside! + +------------------------- +cgLetNoEscapeClosure + :: Id -- binder + -> Maybe LocalReg -- Slot for saved current cost centre + -> CostCentreStack -- XXX: *** NOT USED *** why not? + -> [NonVoid Id] -- Args (as in \ args -> body) + -> CgStgExpr -- Body (as in above) + -> FCode (CgIdInfo, FCode ()) + +cgLetNoEscapeClosure bndr cc_slot _unused_cc args body + = do dflags <- getDynFlags + return ( lneIdInfo dflags bndr args + , code ) + where + code = forkLneBody $ do { + ; withNewTickyCounterLNE (idName bndr) args $ do + ; restoreCurrentCostCentre cc_slot + ; arg_regs <- bindArgsToRegs args + ; void $ noEscapeHeapCheck arg_regs (tickyEnterLNE >> cgExpr body) } + + +------------------------------------------------------------------------ +-- Case expressions +------------------------------------------------------------------------ + +{- Note [Compiling case expressions] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +It is quite interesting to decide whether to put a heap-check at the +start of each alternative. Of course we certainly have to do so if +the case forces an evaluation, or if there is a primitive op which can +trigger GC. + +A more interesting situation is this (a Plan-B situation) + + !P!; + ...P... + case x# of + 0# -> !Q!; ...Q... + default -> !R!; ...R... + +where !x! indicates a possible heap-check point. The heap checks +in the alternatives *can* be omitted, in which case the topmost +heapcheck will take their worst case into account. + +In favour of omitting !Q!, !R!: + + - *May* save a heap overflow test, + if ...P... allocates anything. + + - We can use relative addressing from a single Hp to + get at all the closures so allocated. + + - No need to save volatile vars etc across heap checks + in !Q!, !R! + +Against omitting !Q!, !R! + + - May put a heap-check into the inner loop. Suppose + the main loop is P -> R -> P -> R... 
+ Q is the loop exit, and only it does allocation. + This only hurts us if P does no allocation. If P allocates, + then there is a heap check in the inner loop anyway. + + - May do more allocation than reqd. This sometimes bites us + badly. For example, nfib (ha!) allocates about 30\% more space if the + worst-casing is done, because many many calls to nfib are leaf calls + which don't need to allocate anything. + + We can un-allocate, but that costs an instruction + +Neither problem hurts us if there is only one alternative. + +Suppose the inner loop is P->R->P->R etc. Then here is +how many heap checks we get in the *inner loop* under various +conditions + + Alloc Heap check in branches (!Q!, !R!)? + P Q R yes no (absorb to !P!) +-------------------------------------- + n n n 0 0 + n y n 0 1 + n . y 1 1 + y . y 2 1 + y . n 1 1 + +Best choices: absorb heap checks from Q and R into !P! iff + a) P itself does some allocation +or + b) P does allocation, or there is exactly one alternative + +We adopt (b) because that is more likely to put the heap check at the +entry to a function, when not many things are live. After a bunch of +single-branch cases, we may have lots of things live + +Hence: two basic plans for + + case e of r { alts } + +------ Plan A: the general case --------- + + ...save current cost centre... + + ...code for e, + with sequel (SetLocals r) + + ...restore current cost centre... + ...code for alts... + ...alts do their own heap checks + +------ Plan B: special case when --------- + (i) e does not allocate or call GC + (ii) either upstream code performs allocation + or there is just one alternative + + Then heap allocation in the (single) case branch + is absorbed by the upstream check. + Very common example: primops on unboxed values + + ...code for e, + with sequel (SetLocals r)... + + ...code for alts... + ...no heap check... 
+-} + + + +------------------------------------- +data GcPlan + = GcInAlts -- Put a GC check at the start the case alternatives, + [LocalReg] -- which binds these registers + | NoGcInAlts -- The scrutinee is a primitive value, or a call to a + -- primitive op which does no GC. Absorb the allocation + -- of the case alternative(s) into the upstream check + +------------------------------------- +cgCase :: CgStgExpr -> Id -> AltType -> [CgStgAlt] -> FCode ReturnKind + +cgCase (StgOpApp (StgPrimOp op) args _) bndr (AlgAlt tycon) alts + | isEnumerationTyCon tycon -- Note [case on bool] + = do { tag_expr <- do_enum_primop op args + + -- If the binder is not dead, convert the tag to a constructor + -- and assign it. See Note [Dead-binder optimisation] + ; unless (isDeadBinder bndr) $ do + { dflags <- getDynFlags + ; tmp_reg <- bindArgToReg (NonVoid bndr) + ; emitAssign (CmmLocal tmp_reg) + (tagToClosure dflags tycon tag_expr) } + + ; (mb_deflt, branches) <- cgAlgAltRhss (NoGcInAlts,AssignedDirectly) + (NonVoid bndr) alts + -- See Note [GC for conditionals] + ; emitSwitch tag_expr branches mb_deflt 0 (tyConFamilySize tycon - 1) + ; return AssignedDirectly + } + where + do_enum_primop :: PrimOp -> [StgArg] -> FCode CmmExpr + do_enum_primop TagToEnumOp [arg] -- No code! + = getArgAmode (NonVoid arg) + do_enum_primop primop args + = do dflags <- getDynFlags + tmp <- newTemp (bWord dflags) + cgPrimOp [tmp] primop args + return (CmmReg (CmmLocal tmp)) + +{- +Note [case on bool] +~~~~~~~~~~~~~~~~~~~ +This special case handles code like + + case a <# b of + True -> + False -> + +--> case tagToEnum# (a <$# b) of + True -> .. ; False -> ... + +--> case (a <$# b) of r -> + case tagToEnum# r of + True -> .. ; False -> ... + +If we let the ordinary case code handle it, we'll get something like + + tmp1 = a < b + tmp2 = Bool_closure_tbl[tmp1] + if (tmp2 & 7 != 0) then ... // normal tagged case + +but this junk won't optimise away. 
What we really want is just an +inline comparison: + + if (a < b) then ... + +So we add a special case to generate + + tmp1 = a < b + if (tmp1 == 0) then ... + +and later optimisations will further improve this. + +Now that #6135 has been resolved it should be possible to remove that +special case. The idea behind this special case and pre-6135 implementation +of Bool-returning primops was that tagToEnum# was added implicitly in the +codegen and then optimized away. Now the call to tagToEnum# is explicit +in the source code, which allows to optimize it away at the earlier stages +of compilation (i.e. at the Core level). + +Note [Scrutinising VoidRep] +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Suppose we have this STG code: + f = \[s : State# RealWorld] -> + case s of _ -> blah +This is very odd. Why are we scrutinising a state token? But it +can arise with bizarre NOINLINE pragmas (#9964) + crash :: IO () + crash = IO (\s -> let {-# NOINLINE s' #-} + s' = s + in (# s', () #)) + +Now the trouble is that 's' has VoidRep, and we do not bind void +arguments in the environment; they don't live anywhere. See the +calls to nonVoidIds in various places. So we must not look up +'s' in the environment. Instead, just evaluate the RHS! Simple. + +Note [Dead-binder optimisation] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A case-binder, or data-constructor argument, may be marked as dead, +because we preserve occurrence-info on binders in CoreTidy (see +CoreTidy.tidyIdBndr). + +If the binder is dead, we can sometimes eliminate a load. While +CmmSink will eliminate that load, it's very easy to kill it at source +(giving CmmSink less work to do), and in any case CmmSink only runs +with -O. Since the majority of case binders are dead, this +optimisation probably still has a great benefit-cost ratio and we want +to keep it for -O0. See also Phab:D5358. 
+ +This probably also was the reason for occurrence hack in Phab:D5339 to +exist, perhaps because the occurrence information preserved by +'CoreTidy.tidyIdBndr' was insufficient. But now that CmmSink does the +job we deleted the hacks. +-} + +cgCase (StgApp v []) _ (PrimAlt _) alts + | isVoidRep (idPrimRep v) -- See Note [Scrutinising VoidRep] + , [(DEFAULT, _, rhs)] <- alts + = cgExpr rhs + +{- Note [Dodgy unsafeCoerce 1] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Consider + case (x :: HValue) |> co of (y :: MutVar# Int) + DEFAULT -> ... +We want to gnerate an assignment + y := x +We want to allow this assignment to be generated in the case when the +types are compatible, because this allows some slightly-dodgy but +occasionally-useful casts to be used, such as in RtClosureInspect +where we cast an HValue to a MutVar# so we can print out the contents +of the MutVar#. If instead we generate code that enters the HValue, +then we'll get a runtime panic, because the HValue really is a +MutVar#. The types are compatible though, so we can just generate an +assignment. +-} +cgCase (StgApp v []) bndr alt_type@(PrimAlt _) alts + | isUnliftedType (idType v) -- Note [Dodgy unsafeCoerce 1] + || reps_compatible + = -- assignment suffices for unlifted types + do { dflags <- getDynFlags + ; unless reps_compatible $ + pprPanic "cgCase: reps do not match, perhaps a dodgy unsafeCoerce?" + (pp_bndr v $$ pp_bndr bndr) + ; v_info <- getCgIdInfo v + ; emitAssign (CmmLocal (idToReg dflags (NonVoid bndr))) + (idInfoToAmode v_info) + -- Add bndr to the environment + ; _ <- bindArgToReg (NonVoid bndr) + ; cgAlts (NoGcInAlts,AssignedDirectly) (NonVoid bndr) alt_type alts } + where + reps_compatible = ((==) `on` (primRepSlot . idPrimRep)) v bndr + -- Must compare SlotTys, not proper PrimReps, because with unboxed sums, + -- the types of the binders are generated from slotPrimRep and might not + -- match. 
Test case: + -- swap :: (# Int | Int #) -> (# Int | Int #) + -- swap (# x | #) = (# | x #) + -- swap (# | y #) = (# y | #) + + pp_bndr id = ppr id <+> dcolon <+> ppr (idType id) <+> parens (ppr (idPrimRep id)) + +{- Note [Dodgy unsafeCoerce 2, #3132] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In all other cases of a lifted Id being cast to an unlifted type, the +Id should be bound to bottom, otherwise this is an unsafe use of +unsafeCoerce. We can generate code to enter the Id and assume that +it will never return. Hence, we emit the usual enter/return code, and +because bottom must be untagged, it will be entered. The Sequel is a +type-correct assignment, albeit bogus. The (dead) continuation loops; +it would be better to invoke some kind of panic function here. +-} +cgCase scrut@(StgApp v []) _ (PrimAlt _) _ + = do { dflags <- getDynFlags + ; mb_cc <- maybeSaveCostCentre True + ; _ <- withSequel + (AssignTo [idToReg dflags (NonVoid v)] False) (cgExpr scrut) + ; restoreCurrentCostCentre mb_cc + ; emitComment $ mkFastString "should be unreachable code" + ; l <- newBlockId + ; emitLabel l + ; emit (mkBranch l) -- an infinite loop + ; return AssignedDirectly + } + +{- Note [Handle seq#] +~~~~~~~~~~~~~~~~~~~~~ +See Note [seq# magic] in PrelRules. +The special case for seq# in cgCase does this: + + case seq# a s of v + (# s', a' #) -> e +==> + case a of v + (# s', a' #) -> e + +(taking advantage of the fact that the return convention for (# State#, a #) +is the same as the return convention for just 'a') +-} + +cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr alt_type alts + = -- Note [Handle seq#] + -- And see Note [seq# magic] in PrelRules + -- Use the same return convention as vanilla 'a'. 
+ cgCase (StgApp a []) bndr alt_type alts + +cgCase scrut bndr alt_type alts + = -- the general case + do { dflags <- getDynFlags + ; up_hp_usg <- getVirtHp -- Upstream heap usage + ; let ret_bndrs = chooseReturnBndrs bndr alt_type alts + alt_regs = map (idToReg dflags) ret_bndrs + ; simple_scrut <- isSimpleScrut scrut alt_type + ; let do_gc | is_cmp_op scrut = False -- See Note [GC for conditionals] + | not simple_scrut = True + | isSingleton alts = False + | up_hp_usg > 0 = False + | otherwise = True + -- cf Note [Compiling case expressions] + gc_plan = if do_gc then GcInAlts alt_regs else NoGcInAlts + + ; mb_cc <- maybeSaveCostCentre simple_scrut + + ; let sequel = AssignTo alt_regs do_gc{- Note [scrut sequel] -} + ; ret_kind <- withSequel sequel (cgExpr scrut) + ; restoreCurrentCostCentre mb_cc + ; _ <- bindArgsToRegs ret_bndrs + ; cgAlts (gc_plan,ret_kind) (NonVoid bndr) alt_type alts + } + where + is_cmp_op (StgOpApp (StgPrimOp op) _ _) = isComparisonPrimOp op + is_cmp_op _ = False + +{- Note [GC for conditionals] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For boolean conditionals it seems that we have always done NoGcInAlts. +That is, we have always done the GC check before the conditional. +This is enshrined in the special case for + case tagToEnum# (a>b) of ... +See Note [case on bool] + +It's odd, and it's flagrantly inconsistent with the rules described +Note [Compiling case expressions]. However, after eliminating the +tagToEnum# (#13397) we will have: + case (a>b) of ... +Rather than make it behave quite differently, I am testing for a +comparison operator here in in the general case as well. + +ToDo: figure out what the Right Rule should be. + +Note [scrut sequel] +~~~~~~~~~~~~~~~~~~~ +The job of the scrutinee is to assign its value(s) to alt_regs. +Additionally, if we plan to do a heap-check in the alternatives (see +Note [Compiling case expressions]), then we *must* retreat Hp to +recover any unused heap before passing control to the sequel. 
If we +don't do this, then any unused heap will become slop because the heap +check will reset the heap usage. Slop in the heap breaks LDV profiling +(+RTS -hb) which needs to do a linear sweep through the nursery. + + +Note [Inlining out-of-line primops and heap checks] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If shouldInlinePrimOp returns True when called from GHC.StgToCmm.Expr for the +purpose of heap check placement, we *must* inline the primop later in +GHC.StgToCmm.Prim. If we don't things will go wrong. +-} + +----------------- +maybeSaveCostCentre :: Bool -> FCode (Maybe LocalReg) +maybeSaveCostCentre simple_scrut + | simple_scrut = return Nothing + | otherwise = saveCurrentCostCentre + + +----------------- +isSimpleScrut :: CgStgExpr -> AltType -> FCode Bool +-- Simple scrutinee, does not block or allocate; hence safe to amalgamate +-- heap usage from alternatives into the stuff before the case +-- NB: if you get this wrong, and claim that the expression doesn't allocate +-- when it does, you'll deeply mess up allocation +isSimpleScrut (StgOpApp op args _) _ = isSimpleOp op args +isSimpleScrut (StgLit _) _ = return True -- case 1# of { 0# -> ..; ... } +isSimpleScrut (StgApp _ []) (PrimAlt _) = return True -- case x# of { 0# -> ..; ... } +isSimpleScrut _ _ = return False + +isSimpleOp :: StgOp -> [StgArg] -> FCode Bool +-- True iff the op cannot block or allocate +isSimpleOp (StgFCallOp (CCall (CCallSpec _ _ safe)) _) _ = return $! not (playSafe safe) +-- dataToTag# evalautes its argument, see Note [dataToTag#] in primops.txt.pp +isSimpleOp (StgPrimOp DataToTagOp) _ = return False +isSimpleOp (StgPrimOp op) stg_args = do + arg_exprs <- getNonVoidArgAmodes stg_args + dflags <- getDynFlags + -- See Note [Inlining out-of-line primops and heap checks] + return $! 
isJust $ shouldInlinePrimOp dflags op arg_exprs +isSimpleOp (StgPrimCallOp _) _ = return False + +----------------- +chooseReturnBndrs :: Id -> AltType -> [CgStgAlt] -> [NonVoid Id] +-- These are the binders of a case that are assigned by the evaluation of the +-- scrutinee. +-- They're non-void, see Note [Post-unarisation invariants] in UnariseStg. +chooseReturnBndrs bndr (PrimAlt _) _alts + = assertNonVoidIds [bndr] + +chooseReturnBndrs _bndr (MultiValAlt n) [(_, ids, _)] + = ASSERT2(ids `lengthIs` n, ppr n $$ ppr ids $$ ppr _bndr) + assertNonVoidIds ids -- 'bndr' is not assigned! + +chooseReturnBndrs bndr (AlgAlt _) _alts + = assertNonVoidIds [bndr] -- Only 'bndr' is assigned + +chooseReturnBndrs bndr PolyAlt _alts + = assertNonVoidIds [bndr] -- Only 'bndr' is assigned + +chooseReturnBndrs _ _ _ = panic "chooseReturnBndrs" + -- MultiValAlt has only one alternative + +------------------------------------- +cgAlts :: (GcPlan,ReturnKind) -> NonVoid Id -> AltType -> [CgStgAlt] + -> FCode ReturnKind +-- At this point the result of the case are in the binders +cgAlts gc_plan _bndr PolyAlt [(_, _, rhs)] + = maybeAltHeapCheck gc_plan (cgExpr rhs) + +cgAlts gc_plan _bndr (MultiValAlt _) [(_, _, rhs)] + = maybeAltHeapCheck gc_plan (cgExpr rhs) + -- Here bndrs are *already* in scope, so don't rebind them + +cgAlts gc_plan bndr (PrimAlt _) alts + = do { dflags <- getDynFlags + + ; tagged_cmms <- cgAltRhss gc_plan bndr alts + + ; let bndr_reg = CmmLocal (idToReg dflags bndr) + (DEFAULT,deflt) = head tagged_cmms + -- PrimAlts always have a DEFAULT case + -- and it always comes first + + tagged_cmms' = [(lit,code) + | (LitAlt lit, code) <- tagged_cmms] + ; emitCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt + ; return AssignedDirectly } + +cgAlts gc_plan bndr (AlgAlt tycon) alts + = do { dflags <- getDynFlags + + ; (mb_deflt, branches) <- cgAlgAltRhss gc_plan bndr alts + + ; let fam_sz = tyConFamilySize tycon + bndr_reg = CmmLocal (idToReg dflags bndr) + + -- Is the 
constructor tag in the node reg? + ; if isSmallFamily dflags fam_sz + then do + let -- Yes, bndr_reg has constr. tag in ls bits + tag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg) + branches' = [(tag+1,branch) | (tag,branch) <- branches] + emitSwitch tag_expr branches' mb_deflt 1 fam_sz + + else -- No, get tag from info table + let -- Note that ptr _always_ has tag 1 + -- when the family size is big enough + untagged_ptr = cmmRegOffB bndr_reg (-1) + tag_expr = getConstrTag dflags (untagged_ptr) + in emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1) + + ; return AssignedDirectly } + +cgAlts _ _ _ _ = panic "cgAlts" + -- UbxTupAlt and PolyAlt have only one alternative + + +-- Note [alg-alt heap check] +-- +-- In an algebraic case with more than one alternative, we will have +-- code like +-- +-- L0: +-- x = R1 +-- goto L1 +-- L1: +-- if (x & 7 >= 2) then goto L2 else goto L3 +-- L2: +-- Hp = Hp + 16 +-- if (Hp > HpLim) then goto L4 +-- ... +-- L4: +-- call gc() returns to L5 +-- L5: +-- x = R1 +-- goto L1 + +------------------- +cgAlgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt] + -> FCode ( Maybe CmmAGraphScoped + , [(ConTagZ, CmmAGraphScoped)] ) +cgAlgAltRhss gc_plan bndr alts + = do { tagged_cmms <- cgAltRhss gc_plan bndr alts + + ; let { mb_deflt = case tagged_cmms of + ((DEFAULT,rhs) : _) -> Just rhs + _other -> Nothing + -- DEFAULT is always first, if present + + ; branches = [ (dataConTagZ con, cmm) + | (DataAlt con, cmm) <- tagged_cmms ] + } + + ; return (mb_deflt, branches) + } + + +------------------- +cgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt] + -> FCode [(AltCon, CmmAGraphScoped)] +cgAltRhss gc_plan bndr alts = do + dflags <- getDynFlags + let + base_reg = idToReg dflags bndr + cg_alt :: CgStgAlt -> FCode (AltCon, CmmAGraphScoped) + cg_alt (con, bndrs, rhs) + = getCodeScoped $ + maybeAltHeapCheck gc_plan $ + do { _ <- bindConArgs con base_reg (assertNonVoidIds bndrs) + -- alt binders are always non-void, + -- see Note 
-- (continuation of cgAltRhss.cg_alt from the previous chunk)
               -- [Post-unarisation invariants] in UnariseStg
           ; _ <- cgExpr rhs
           ; return con }
  forkAlts (map cg_alt alts)

-- | Wrap alternative code in whatever heap check the GcPlan demands.
maybeAltHeapCheck :: (GcPlan,ReturnKind) -> FCode a -> FCode a
maybeAltHeapCheck (NoGcInAlts,_)  code = code
maybeAltHeapCheck (GcInAlts regs, AssignedDirectly) code =
  altHeapCheck regs code
maybeAltHeapCheck (GcInAlts regs, ReturnedTo lret off) code =
  altHeapCheckReturnsTo regs lret off code

-----------------------------------------------------------------------------
--              Tail calls
-----------------------------------------------------------------------------

-- | Compile a saturated constructor application in a tail position.
cgConApp :: DataCon -> [StgArg] -> FCode ReturnKind
cgConApp con stg_args
  | isUnboxedTupleCon con       -- Unboxed tuple: assign and return
  = do { arg_exprs <- getNonVoidArgAmodes stg_args
       ; tickyUnboxedTupleReturn (length arg_exprs)
       ; emitReturn arg_exprs }

  | otherwise                   -- Boxed constructors; allocate and return
  = ASSERT2( stg_args `lengthIs` countConRepArgs con, ppr con <> parens (ppr (countConRepArgs con)) <+> ppr stg_args )
    do  { (idinfo, fcode_init) <- buildDynCon (dataConWorkId con) False
                                     currentCCS con (assertNonVoidStgArgs stg_args)
            -- con args are always non-void,
            -- see Note [Post-unarisation invariants] in UnariseStg
            -- The first "con" says that the name bound to this
            -- closure is "con", which is a bit of a fudge, but
            -- it only affects profiling (hence the False)

        ; emit =<< fcode_init
        ; tickyReturnNewCon (length stg_args)
        ; emitReturn [idInfoToAmode idinfo] }

-- | Compile an application of an identifier, dispatching on the call
-- method chosen by 'getCallMethod' (return, enter, slow call, direct
-- call, or a local jump for let-no-escape / self-recursive tail calls).
cgIdApp :: Id -> [StgArg] -> FCode ReturnKind
cgIdApp fun_id args = do
    dflags         <- getDynFlags
    fun_info       <- getCgIdInfo fun_id
    self_loop_info <- getSelfLoop
    let fun_arg  = StgVarArg fun_id
        fun_name = idName fun_id
        fun      = idInfoToAmode fun_info
        lf_info  = cg_lf fun_info
        n_args   = length args
        v_args   = length $ filter (isVoidTy . stgArgType) args
        node_points dflags = nodeMustPointToIt dflags lf_info
    case getCallMethod dflags fun_name fun_id lf_info n_args v_args (cg_loc fun_info) self_loop_info of
        -- A value in WHNF, so we can just return it.
        ReturnIt
          | isVoidTy (idType fun_id) -> emitReturn []
          | otherwise                -> emitReturn [fun]
          -- ToDo: does ReturnIt guarantee tagged?

        EnterIt -> ASSERT( null args )  -- Discarding arguments
                   emitEnter fun

        SlowCall -> do  -- A slow function call via the RTS apply routines
            { tickySlowCall lf_info args
            ; emitComment $ mkFastString "slowCall"
            ; slowCall fun args }

        -- A direct function call (possibly with some left-over arguments)
        DirectEntry lbl arity -> do
            { tickyDirectCall arity args
            ; if node_points dflags
                 then directCall NativeNodeCall   lbl arity (fun_arg:args)
                 else directCall NativeDirectCall lbl arity args }

        -- Let-no-escape call or self-recursive tail-call
        JumpToIt blk_id lne_regs -> do
            { adjustHpBackwards -- always do this before a tail-call
            ; cmm_args <- getNonVoidArgAmodes args
            ; emitMultiAssign lne_regs cmm_args
            ; emit (mkBranch blk_id)
            ; return AssignedDirectly }

-- Note [Self-recursive tail calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- Self-recursive tail calls can be optimized into a local jump in the same
-- way as let-no-escape bindings (see Note [What is a non-escaping let] in
-- stgSyn/CoreToStg.hs). Consider this:
--
-- foo.info:
--     a = R1  // calling convention
--     b = R2
--     goto L1
-- L1: ...
--     ...
--     ...
-- L2: R1 = x
--     R2 = y
--     call foo(R1,R2)
--
-- Instead of putting x and y into registers (or other locations required by
-- the calling convention) and performing a call we can put them into local
-- variables a and b and perform jump to L1:
--
-- foo.info:
--     a = R1
--     b = R2
--     goto L1
-- L1: ...
--     ...
--     ...
+-- L2: a = x +-- b = y +-- goto L1 +-- +-- This can be done only when function is calling itself in a tail position +-- and only if the call passes number of parameters equal to function's arity. +-- Note that this cannot be performed if a function calls itself with a +-- continuation. +-- +-- This in fact implements optimization known as "loopification". It was +-- described in "Low-level code optimizations in the Glasgow Haskell Compiler" +-- by Krzysztof WoÅ›, though we use different approach. Krzysztof performed his +-- optimization at the Cmm level, whereas we perform ours during code generation +-- (Stg-to-Cmm pass) essentially making sure that optimized Cmm code is +-- generated in the first place. +-- +-- Implementation is spread across a couple of places in the code: +-- +-- * FCode monad stores additional information in its reader environment +-- (cgd_self_loop field). This information tells us which function can +-- tail call itself in an optimized way (it is the function currently +-- being compiled), what is the label of a loop header (L1 in example above) +-- and information about local registers in which we should arguments +-- before making a call (this would be a and b in example above). +-- +-- * Whenever we are compiling a function, we set that information to reflect +-- the fact that function currently being compiled can be jumped to, instead +-- of called. This is done in closureCodyBody in GHC.StgToCmm.Bind. +-- +-- * We also have to emit a label to which we will be jumping. We make sure +-- that the label is placed after a stack check but before the heap +-- check. The reason is that making a recursive tail-call does not increase +-- the stack so we only need to check once. But it may grow the heap, so we +-- have to repeat the heap check in every self-call. This is done in +-- do_checks in GHC.StgToCmm.Heap. +-- +-- * When we begin compilation of another closure we remove the additional +-- information from the environment. 
This is done by forkClosureBody +-- in GHC.StgToCmm.Monad. Other functions that duplicate the environment - +-- forkLneBody, forkAlts, codeOnly - duplicate that information. In other +-- words, we only need to clean the environment of the self-loop information +-- when compiling right hand side of a closure (binding). +-- +-- * When compiling a call (cgIdApp) we use getCallMethod to decide what kind +-- of call will be generated. getCallMethod decides to generate a self +-- recursive tail call when (a) environment stores information about +-- possible self tail-call; (b) that tail call is to a function currently +-- being compiled; (c) number of passed non-void arguments is equal to +-- function's arity. (d) loopification is turned on via -floopification +-- command-line option. +-- +-- * Command line option to turn loopification on and off is implemented in +-- DynFlags. +-- +-- +-- Note [Void arguments in self-recursive tail calls] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- State# tokens can get in the way of the loopification optimization as seen in +-- #11372. Consider this: +-- +-- foo :: [a] +-- -> (a -> State# s -> (# State s, Bool #)) +-- -> State# s +-- -> (# State# s, Maybe a #) +-- foo [] f s = (# s, Nothing #) +-- foo (x:xs) f s = case f x s of +-- (# s', b #) -> case b of +-- True -> (# s', Just x #) +-- False -> foo xs f s' +-- +-- We would like to compile the call to foo as a local jump instead of a call +-- (see Note [Self-recursive tail calls]). However, the generated function has +-- an arity of 2 while we apply it to 3 arguments, one of them being of void +-- type. Thus, we mustn't count arguments of void type when checking whether +-- we can turn a call into a self-recursive jump. 
--

-- | Enter a closure.  When the sequel is a plain 'Return' we emit an
-- unconditional enter; when the result will be scrutinised (AssignTo)
-- we emit a tag test so that already-evaluated closures are not
-- entered, sharing a single proc-point between both paths.
emitEnter :: CmmExpr -> FCode ReturnKind
emitEnter fun = do
  { dflags    <- getDynFlags
  ; adjustHpBackwards
  ; sequel    <- getSequel
  ; updfr_off <- getUpdFrameOff
  ; case sequel of
      -- For a return, we have the option of generating a tag-test or
      -- not.  If the value is tagged, we can return directly, which
      -- is quicker than entering the value.  This is a code
      -- size/speed trade-off: when optimising for speed rather than
      -- size we could generate the tag test.
      --
      -- Right now, we do what the old codegen did, and omit the tag
      -- test, just generating an enter.
      Return -> do
        { let entry = entryCode dflags $ closureInfoPtr dflags $ CmmReg nodeReg
        ; emit $ mkJump dflags NativeNodeCall entry
                        [cmmUntag dflags fun] updfr_off
        ; return AssignedDirectly
        }

      -- The result will be scrutinised in the sequel.  This is where
      -- we generate a tag-test to avoid entering the closure if
      -- possible.
      --
      -- The generated code will be something like this:
      --
      --    R1 = fun  -- copyout
      --    if (fun & 7 != 0) goto Lret else goto Lcall
      --  Lcall:
      --    call [fun] returns to Lret
      --  Lret:
      --    fun' = R1  -- copyin
      --    ...
      --
      -- Note in particular that the label Lret is used as a
      -- destination by both the tag-test and the call.  This is
      -- because Lret will necessarily be a proc-point, and we want to
      -- ensure that we generate only one proc-point for this
      -- sequence.
      --
      -- Furthermore, we tell the caller that we generated a native
      -- return continuation by returning (ReturnedTo Lret off), so
      -- that the continuation can be reused by the heap-check failure
      -- code in the enclosing case expression.
      --
      AssignTo res_regs _ -> do
        { lret <- newBlockId
        ; let (off, _, copyin) = copyInOflow dflags NativeReturn (Young lret) res_regs []
        ; lcall <- newBlockId
        ; updfr_off <- getUpdFrameOff
        ; let area = Young lret
        ; let (outArgs, regs, copyout) = copyOutOflow dflags NativeNodeCall Call area
                                                      [fun] updfr_off []
          -- refer to fun via nodeReg after the copyout, to avoid having
          -- both live simultaneously; this sometimes enables fun to be
          -- inlined in the RHS of the R1 assignment.
        ; let entry    = entryCode dflags (closureInfoPtr dflags (CmmReg nodeReg))
              the_call = toCall entry (Just lret) updfr_off off outArgs regs
        ; tscope <- getTickScope
        ; emit $
            copyout <*>
            mkCbranch (cmmIsTagged dflags (CmmReg nodeReg))
                      lret lcall Nothing <*>
            outOfLine lcall (the_call,tscope) <*>
            mkLabel lret tscope <*>
            copyin
        ; return (ReturnedTo lret off)
        }
  }

------------------------------------------------------------------------
--              Ticks
------------------------------------------------------------------------

-- | Generate Cmm code for a tick.  Depending on the type of Tickish,
-- this will either generate actual Cmm instrumentation code, or
-- simply pass on the annotation as a @CmmTickish@.
cgTick :: Tickish Id -> FCode ()
cgTick tick
  = do { dflags <- getDynFlags
       ; case tick of
           ProfNote cc t p -> emitSetCCC cc t p
           HpcTick m n     -> emit (mkTickBox dflags m n)
           SourceNote s n  -> emitTick $ SourceNote s n
           _other          -> return () -- ignore
       }
{-# LANGUAGE DeriveFunctor #-}
-- | Our extended FCode monad.

-- We add a mapping from names to CmmExpr, to support local variable names in
-- the concrete C-- code.  The unique supply of the underlying FCode monad
-- is used to grab a new unique for each local variable.
-- In C--, a local variable can be declared anywhere within a proc,
-- and it scopes from the beginning of the proc to the end.  Hence, we have
-- to collect declarations as we parse the proc, and feed the environment
-- back in circularly (to avoid a two-pass algorithm).

module GHC.StgToCmm.ExtCode (
        CmmParse, unEC,
        Named(..), Env,

        loopDecls,
        getEnv,

        withName,
        getName,

        newLocal,
        newLabel,
        newBlockId,
        newFunctionName,
        newImport,
        lookupLabel,
        lookupName,

        code,
        emit, emitLabel, emitAssign, emitStore,
        getCode, getCodeR, getCodeScoped,
        emitOutOfLine,
        withUpdFrameOff, getUpdFrameOff
)

where

import GhcPrelude

import qualified GHC.StgToCmm.Monad as F
import GHC.StgToCmm.Monad (FCode, newUnique)

import Cmm
import CLabel
import MkGraph

import BlockId
import DynFlags
import FastString
import Module
import UniqFM
import Unique
import UniqSupply

import Control.Monad (ap)

-- | The environment contains variable definitions or blockids.
data Named
        = VarN CmmExpr          -- ^ Holds CmmLit(CmmLabel ..) which gives the label type,
                                --      eg, RtsLabel, ForeignLabel, CmmLabel etc.

        | FunN UnitId           -- ^ A function name from this package
        | LabelN BlockId        -- ^ A blockid of some code or data.

-- | An environment of named things.
type Env        = UniqFM Named

-- | Local declarations that are in scope during code generation.
type Decls      = [(FastString,Named)]

-- | Does a computation in the FCode monad, with a current environment
-- and a list of local declarations.  Returns the resulting list of
-- declarations.
newtype CmmParse a
        = EC { unEC :: String -> Env -> Decls -> FCode (Decls, a) }
        deriving (Functor)

type ExtCode = CmmParse ()

returnExtFC :: a -> CmmParse a
returnExtFC a   = EC $ \_ _ s -> return (s, a)

thenExtFC :: CmmParse a -> (a -> CmmParse b) -> CmmParse b
thenExtFC (EC m) k = EC $ \c e s -> do (s',r) <- m c e s; unEC (k r) c e s'

instance Applicative CmmParse where
      pure  = returnExtFC
      (<*>) = ap

instance Monad CmmParse where
  (>>=) = thenExtFC

instance MonadUnique CmmParse where
  getUniqueSupplyM = code getUniqueSupplyM
  getUniqueM = EC $ \_ _ decls -> do
    u <- getUniqueM
    return (decls, u)

instance HasDynFlags CmmParse where
    getDynFlags = EC (\_ _ d -> do dflags <- getDynFlags
                                   return (d, dflags))


-- | Takes the variable declarations and imports from the monad
-- and makes an environment, which is looped back into the computation.
-- In this way, we can have embedded declarations that scope over the whole
-- procedure, and imports that scope over the entire module.
-- Discards the local declaration contained within decl'
--
loopDecls :: CmmParse a -> CmmParse a
loopDecls (EC fcode) =
      EC $ \c e globalDecls -> do
        (_, a) <- F.fixC $ \ ~(decls, _) ->
          fcode c (addListToUFM e decls) globalDecls
        return (globalDecls, a)


-- | Get the current environment from the monad.
getEnv :: CmmParse Env
getEnv  = EC $ \_ e s -> return (s, e)

-- | Get the current context name from the monad
getName :: CmmParse String
getName  = EC $ \c _ s -> return (s, c)

-- | Set context name for a sub-parse
withName :: String -> CmmParse a -> CmmParse a
withName c' (EC fcode) = EC $ \_ e s -> fcode c' e s

-- | Add a named thing to the local declaration list.
addDecl :: FastString -> Named -> ExtCode
addDecl name named = EC $ \_ _ s -> return ((name, named) : s, ())


-- | Add a new variable to the list of local declarations.
--      The CmmExpr says where the value is stored.
addVarDecl :: FastString -> CmmExpr -> ExtCode
addVarDecl var expr = addDecl var (VarN expr)

-- | Add a new label to the list of local declarations.
addLabel :: FastString -> BlockId -> ExtCode
addLabel name block_id = addDecl name (LabelN block_id)


-- | Create a fresh local variable of a given type.
newLocal
        :: CmmType              -- ^ data type
        -> FastString           -- ^ name of variable
        -> CmmParse LocalReg    -- ^ register holding the value

newLocal ty name = do
   u <- code newUnique
   let reg = LocalReg u ty
   addVarDecl name (CmmReg (CmmLocal reg))
   return reg


-- | Allocate a fresh label.
newLabel :: FastString -> CmmParse BlockId
newLabel name = do
   u <- code newUnique
   let bid = mkBlockId u
   addLabel name bid
   return bid

-- | Add a local function to the environment.
newFunctionName
        :: FastString   -- ^ name of the function
        -> UnitId       -- ^ package of the current module
        -> ExtCode

newFunctionName name pkg = addDecl name (FunN pkg)


-- | Add an imported foreign label to the list of local declarations.
--      If this is done at the start of the module the declaration will scope
--      over the whole module.
newImport
        :: (FastString, CLabel)
        -> CmmParse ()

newImport (name, cmmLabel)
   = addVarDecl name (CmmLit (CmmLabel cmmLabel))


-- | Lookup the BlockId bound to the label with this name.
--      If one hasn't been bound yet, create a fresh one based on the
--      Unique of the name.
lookupLabel :: FastString -> CmmParse BlockId
lookupLabel name = do
  env <- getEnv
  return $
     case lookupUFM env name of
        Just (LabelN l) -> l
        _other          -> mkBlockId (newTagUnique (getUnique name) 'L')


-- | Lookup the location of a named variable.
--      Unknown names are treated as if they had been 'import'ed from the
--      runtime system.  This saves us a lot of bother in the RTS sources,
--      at the expense of deferring some errors to link time.
lookupName :: FastString -> CmmParse CmmExpr
lookupName name = do
  env <- getEnv
  return $
     case lookupUFM env name of
        Just (VarN e)   -> e
        Just (FunN pkg) -> CmmLit (CmmLabel (mkCmmCodeLabel pkg        name))
        _other          -> CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId name))


-- | Lift an FCode computation into the CmmParse monad
code :: FCode a -> CmmParse a
code fc = EC $ \_ _ s -> do
                r <- fc
                return (s, r)

emit :: CmmAGraph -> CmmParse ()
emit = code . F.emit

emitLabel :: BlockId -> CmmParse ()
emitLabel = code . F.emitLabel

emitAssign :: CmmReg -> CmmExpr -> CmmParse ()
emitAssign l r = code (F.emitAssign l r)

emitStore :: CmmExpr -> CmmExpr -> CmmParse ()
emitStore l r = code (F.emitStore l r)

getCode :: CmmParse a -> CmmParse CmmAGraph
getCode (EC ec) = EC $ \c e s -> do
                    ((s',_), gr) <- F.getCodeR (ec c e s)
                    return (s', gr)

getCodeR :: CmmParse a -> CmmParse (a, CmmAGraph)
getCodeR (EC ec) = EC $ \c e s -> do
                     ((s', r), gr) <- F.getCodeR (ec c e s)
                     return (s', (r,gr))

getCodeScoped :: CmmParse a -> CmmParse (a, CmmAGraphScoped)
getCodeScoped (EC ec) = EC $ \c e s -> do
                          ((s', r), gr) <- F.getCodeScoped (ec c e s)
                          return (s', (r,gr))

emitOutOfLine :: BlockId -> CmmAGraphScoped -> CmmParse ()
emitOutOfLine l g = code (F.emitOutOfLine l g)

withUpdFrameOff :: UpdFrameOffset -> CmmParse () -> CmmParse ()
withUpdFrameOff size inner
  = EC $ \c e s -> F.withUpdFrameOff size $ (unEC inner) c e s

getUpdFrameOff :: CmmParse UpdFrameOffset
getUpdFrameOff = code $ F.getUpdFrameOff
-----------------------------------------------------------------------------
--
-- Code generation for foreign calls.
--
-- (c) The University of Glasgow 2004-2006
--
-----------------------------------------------------------------------------

module GHC.StgToCmm.Foreign (
  cgForeignCall,
  emitPrimCall, emitCCall,
  emitForeignCall,      -- For CmmParse
  emitSaveThreadState,
  saveThreadState,
  emitLoadThreadState,
  loadThreadState,
  emitOpenNursery,
  emitCloseNursery,
 ) where

import GhcPrelude hiding( succ, (<*>) )

import StgSyn
import GHC.StgToCmm.Prof (storeCurCCS, ccsType)
import GHC.StgToCmm.Env
import GHC.StgToCmm.Monad
import GHC.StgToCmm.Utils
import GHC.StgToCmm.Closure
import GHC.StgToCmm.Layout

import BlockId (newBlockId)
import Cmm
import CmmUtils
import MkGraph
import Type
import RepType
import CLabel
import SMRep
import ForeignCall
import DynFlags
import Maybes
import Outputable
import UniqSupply
import BasicTypes

import TyCoRep
import TysPrim
import Util (zipEqual)

import Control.Monad

-----------------------------------------------------------------------------
-- Code generation for Foreign Calls
-----------------------------------------------------------------------------

-- | Emit code for a foreign call, and return the results to the sequel.
-- Precondition: the length of the arguments list is the same as the
-- arity of the foreign function.
cgForeignCall :: ForeignCall            -- the op
              -> Type                   -- type of foreign function
              -> [StgArg]               -- x,y arguments
              -> Type                   -- result type
              -> FCode ReturnKind

cgForeignCall (CCall (CCallSpec target cconv safety)) typ stg_args res_ty
  = do  { dflags <- getDynFlags
        ; let -- in the stdcall calling convention, the symbol needs @size
              -- appended to it, where size is the total number of bytes of
              -- arguments.  We attach this info to the CLabel here, and the
              -- CLabel pretty printer will generate the suffix when the
              -- label is printed.
              call_size args
                | StdCallConv <- cconv = Just (sum (map arg_size args))
                | otherwise            = Nothing

              -- ToDo: this might not be correct for 64-bit API
              arg_size (arg, _) = max (widthInBytes $ typeWidth $ cmmExprType dflags arg)
                                      (wORD_SIZE dflags)
        ; cmm_args <- getFCallArgs stg_args typ
        ; (res_regs, res_hints) <- newUnboxedTupleRegs res_ty
        ; let ((call_args, arg_hints), cmm_target)
                = case target of
                    StaticTarget _ _ _ False ->
                      panic "cgForeignCall: unexpected FFI value import"
                    StaticTarget _ lbl mPkgId True
                      -> let labelSource
                               = case mPkgId of
                                   Nothing    -> ForeignLabelInThisPackage
                                   Just pkgId -> ForeignLabelInPackage pkgId
                             size = call_size cmm_args
                         in ( unzip cmm_args
                            , CmmLit (CmmLabel
                                        (mkForeignLabel lbl size labelSource IsFunction)))

                    DynamicTarget -> case cmm_args of
                                       (fn,_):rest -> (unzip rest, fn)
                                       []          -> panic "cgForeignCall []"
              fc = ForeignConvention cconv arg_hints res_hints CmmMayReturn
              call_target = ForeignTarget cmm_target fc

        -- we want to emit code for the call, and then emitReturn.
        -- However, if the sequel is AssignTo, we shortcut a little
        -- and generate a foreign call that assigns the results
        -- directly.  Otherwise we end up generating a bunch of
        -- useless "r = r" assignments, which are not merely annoying:
        -- they prevent the common block elimination from working correctly
        -- in the case of a safe foreign call.
        -- See Note [safe foreign call convention]
        --
        ; sequel <- getSequel
        ; case sequel of
            AssignTo assign_to_these _ ->
              emitForeignCall safety assign_to_these call_target call_args

            _something_else ->
              do { _ <- emitForeignCall safety res_regs call_target call_args
                 ; emitReturn (map (CmmReg . CmmLocal) res_regs)
                 }
        }

{- Note [safe foreign call convention]

The simple thing to do for a safe foreign call would be the same as an
unsafe one: just

   emitForeignCall ...
   emitReturn ...

but consider what happens in this case

   case foo x y z of
     (# s, r #) -> ...

The sequel is AssignTo [r].  The call to newUnboxedTupleRegs picks [r]
as the result reg, and we generate

  r = foo(x,y,z) returns to L1  -- emitForeignCall
 L1:
  r = r  -- emitReturn
  goto L2
L2:
  ...

Now L1 is a proc point (by definition, it is the continuation of the
safe foreign call).  If L2 does a heap check, then L2 will also be a
proc point.

Furthermore, the stack layout algorithm has to arrange to save r
somewhere between the call and the jump to L1, which is annoying: we
would have to treat r differently from the other live variables, which
have to be saved *before* the call.

So we adopt a special convention for safe foreign calls: the results
are copied out according to the NativeReturn convention by the call,
and the continuation of the call should copyIn the results.  (The
copyOut code is actually inserted when the safe foreign call is
lowered later).  The result regs attached to the safe foreign call are
only used temporarily to hold the results before they are copied out.

We will now generate this:

  r = foo(x,y,z) returns to L1
 L1:
  r = R1  -- copyIn, inserted by mkSafeCall
  goto L2
 L2:
  ... r ...

And when the safe foreign call is lowered later (see Note [lower safe
foreign calls]) we get this:

  suspendThread()
  r = foo(x,y,z)
  resumeThread()
  R1 = r  -- copyOut, inserted by lowerSafeForeignCall
  jump L1
 L1:
  r = R1  -- copyIn, inserted by mkSafeCall
  goto L2
 L2:
  ... r ...

Now consider what happens if L2 does a heap check: the Adams
optimisation kicks in and commons up L1 with the heap-check
continuation, resulting in just one proc point instead of two. Yay!
-}


-- | Emit an unsafe C call with hinted arguments and results.
emitCCall :: [(CmmFormal,ForeignHint)]
          -> CmmExpr
          -> [(CmmActual,ForeignHint)]
          -> FCode ()
emitCCall hinted_results fn hinted_args
  = void $ emitForeignCall PlayRisky results target args
  where
    (args, arg_hints)       = unzip hinted_args
    (results, result_hints) = unzip hinted_results
    target = ForeignTarget fn fc
    fc = ForeignConvention CCallConv arg_hints result_hints CmmMayReturn


-- | Emit a call to a callish machine op.
emitPrimCall :: [CmmFormal] -> CallishMachOp -> [CmmActual] -> FCode ()
emitPrimCall res op args
  = void $ emitForeignCall PlayRisky res (PrimTarget op) args

-- alternative entry point, used by CmmParse
emitForeignCall
        :: Safety
        -> [CmmFormal]          -- where to put the results
        -> ForeignTarget        -- the op
        -> [CmmActual]          -- arguments
        -> FCode ReturnKind
emitForeignCall safety results target args
  | not (playSafe safety) = do
    dflags <- getDynFlags
    let (caller_save, caller_load) = callerSaveVolatileRegs dflags
    emit caller_save
    target' <- load_target_into_temp target
    args'   <- mapM maybe_assign_temp args
    emit $ mkUnsafeCall target' results args'
    emit caller_load
    return AssignedDirectly

  | otherwise = do
    dflags    <- getDynFlags
    updfr_off <- getUpdFrameOff
    target' <- load_target_into_temp target
    args'   <- mapM maybe_assign_temp args
    k <- newBlockId
    let (off, _, copyout) = copyInOflow dflags NativeReturn (Young k) results []
       -- see Note [safe foreign call convention]
    tscope <- getTickScope
    emit $
           (    mkStore (CmmStackSlot (Young k) (widthInBytes (wordWidth dflags)))
                        (CmmLit (CmmBlock k))
            <*> mkLast (CmmForeignCall { tgt      = target'
                                       , res      = results
                                       , args     = args'
                                       , succ     = k
                                       , ret_args = off
                                       , ret_off  = updfr_off
                                       , intrbl   = playInterruptible safety })
            <*> mkLabel k tscope
            <*> copyout
           )
    return (ReturnedTo k off)

-- | Lift the target expression of a foreign call into a temporary.
load_target_into_temp :: ForeignTarget -> FCode ForeignTarget
load_target_into_temp (ForeignTarget expr conv) = do
  tmp <- maybe_assign_temp expr
  return (ForeignTarget tmp conv)
load_target_into_temp other_target@(PrimTarget _) =
  return other_target

-- What we want to do here is create a new temporary for the foreign
-- call argument if it is not safe to use the expression directly,
-- because the expression mentions caller-saves GlobalRegs (see
-- Note [Register Parameter Passing]).
--
-- However, we can't pattern-match on the expression here, because
-- this is used in a loop by CmmParse, and testing the expression
-- results in a black hole.  So we always create a temporary, and rely
-- on CmmSink to clean it up later.  (Yuck, ToDo).  The generated code
-- ends up being the same, at least for the RTS .cmm code.
--
maybe_assign_temp :: CmmExpr -> FCode CmmExpr
maybe_assign_temp e = do
  dflags <- getDynFlags
  reg <- newTemp (cmmExprType dflags e)
  emitAssign (CmmLocal reg) e
  return (CmmReg (CmmLocal reg))

-- -----------------------------------------------------------------------------
-- Save/restore the thread state in the TSO

-- This stuff can't be done in suspendThread/resumeThread, because it
-- refers to global registers which aren't available in the C world.
-- | Emit code to save the current thread state.
emitSaveThreadState :: FCode ()
emitSaveThreadState = do
  dflags <- getDynFlags
  code <- saveThreadState dflags
  emit code

-- | Produce code to save the current thread state to @CurrentTSO@
saveThreadState :: MonadUnique m => DynFlags -> m CmmAGraph
saveThreadState dflags = do
  tso <- newTemp (gcWord dflags)
  close_nursery <- closeNursery dflags tso
  pure $ catAGraphs [
    -- tso = CurrentTSO;
    mkAssign (CmmLocal tso) currentTSOExpr,
    -- tso->stackobj->sp = Sp;
    mkStore (cmmOffset dflags
                       (CmmLoad (cmmOffset dflags
                                           (CmmReg (CmmLocal tso))
                                           (tso_stackobj dflags))
                                (bWord dflags))
                       (stack_SP dflags))
            spExpr,
    close_nursery,
    -- and save the current cost centre stack in the TSO when profiling:
    if gopt Opt_SccProfilingOn dflags then
        mkStore (cmmOffset dflags (CmmReg (CmmLocal tso)) (tso_CCCS dflags)) cccsExpr
      else mkNop
    ]

-- | Emit code to close the nursery, materialising @CurrentTSO@ into a
-- fresh temporary first.
emitCloseNursery :: FCode ()
emitCloseNursery = do
  dflags <- getDynFlags
  tso <- newTemp (bWord dflags)
  code <- closeNursery dflags tso
  emit $ mkAssign (CmmLocal tso) currentTSOExpr <*> code

{- |
@closeNursery dflags tso@ produces code to close the nursery.
A local register holding the value of @CurrentTSO@ is expected for
efficiency.

Closing the nursery corresponds to the following code:

@
  tso = CurrentTSO;
  cn = CurrentNursery;

  // Update the allocation limit for the current thread.  We don't
  // check to see whether it has overflowed at this point, that check is
  // made when we run out of space in the current heap block (stg_gc_noregs)
  // and in the scheduler when context switching (schedulePostRunThread).
  tso->alloc_limit -= Hp + WDS(1) - cn->start;

  // Set cn->free to the next unoccupied word in the block
  cn->free = Hp + WDS(1);
@
-}
closeNursery :: MonadUnique m => DynFlags -> LocalReg -> m CmmAGraph
closeNursery df tso = do
  let tsoreg = CmmLocal tso
  cnreg      <- CmmLocal <$> newTemp (bWord df)
  pure $ catAGraphs [
    mkAssign cnreg currentNurseryExpr,

    -- CurrentNursery->free = Hp+1;
    mkStore (nursery_bdescr_free df cnreg) (cmmOffsetW df hpExpr 1),

    let alloc =
           CmmMachOp (mo_wordSub df)
              [ cmmOffsetW df hpExpr 1
              , CmmLoad (nursery_bdescr_start df cnreg) (bWord df)
              ]

        alloc_limit = cmmOffset df (CmmReg tsoreg) (tso_alloc_limit df)
    in

    -- tso->alloc_limit -= alloc
    -- NB: the generated op really is a subtraction (MO_Sub); the
    -- occupied portion of the nursery is subtracted here and added
    -- back in openNursery.
    mkStore alloc_limit (CmmMachOp (MO_Sub W64)
                           [ CmmLoad alloc_limit b64
                           , CmmMachOp (mo_WordTo64 df) [alloc] ])
   ]

-- | Emit code to load the thread state, materialising @CurrentTSO@.
emitLoadThreadState :: FCode ()
emitLoadThreadState = do
  dflags <- getDynFlags
  code <- loadThreadState dflags
  emit code

-- | Produce code to load the current thread state from @CurrentTSO@
loadThreadState :: MonadUnique m => DynFlags -> m CmmAGraph
loadThreadState dflags = do
  tso <- newTemp (gcWord dflags)
  stack <- newTemp (gcWord dflags)
  open_nursery <- openNursery dflags tso
  pure $ catAGraphs [
    -- tso = CurrentTSO;
    mkAssign (CmmLocal tso) currentTSOExpr,
    -- stack = tso->stackobj;
    mkAssign (CmmLocal stack) (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal tso)) (tso_stackobj dflags)) (bWord dflags)),
    -- Sp = stack->sp;
    mkAssign spReg (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal stack)) (stack_SP dflags)) (bWord dflags)),
    -- SpLim = stack->stack + RESERVED_STACK_WORDS;
    mkAssign spLimReg (cmmOffsetW dflags (cmmOffset dflags (CmmReg (CmmLocal stack)) (stack_STACK dflags))
                                (rESERVED_STACK_WORDS dflags)),
    -- HpAlloc = 0;
    --   HpAlloc is assumed to be set to non-zero only by a failed
    --   a heap check, see HeapStackCheck.cmm:GC_GENERIC
    mkAssign hpAllocReg (zeroExpr dflags),
    open_nursery,
    -- and load the current cost centre stack from the TSO when profiling:
    if gopt Opt_SccProfilingOn dflags
      then storeCurCCS
             (CmmLoad (cmmOffset dflags (CmmReg (CmmLocal tso))
                                 (tso_CCCS dflags)) (ccsType dflags))
      else mkNop
    ]


-- | Emit code to open the nursery, materialising @CurrentTSO@.
emitOpenNursery :: FCode ()
emitOpenNursery = do
  dflags <- getDynFlags
  tso <- newTemp (bWord dflags)
  code <- openNursery dflags tso
  emit $ mkAssign (CmmLocal tso) currentTSOExpr <*> code

{- |
@openNursery dflags tso@ produces code to open the nursery. A local register
holding the value of @CurrentTSO@ is expected for efficiency.

Opening the nursery corresponds to the following code:

@
   tso = CurrentTSO;
   cn = CurrentNursery;
   bdfree = CurrentNursery->free;
   bdstart = CurrentNursery->start;

   // We *add* the currently occupied portion of the nursery block to
   // the allocation limit, because we will subtract it again in
   // closeNursery.
   tso->alloc_limit += bdfree - bdstart;

   // Set Hp to the last occupied word of the heap block.  Why not the
   // next unoccupied word?  Doing it this way means that we get to use
   // an offset of zero more often, which might lead to slightly smaller
   // code on some architectures.
   Hp = bdfree - WDS(1);

   // Set HpLim to the end of the current nursery block (note that this block
   // might be a block group, consisting of several adjacent blocks.
   HpLim = bdstart + CurrentNursery->blocks*BLOCK_SIZE_W - 1;
@
-}
openNursery :: MonadUnique m => DynFlags -> LocalReg -> m CmmAGraph
openNursery df tso = do
  let tsoreg = CmmLocal tso
  cnreg      <- CmmLocal <$> newTemp (bWord df)
  bdfreereg  <- CmmLocal <$> newTemp (bWord df)
  bdstartreg <- CmmLocal <$> newTemp (bWord df)

  -- These assignments are carefully ordered to reduce register
  -- pressure and generate not completely awful code on x86.  To see
  -- what code we generate, look at the assembly for
  -- stg_returnToStackTop in rts/StgStartup.cmm.
  pure $ catAGraphs [
     mkAssign cnreg currentNurseryExpr,
     mkAssign bdfreereg (CmmLoad (nursery_bdescr_free df cnreg) (bWord df)),

     -- Hp = CurrentNursery->free - 1;
     mkAssign hpReg (cmmOffsetW df (CmmReg bdfreereg) (-1)),

     mkAssign bdstartreg (CmmLoad (nursery_bdescr_start df cnreg) (bWord df)),

     -- HpLim = CurrentNursery->start +
     --          CurrentNursery->blocks*BLOCK_SIZE_W - 1;
     mkAssign hpLimReg
         (cmmOffsetExpr df
             (CmmReg bdstartreg)
             (cmmOffset df
               (CmmMachOp (mo_wordMul df) [
                 CmmMachOp (MO_SS_Conv W32 (wordWidth df))
                   [CmmLoad (nursery_bdescr_blocks df cnreg) b32],
                 mkIntExpr df (bLOCK_SIZE df)
                ])
               (-1)
             )
         ),

     -- alloc = bd->free - bd->start
     let alloc =
           CmmMachOp (mo_wordSub df) [CmmReg bdfreereg, CmmReg bdstartreg]

         alloc_limit = cmmOffset df (CmmReg tsoreg) (tso_alloc_limit df)
     in

     -- tso->alloc_limit += alloc
     mkStore alloc_limit (CmmMachOp (MO_Add W64)
                            [ CmmLoad alloc_limit b64
                            , CmmMachOp (mo_WordTo64 df) [alloc] ])

   ]

-- Accessors for the fields of the nursery's block descriptor.
nursery_bdescr_free, nursery_bdescr_start, nursery_bdescr_blocks
  :: DynFlags -> CmmReg -> CmmExpr
nursery_bdescr_free dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_free dflags)
nursery_bdescr_start dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_start dflags)
nursery_bdescr_blocks dflags cn =
  cmmOffset dflags (CmmReg cn) (oFFSET_bdescr_blocks dflags)

-- Byte offsets of TSO / stack fields, adjusted for the closure header.
tso_stackobj, tso_CCCS, tso_alloc_limit, stack_STACK, stack_SP :: DynFlags -> ByteOff
tso_stackobj dflags    = closureField dflags (oFFSET_StgTSO_stackobj dflags)
tso_alloc_limit dflags = closureField dflags (oFFSET_StgTSO_alloc_limit dflags)
tso_CCCS dflags        = closureField dflags (oFFSET_StgTSO_cccs dflags)
stack_STACK dflags     = closureField dflags (oFFSET_StgStack_stack dflags)
stack_SP dflags        = closureField dflags (oFFSET_StgStack_sp dflags)


-- | Add the fixed closure-header size to a field offset.
closureField :: DynFlags -> ByteOff -> ByteOff
closureField dflags off = off + fixedHdrSize dflags

-- Note [Unlifted boxed arguments to foreign calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- For certain types passed to foreign calls, we adjust the actual
-- value passed to the call.  For ByteArray#, Array#, SmallArray#,
-- and ArrayArray#, we pass the address of the array's payload, not
-- the address of the heap object.  For example, consider
--   foreign import "c_foo" foo :: ByteArray# -> Int# -> IO ()
-- At a Haskell call like `foo x y`, we'll generate a C call that
-- is more like
--   c_foo( x+8, y )
-- where the "+8" takes the heap pointer (x :: ByteArray#) and moves
-- it past the header words of the ByteArray object to point directly
-- to the data inside the ByteArray#.  (The exact offset depends
-- on the target architecture and on profiling)  By contrast, (y :: Int#)
-- requires no such adjustment.
--
-- This adjustment is performed by 'add_shim'.  The size of the
-- adjustment depends on the type of heap object.  But
-- how can we determine that type?  There are two available options.
-- We could use the types of the actual values that the foreign call
-- has been applied to, or we could use the types present in the
-- foreign function's type.  Prior to GHC 8.10, we used the former
-- strategy since it's a little more simple.  However, in issue #16650
-- and more compellingly in the comments of
-- https://gitlab.haskell.org/ghc/ghc/merge_requests/939, it was
-- demonstrated that this leads to bad behavior in the presence
-- of unsafeCoerce#.  Returning to the above example, suppose the
-- Haskell call looked like
--   foo (unsafeCoerce# p)
-- where the types of expressions comprising the arguments are
--   p :: (Any :: TYPE 'UnliftedRep)
--   i :: Int#
-- so that the unsafe-coerce is between Any and ByteArray#.
-- These two types have the same kind (they are both represented by
-- a heap pointer) so no GC errors will occur if we do this unsafe coerce.
+-- By the time this gets to the code generator the cast has been +-- discarded so we have +-- foo p y +-- But we *must* adjust the pointer to p by a ByteArray# shim, +-- *not* by an Any shim (the Any shim involves no offset at all). +-- +-- To avoid this bad behavior, we adopt the second strategy: use +-- the types present in the foreign function's type. +-- In collectStgFArgTypes, we convert the foreign function's +-- type to a list of StgFArgType. Then, in add_shim, we interpret +-- these as numeric offsets. + +getFCallArgs :: + [StgArg] + -> Type -- the type of the foreign function + -> FCode [(CmmExpr, ForeignHint)] +-- (a) Drop void args +-- (b) Add foreign-call shim code +-- It's (b) that makes this differ from getNonVoidArgAmodes +-- Precondition: args and typs have the same length +-- See Note [Unlifted boxed arguments to foreign calls] +getFCallArgs args typ + = do { mb_cmms <- mapM get (zipEqual "getFCallArgs" args (collectStgFArgTypes typ)) + ; return (catMaybes mb_cmms) } + where + get (arg,typ) + | null arg_reps + = return Nothing + | otherwise + = do { cmm <- getArgAmode (NonVoid arg) + ; dflags <- getDynFlags + ; return (Just (add_shim dflags typ cmm, hint)) } + where + arg_ty = stgArgType arg + arg_reps = typePrimRep arg_ty + hint = typeForeignHint arg_ty + +-- The minimum amount of information needed to determine +-- the offset to apply to an argument to a foreign call. 
+-- See Note [Unlifted boxed arguments to foreign calls] +data StgFArgType + = StgPlainType + | StgArrayType + | StgSmallArrayType + | StgByteArrayType + +-- See Note [Unlifted boxed arguments to foreign calls] +add_shim :: DynFlags -> StgFArgType -> CmmExpr -> CmmExpr +add_shim dflags ty expr = case ty of + StgPlainType -> expr + StgArrayType -> cmmOffsetB dflags expr (arrPtrsHdrSize dflags) + StgSmallArrayType -> cmmOffsetB dflags expr (smallArrPtrsHdrSize dflags) + StgByteArrayType -> cmmOffsetB dflags expr (arrWordsHdrSize dflags) + +-- From a function, extract information needed to determine +-- the offset of each argument when used as a C FFI argument. +-- See Note [Unlifted boxed arguments to foreign calls] +collectStgFArgTypes :: Type -> [StgFArgType] +collectStgFArgTypes = go [] + where + -- Skip foralls + go bs (ForAllTy _ res) = go bs res + go bs (AppTy{}) = reverse bs + go bs (TyConApp{}) = reverse bs + go bs (LitTy{}) = reverse bs + go bs (TyVarTy{}) = reverse bs + go _ (CastTy{}) = panic "myCollectTypeArgs: CastTy" + go _ (CoercionTy{}) = panic "myCollectTypeArgs: CoercionTy" + go bs (FunTy {ft_arg = arg, ft_res=res}) = + go (typeToStgFArgType arg:bs) res + +-- Choose the offset based on the type. For anything other +-- than an unlifted boxed type, there is no offset. +-- See Note [Unlifted boxed arguments to foreign calls] +typeToStgFArgType :: Type -> StgFArgType +typeToStgFArgType typ + | tycon == arrayPrimTyCon = StgArrayType + | tycon == mutableArrayPrimTyCon = StgArrayType + | tycon == arrayArrayPrimTyCon = StgArrayType + | tycon == mutableArrayArrayPrimTyCon = StgArrayType + | tycon == smallArrayPrimTyCon = StgSmallArrayType + | tycon == smallMutableArrayPrimTyCon = StgSmallArrayType + | tycon == byteArrayPrimTyCon = StgByteArrayType + | tycon == mutableByteArrayPrimTyCon = StgByteArrayType + | otherwise = StgPlainType + where + -- Should be a tycon app, since this is a foreign call. 
We look + -- through newtypes so the offset does not change if a user replaces + -- a type in a foreign function signature with a representationally + -- equivalent newtype. + tycon = tyConAppTyCon (unwrapType typ) + diff --git a/compiler/GHC/StgToCmm/Heap.hs b/compiler/GHC/StgToCmm/Heap.hs new file mode 100644 index 0000000000..a1f016c13c --- /dev/null +++ b/compiler/GHC/StgToCmm/Heap.hs @@ -0,0 +1,680 @@ +----------------------------------------------------------------------------- +-- +-- Stg to C--: heap management functions +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Heap ( + getVirtHp, setVirtHp, setRealHp, + getHpRelOffset, + + entryHeapCheck, altHeapCheck, noEscapeHeapCheck, altHeapCheckReturnsTo, + heapStackCheckGen, + entryHeapCheck', + + mkStaticClosureFields, mkStaticClosure, + + allocDynClosure, allocDynClosureCmm, allocHeapClosure, + emitSetDynHdr + ) where + +import GhcPrelude hiding ((<*>)) + +import StgSyn +import CLabel +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Prof (profDynAlloc, dynProfHdr, staticProfHdr) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Env + +import MkGraph + +import Hoopl.Label +import SMRep +import BlockId +import Cmm +import CmmUtils +import CostCentre +import IdInfo( CafInfo(..), mayHaveCafRefs ) +import Id ( Id ) +import Module +import DynFlags +import FastString( mkFastString, fsLit ) +import Panic( sorry ) + +import Control.Monad (when) +import Data.Maybe (isJust) + +----------------------------------------------------------- +-- Initialise dynamic heap objects +----------------------------------------------------------- + +allocDynClosure + :: Maybe Id + -> CmmInfoTable + -> LambdaFormInfo + -> CmmExpr -- Cost Centre to stick in the object + -> CmmExpr -- Cost Centre to blame for this alloc + -- (usually the same; sometimes 
"OVERHEAD") + + -> [(NonVoid StgArg, VirtualHpOffset)] -- Offsets from start of object + -- ie Info ptr has offset zero. + -- No void args in here + -> FCode CmmExpr -- returns Hp+n + +allocDynClosureCmm + :: Maybe Id -> CmmInfoTable -> LambdaFormInfo -> CmmExpr -> CmmExpr + -> [(CmmExpr, ByteOff)] + -> FCode CmmExpr -- returns Hp+n + +-- allocDynClosure allocates the thing in the heap, +-- and modifies the virtual Hp to account for this. +-- The second return value is the graph that sets the value of the +-- returned LocalReg, which should point to the closure after executing +-- the graph. + +-- allocDynClosure returns an (Hp+8) CmmExpr, and hence the result is +-- only valid until Hp is changed. The caller should assign the +-- result to a LocalReg if it is required to remain live. +-- +-- The reason we don't assign it to a LocalReg here is that the caller +-- is often about to call regIdInfo, which immediately assigns the +-- result of allocDynClosure to a new temp in order to add the tag. +-- So by not generating a LocalReg here we avoid a common source of +-- new temporaries and save some compile time. This can be quite +-- significant - see test T4801. + + +allocDynClosure mb_id info_tbl lf_info use_cc _blame_cc args_w_offsets = do + let (args, offsets) = unzip args_w_offsets + cmm_args <- mapM getArgAmode args -- No void args + allocDynClosureCmm mb_id info_tbl lf_info + use_cc _blame_cc (zip cmm_args offsets) + + +allocDynClosureCmm mb_id info_tbl lf_info use_cc _blame_cc amodes_w_offsets = do + -- SAY WHAT WE ARE ABOUT TO DO + let rep = cit_rep info_tbl + tickyDynAlloc mb_id rep lf_info + let info_ptr = CmmLit (CmmLabel (cit_lbl info_tbl)) + allocHeapClosure rep info_ptr use_cc amodes_w_offsets + + +-- | Low-level heap object allocation. 
+allocHeapClosure + :: SMRep -- ^ representation of the object + -> CmmExpr -- ^ info pointer + -> CmmExpr -- ^ cost centre + -> [(CmmExpr,ByteOff)] -- ^ payload + -> FCode CmmExpr -- ^ returns the address of the object +allocHeapClosure rep info_ptr use_cc payload = do + profDynAlloc rep use_cc + + virt_hp <- getVirtHp + + -- Find the offset of the info-ptr word + let info_offset = virt_hp + 1 + -- info_offset is the VirtualHpOffset of the first + -- word of the new object + -- Remember, virtHp points to last allocated word, + -- ie 1 *before* the info-ptr word of new object. + + base <- getHpRelOffset info_offset + emitComment $ mkFastString "allocHeapClosure" + emitSetDynHdr base info_ptr use_cc + + -- Fill in the fields + hpStore base payload + + -- Bump the virtual heap pointer + dflags <- getDynFlags + setVirtHp (virt_hp + heapClosureSizeW dflags rep) + + return base + + +emitSetDynHdr :: CmmExpr -> CmmExpr -> CmmExpr -> FCode () +emitSetDynHdr base info_ptr ccs + = do dflags <- getDynFlags + hpStore base (zip (header dflags) [0, wORD_SIZE dflags ..]) + where + header :: DynFlags -> [CmmExpr] + header dflags = [info_ptr] ++ dynProfHdr dflags ccs + -- ToDo: Parallel stuff + -- No ticky header + +-- Store the item (expr,off) in base[off] +hpStore :: CmmExpr -> [(CmmExpr, ByteOff)] -> FCode () +hpStore base vals = do + dflags <- getDynFlags + sequence_ $ + [ emitStore (cmmOffsetB dflags base off) val | (val,off) <- vals ] + +----------------------------------------------------------- +-- Layout of static closures +----------------------------------------------------------- + +-- Make a static closure, adding on any extra padding needed for CAFs, +-- and adding a static link field if necessary. 
+ +mkStaticClosureFields + :: DynFlags + -> CmmInfoTable + -> CostCentreStack + -> CafInfo + -> [CmmLit] -- Payload + -> [CmmLit] -- The full closure +mkStaticClosureFields dflags info_tbl ccs caf_refs payload + = mkStaticClosure dflags info_lbl ccs payload padding + static_link_field saved_info_field + where + info_lbl = cit_lbl info_tbl + + -- CAFs must have consistent layout, regardless of whether they + -- are actually updatable or not. The layout of a CAF is: + -- + -- 3 saved_info + -- 2 static_link + -- 1 indirectee + -- 0 info ptr + -- + -- the static_link and saved_info fields must always be in the + -- same place. So we use isThunkRep rather than closureUpdReqd + -- here: + + is_caf = isThunkRep (cit_rep info_tbl) + + padding + | is_caf && null payload = [mkIntCLit dflags 0] + | otherwise = [] + + static_link_field + | is_caf || staticClosureNeedsLink (mayHaveCafRefs caf_refs) info_tbl + = [static_link_value] + | otherwise + = [] + + saved_info_field + | is_caf = [mkIntCLit dflags 0] + | otherwise = [] + + -- For a static constructor which has NoCafRefs, we set the + -- static link field to a non-zero value so the garbage + -- collector will ignore it. + static_link_value + | mayHaveCafRefs caf_refs = mkIntCLit dflags 0 + | otherwise = mkIntCLit dflags 3 -- No CAF refs + -- See Note [STATIC_LINK fields] + -- in rts/sm/Storage.h + +mkStaticClosure :: DynFlags -> CLabel -> CostCentreStack -> [CmmLit] + -> [CmmLit] -> [CmmLit] -> [CmmLit] -> [CmmLit] +mkStaticClosure dflags info_lbl ccs payload padding static_link_field saved_info_field + = [CmmLabel info_lbl] + ++ staticProfHdr dflags ccs + ++ payload + ++ padding + ++ static_link_field + ++ saved_info_field + +----------------------------------------------------------- +-- Heap overflow checking +----------------------------------------------------------- + +{- Note [Heap checks] + ~~~~~~~~~~~~~~~~~~ +Heap checks come in various forms. 
We provide the following entry +points to the runtime system, all of which use the native C-- entry +convention. + + * gc() performs garbage collection and returns + nothing to its caller + + * A series of canned entry points like + r = gc_1p( r ) + where r is a pointer. This performs gc, and + then returns its argument r to its caller. + + * A series of canned entry points like + gcfun_2p( f, x, y ) + where f is a function closure of arity 2 + This performs garbage collection, keeping alive the + three argument ptrs, and then tail-calls f(x,y) + +These are used in the following circumstances + +* entryHeapCheck: Function entry + (a) With a canned GC entry sequence + f( f_clo, x:ptr, y:ptr ) { + Hp = Hp+8 + if Hp > HpLim goto L + ... + L: HpAlloc = 8 + jump gcfun_2p( f_clo, x, y ) } + Note the tail call to the garbage collector; + it should do no register shuffling + + (b) No canned sequence + f( f_clo, x:ptr, y:ptr, ...etc... ) { + T: Hp = Hp+8 + if Hp > HpLim goto L + ... + L: HpAlloc = 8 + call gc() -- Needs an info table + goto T } + +* altHeapCheck: Immediately following an eval + Started as + case f x y of r { (p,q) -> rhs } + (a) With a canned sequence for the results of f + (which is the very common case since + all boxed cases return just one pointer + ... + r = f( x, y ) + K: -- K needs an info table + Hp = Hp+8 + if Hp > HpLim goto L + ...code for rhs... + + L: r = gc_1p( r ) + goto K } + + Here, the info table needed by the call + to gc_1p should be the *same* as the + one for the call to f; the C-- optimiser + spots this sharing opportunity) + + (b) No canned sequence for results of f + Note second info table + ... + (r1,r2,r3) = call f( x, y ) + K: + Hp = Hp+8 + if Hp > HpLim goto L + ...code for rhs... + + L: call gc() -- Extra info table here + goto K + +* generalHeapCheck: Anywhere else + e.g. 
entry to thunk + case branch *not* following eval, + or let-no-escape + Exactly the same as the previous case: + + K: -- K needs an info table + Hp = Hp+8 + if Hp > HpLim goto L + ... + + L: call gc() + goto K +-} + +-------------------------------------------------------------- +-- A heap/stack check at a function or thunk entry point. + +entryHeapCheck :: ClosureInfo + -> Maybe LocalReg -- Function (closure environment) + -> Int -- Arity -- not same as len args b/c of voids + -> [LocalReg] -- Non-void args (empty for thunk) + -> FCode () + -> FCode () + +entryHeapCheck cl_info nodeSet arity args code + = entryHeapCheck' is_fastf node arity args code + where + node = case nodeSet of + Just r -> CmmReg (CmmLocal r) + Nothing -> CmmLit (CmmLabel $ staticClosureLabel cl_info) + + is_fastf = case closureFunInfo cl_info of + Just (_, ArgGen _) -> False + _otherwise -> True + +-- | lower-level version for CmmParse +entryHeapCheck' :: Bool -- is a known function pattern + -> CmmExpr -- expression for the closure pointer + -> Int -- Arity -- not same as len args b/c of voids + -> [LocalReg] -- Non-void args (empty for thunk) + -> FCode () + -> FCode () +entryHeapCheck' is_fastf node arity args code + = do dflags <- getDynFlags + let is_thunk = arity == 0 + + args' = map (CmmReg . 
CmmLocal) args + stg_gc_fun = CmmReg (CmmGlobal GCFun) + stg_gc_enter1 = CmmReg (CmmGlobal GCEnter1) + + {- Thunks: jump stg_gc_enter_1 + + Function (fast): call (NativeNode) stg_gc_fun(fun, args) + + Function (slow): call (slow) stg_gc_fun(fun, args) + -} + gc_call upd + | is_thunk + = mkJump dflags NativeNodeCall stg_gc_enter1 [node] upd + + | is_fastf + = mkJump dflags NativeNodeCall stg_gc_fun (node : args') upd + + | otherwise + = mkJump dflags Slow stg_gc_fun (node : args') upd + + updfr_sz <- getUpdFrameOff + + loop_id <- newBlockId + emitLabel loop_id + heapCheck True True (gc_call updfr_sz <*> mkBranch loop_id) code + +-- ------------------------------------------------------------ +-- A heap/stack check in a case alternative + + +-- If there are multiple alts and we need to GC, but don't have a +-- continuation already (the scrut was simple), then we should +-- pre-generate the continuation. (if there are multiple alts it is +-- always a canned GC point). + +-- altHeapCheck: +-- If we have a return continuation, +-- then if it is a canned GC pattern, +-- then we do mkJumpReturnsTo +-- else we do a normal call to stg_gc_noregs +-- else if it is a canned GC pattern, +-- then generate the continuation and do mkCallReturnsTo +-- else we do a normal call to stg_gc_noregs + +altHeapCheck :: [LocalReg] -> FCode a -> FCode a +altHeapCheck regs code = altOrNoEscapeHeapCheck False regs code + +altOrNoEscapeHeapCheck :: Bool -> [LocalReg] -> FCode a -> FCode a +altOrNoEscapeHeapCheck checkYield regs code = do + dflags <- getDynFlags + case cannedGCEntryPoint dflags regs of + Nothing -> genericGC checkYield code + Just gc -> do + lret <- newBlockId + let (off, _, copyin) = copyInOflow dflags NativeReturn (Young lret) regs [] + lcont <- newBlockId + tscope <- getTickScope + emitOutOfLine lret (copyin <*> mkBranch lcont, tscope) + emitLabel lcont + cannedGCReturnsTo checkYield False gc regs lret off code + +altHeapCheckReturnsTo :: [LocalReg] -> Label -> ByteOff -> 
FCode a -> FCode a +altHeapCheckReturnsTo regs lret off code + = do dflags <- getDynFlags + case cannedGCEntryPoint dflags regs of + Nothing -> genericGC False code + Just gc -> cannedGCReturnsTo False True gc regs lret off code + +-- noEscapeHeapCheck is implemented identically to altHeapCheck (which +-- is more efficient), but cannot be optimized away in the non-allocating +-- case because it may occur in a loop +noEscapeHeapCheck :: [LocalReg] -> FCode a -> FCode a +noEscapeHeapCheck regs code = altOrNoEscapeHeapCheck True regs code + +cannedGCReturnsTo :: Bool -> Bool -> CmmExpr -> [LocalReg] -> Label -> ByteOff + -> FCode a + -> FCode a +cannedGCReturnsTo checkYield cont_on_stack gc regs lret off code + = do dflags <- getDynFlags + updfr_sz <- getUpdFrameOff + heapCheck False checkYield (gc_call dflags gc updfr_sz) code + where + reg_exprs = map (CmmReg . CmmLocal) regs + -- Note [stg_gc arguments] + + -- NB. we use the NativeReturn convention for passing arguments + -- to the canned heap-check routines, because we are in a case + -- alternative and hence the [LocalReg] was passed to us in the + -- NativeReturn convention. 
+ gc_call dflags label sp + | cont_on_stack + = mkJumpReturnsTo dflags label NativeReturn reg_exprs lret off sp + | otherwise + = mkCallReturnsTo dflags label NativeReturn reg_exprs lret off sp [] + +genericGC :: Bool -> FCode a -> FCode a +genericGC checkYield code + = do updfr_sz <- getUpdFrameOff + lretry <- newBlockId + emitLabel lretry + call <- mkCall generic_gc (GC, GC) [] [] updfr_sz [] + heapCheck False checkYield (call <*> mkBranch lretry) code + +cannedGCEntryPoint :: DynFlags -> [LocalReg] -> Maybe CmmExpr +cannedGCEntryPoint dflags regs + = case map localRegType regs of + [] -> Just (mkGcLabel "stg_gc_noregs") + [ty] + | isGcPtrType ty -> Just (mkGcLabel "stg_gc_unpt_r1") + | isFloatType ty -> case width of + W32 -> Just (mkGcLabel "stg_gc_f1") + W64 -> Just (mkGcLabel "stg_gc_d1") + _ -> Nothing + + | width == wordWidth dflags -> Just (mkGcLabel "stg_gc_unbx_r1") + | width == W64 -> Just (mkGcLabel "stg_gc_l1") + | otherwise -> Nothing + where + width = typeWidth ty + [ty1,ty2] + | isGcPtrType ty1 + && isGcPtrType ty2 -> Just (mkGcLabel "stg_gc_pp") + [ty1,ty2,ty3] + | isGcPtrType ty1 + && isGcPtrType ty2 + && isGcPtrType ty3 -> Just (mkGcLabel "stg_gc_ppp") + [ty1,ty2,ty3,ty4] + | isGcPtrType ty1 + && isGcPtrType ty2 + && isGcPtrType ty3 + && isGcPtrType ty4 -> Just (mkGcLabel "stg_gc_pppp") + _otherwise -> Nothing + +-- Note [stg_gc arguments] +-- It might seem that we could avoid passing the arguments to the +-- stg_gc function, because they are already in the right registers. +-- While this is usually the case, it isn't always. Sometimes the +-- code generator has cleverly avoided the eval in a case, e.g. in +-- ffi/should_run/4221.hs we found +-- +-- case a_r1mb of z +-- FunPtr x y -> ... +-- +-- where a_r1mb is bound a top-level constructor, and is known to be +-- evaluated. The codegen just assigns x, y and z, and continues; +-- R1 is never assigned. 
+--
+-- So we'll have to rely on optimisations to eliminate these
+-- assignments where possible.
+
+
+-- | The generic GC procedure; no params, no results
+generic_gc :: CmmExpr
+generic_gc = mkGcLabel "stg_gc_noregs"
+
+-- | Create a CLabel for calling a garbage collector entry point
+mkGcLabel :: String -> CmmExpr
+mkGcLabel s = CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit s)))
+
+-------------------------------
+heapCheck :: Bool -> Bool -> CmmAGraph -> FCode a -> FCode a
+heapCheck checkStack checkYield do_gc code
+  = getHeapUsage $ \ hpHw ->
+    -- Emit heap checks, but be sure to do it lazily so
+    -- that the conditionals on hpHw don't cause a black hole
+    do  { dflags <- getDynFlags
+        ; let mb_alloc_bytes
+                 | hpHw > mBLOCK_SIZE = sorry $ unlines
+                    [" Trying to allocate more than "++show mBLOCK_SIZE++" bytes.",
+                     "",
+                     "This is currently not possible due to a limitation of GHC's code generator.",
+                     "See https://gitlab.haskell.org/ghc/ghc/issues/4505 for details.",
+                     "Suggestion: read data from a file instead of having large static data",
+                     "structures in code."]
+                 | hpHw > 0  = Just (mkIntExpr dflags (hpHw * (wORD_SIZE dflags)))
+                 | otherwise = Nothing
+                 where mBLOCK_SIZE = bLOCKS_PER_MBLOCK dflags * bLOCK_SIZE_W dflags
+              stk_hwm | checkStack = Just (CmmLit CmmHighStackMark)
+                      | otherwise  = Nothing
+        ; codeOnly $ do_checks stk_hwm checkYield mb_alloc_bytes do_gc
+        ; tickyAllocHeap True hpHw
+        ; setRealHp hpHw
+        ; code }
+
+heapStackCheckGen :: Maybe CmmExpr -> Maybe CmmExpr -> FCode ()
+heapStackCheckGen stk_hwm mb_bytes
+  = do updfr_sz <- getUpdFrameOff
+       lretry <- newBlockId
+       emitLabel lretry
+       call <- mkCall generic_gc (GC, GC) [] [] updfr_sz []
+       do_checks stk_hwm False mb_bytes (call <*> mkBranch lretry)
+
+-- Note [Single stack check]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~
+-- When compiling a function we can determine how much stack space it
+-- will use. 
We therefore need to perform only a single stack check at +-- the beginning of a function to see if we have enough stack space. +-- +-- The check boils down to comparing Sp-N with SpLim, where N is the +-- amount of stack space needed (see Note [Stack usage] below). *BUT* +-- at this stage of the pipeline we are not supposed to refer to Sp +-- itself, because the stack is not yet manifest, so we don't quite +-- know where Sp pointing. + +-- So instead of referring directly to Sp - as we used to do in the +-- past - the code generator uses (old + 0) in the stack check. That +-- is the address of the first word of the old area, so if we add N +-- we'll get the address of highest used word. +-- +-- This makes the check robust. For example, while we need to perform +-- only one stack check for each function, we could in theory place +-- more stack checks later in the function. They would be redundant, +-- but not incorrect (in a sense that they should not change program +-- behaviour). We need to make sure however that a stack check +-- inserted after incrementing the stack pointer checks for a +-- respectively smaller stack space. This would not be the case if the +-- code generator produced direct references to Sp. By referencing +-- (old + 0) we make sure that we always check for a correct amount of +-- stack: when converting (old + 0) to Sp the stack layout phase takes +-- into account changes already made to stack pointer. The idea for +-- this change came from observations made while debugging #8275. + +-- Note [Stack usage] +-- ~~~~~~~~~~~~~~~~~~ +-- At the moment we convert from STG to Cmm we don't know N, the +-- number of bytes of stack that the function will use, so we use a +-- special late-bound CmmLit, namely +-- CmmHighStackMark +-- to stand for the number of bytes needed. 
When the stack is made
+-- manifest, the number of bytes needed is calculated, and used to
+-- replace occurrences of CmmHighStackMark
+--
+-- The (Maybe CmmExpr) passed to do_checks is usually
+--     Just (CmmLit CmmHighStackMark)
+-- but can also (in certain hand-written RTS functions)
+--     Just (CmmLit 8)  or some other fixed value
+-- If it is Nothing, we don't generate a stack check at all.
+
+do_checks :: Maybe CmmExpr    -- Should we check the stack?
+                              -- See Note [Stack usage]
+          -> Bool             -- Should we check for preemption?
+          -> Maybe CmmExpr    -- Heap headroom (bytes)
+          -> CmmAGraph        -- What to do on failure
+          -> FCode ()
+do_checks mb_stk_hwm checkYield mb_alloc_lit do_gc = do
+  dflags <- getDynFlags
+  gc_id <- newBlockId
+
+  let
+    Just alloc_lit = mb_alloc_lit
+
+    bump_hp   = cmmOffsetExprB dflags hpExpr alloc_lit
+
+    -- Sp overflow if ((old + 0) - CmmHighStack < SpLim)
+    -- At the beginning of a function old + 0 = Sp
+    -- See Note [Single stack check]
+    sp_oflo sp_hwm =
+         CmmMachOp (mo_wordULt dflags)
+                  [CmmMachOp (MO_Sub (typeWidth (cmmRegType dflags spReg)))
+                             [CmmStackSlot Old 0, sp_hwm],
+                   CmmReg spLimReg]
+
+    -- Hp overflow if (Hp > HpLim)
+    -- (Hp has been incremented by now)
+    -- HpLim points to the LAST WORD of valid allocation space.
+    hp_oflo = CmmMachOp (mo_wordUGt dflags) [hpExpr, hpLimExpr]
+
+    alloc_n = mkAssign hpAllocReg alloc_lit
+
+  case mb_stk_hwm of
+    Nothing -> return ()
+    Just stk_hwm -> tickyStackCheck
+      >> (emit =<< mkCmmIfGoto' (sp_oflo stk_hwm) gc_id (Just False) )
+
+  -- Emit new label that might potentially be a header
+  -- of a self-recursive tail call.
+  -- See Note [Self-recursive loop header].
+ self_loop_info <- getSelfLoop + case self_loop_info of + Just (_, loop_header_id, _) + | checkYield && isJust mb_stk_hwm -> emitLabel loop_header_id + _otherwise -> return () + + if (isJust mb_alloc_lit) + then do + tickyHeapCheck + emitAssign hpReg bump_hp + emit =<< mkCmmIfThen' hp_oflo (alloc_n <*> mkBranch gc_id) (Just False) + else do + when (checkYield && not (gopt Opt_OmitYields dflags)) $ do + -- Yielding if HpLim == 0 + let yielding = CmmMachOp (mo_wordEq dflags) + [CmmReg hpLimReg, + CmmLit (zeroCLit dflags)] + emit =<< mkCmmIfGoto' yielding gc_id (Just False) + + tscope <- getTickScope + emitOutOfLine gc_id + (do_gc, tscope) -- this is expected to jump back somewhere + + -- Test for stack pointer exhaustion, then + -- bump heap pointer, and test for heap exhaustion + -- Note that we don't move the heap pointer unless the + -- stack check succeeds. Otherwise we might end up + -- with slop at the end of the current block, which can + -- confuse the LDV profiler. + +-- Note [Self-recursive loop header] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Self-recursive loop header is required by loopification optimization (See +-- Note [Self-recursive tail calls] in GHC.StgToCmm.Expr). We emit it if: +-- +-- 1. There is information about self-loop in the FCode environment. We don't +-- check the binder (first component of the self_loop_info) because we are +-- certain that if the self-loop info is present then we are compiling the +-- binder body. Reason: the only possible way to get here with the +-- self_loop_info present is from closureCodeBody. +-- +-- 2. checkYield && isJust mb_stk_hwm. checkYield tells us that it is possible +-- to preempt the heap check (see #367 for motivation behind this check). It +-- is True for heap checks placed at the entry to a function and +-- let-no-escape heap checks but false for other heap checks (eg. in case +-- alternatives or created from hand-written high-level Cmm). 
The second +-- check (isJust mb_stk_hwm) is true for heap checks at the entry to a +-- function and some heap checks created in hand-written Cmm. Otherwise it +-- is Nothing. In other words the only situation when both conditions are +-- true is when compiling stack and heap checks at the entry to a +-- function. This is the only situation when we want to emit a self-loop +-- label. diff --git a/compiler/GHC/StgToCmm/Hpc.hs b/compiler/GHC/StgToCmm/Hpc.hs new file mode 100644 index 0000000000..e33d39245c --- /dev/null +++ b/compiler/GHC/StgToCmm/Hpc.hs @@ -0,0 +1,48 @@ +----------------------------------------------------------------------------- +-- +-- Code generation for coverage +-- +-- (c) Galois Connections, Inc. 2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Hpc ( initHpc, mkTickBox ) where + +import GhcPrelude + +import GHC.StgToCmm.Monad + +import MkGraph +import CmmExpr +import CLabel +import Module +import CmmUtils +import GHC.StgToCmm.Utils +import HscTypes +import DynFlags + +import Control.Monad + +mkTickBox :: DynFlags -> Module -> Int -> CmmAGraph +mkTickBox dflags mod n + = mkStore tick_box (CmmMachOp (MO_Add W64) + [ CmmLoad tick_box b64 + , CmmLit (CmmInt 1 W64) + ]) + where + tick_box = cmmIndex dflags W64 + (CmmLit $ CmmLabel $ mkHpcTicksLabel $ mod) + n + +initHpc :: Module -> HpcInfo -> FCode () +-- Emit top-level tables for HPC and return code to initialise +initHpc _ (NoHpcInfo {}) + = return () +initHpc this_mod (HpcInfo tickCount _hashNo) + = do dflags <- getDynFlags + when (gopt Opt_Hpc dflags) $ + do emitDataLits (mkHpcTicksLabel this_mod) + [ (CmmInt 0 W64) + | _ <- take tickCount [0 :: Int ..] 
+ ] + diff --git a/compiler/GHC/StgToCmm/Layout.hs b/compiler/GHC/StgToCmm/Layout.hs new file mode 100644 index 0000000000..f4834376ed --- /dev/null +++ b/compiler/GHC/StgToCmm/Layout.hs @@ -0,0 +1,623 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Building info tables. +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Layout ( + mkArgDescr, + emitCall, emitReturn, adjustHpBackwards, + + emitClosureProcAndInfoTable, + emitClosureAndInfoTable, + + slowCall, directCall, + + FieldOffOrPadding(..), + ClosureHeader(..), + mkVirtHeapOffsets, + mkVirtHeapOffsetsWithPadding, + mkVirtConstrOffsets, + mkVirtConstrSizes, + getHpRelOffset, + + ArgRep(..), toArgRep, argRepSizeW -- re-exported from GHC.StgToCmm.ArgRep + ) where + + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Env +import GHC.StgToCmm.ArgRep -- notably: ( slowCallPattern ) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils + +import MkGraph +import SMRep +import BlockId +import Cmm +import CmmUtils +import CmmInfo +import CLabel +import StgSyn +import Id +import TyCon ( PrimRep(..), primRepSizeB ) +import BasicTypes ( RepArity ) +import DynFlags +import Module + +import Util +import Data.List +import Outputable +import FastString +import Control.Monad + +------------------------------------------------------------------------ +-- Call and return sequences +------------------------------------------------------------------------ + +-- | Return multiple values to the sequel +-- +-- If the sequel is @Return@ +-- +-- > return (x,y) +-- +-- If the sequel is @AssignTo [p,q]@ +-- +-- > p=x; q=y; +-- +emitReturn :: [CmmExpr] -> FCode ReturnKind +emitReturn results + = do { dflags <- getDynFlags + ; sequel <- getSequel + ; 
updfr_off <- getUpdFrameOff + ; case sequel of + Return -> + do { adjustHpBackwards + ; let e = CmmLoad (CmmStackSlot Old updfr_off) (gcWord dflags) + ; emit (mkReturn dflags (entryCode dflags e) results updfr_off) + } + AssignTo regs adjust -> + do { when adjust adjustHpBackwards + ; emitMultiAssign regs results } + ; return AssignedDirectly + } + + +-- | @emitCall conv fun args@ makes a call to the entry-code of @fun@, +-- using the call/return convention @conv@, passing @args@, and +-- returning the results to the current sequel. +-- +emitCall :: (Convention, Convention) -> CmmExpr -> [CmmExpr] -> FCode ReturnKind +emitCall convs fun args + = emitCallWithExtraStack convs fun args noExtraStack + + +-- | @emitCallWithExtraStack conv fun args stack@ makes a call to the +-- entry-code of @fun@, using the call/return convention @conv@, +-- passing @args@, pushing some extra stack frames described by +-- @stack@, and returning the results to the current sequel. +-- +emitCallWithExtraStack + :: (Convention, Convention) -> CmmExpr -> [CmmExpr] + -> [CmmExpr] -> FCode ReturnKind +emitCallWithExtraStack (callConv, retConv) fun args extra_stack + = do { dflags <- getDynFlags + ; adjustHpBackwards + ; sequel <- getSequel + ; updfr_off <- getUpdFrameOff + ; case sequel of + Return -> do + emit $ mkJumpExtra dflags callConv fun args updfr_off extra_stack + return AssignedDirectly + AssignTo res_regs _ -> do + k <- newBlockId + let area = Young k + (off, _, copyin) = copyInOflow dflags retConv area res_regs [] + copyout = mkCallReturnsTo dflags fun callConv args k off updfr_off + extra_stack + tscope <- getTickScope + emit (copyout <*> mkLabel k tscope <*> copyin) + return (ReturnedTo k off) + } + + +adjustHpBackwards :: FCode () +-- This function adjusts the heap pointer just before a tail call or +-- return. 
At a call or return, the virtual heap pointer may be less +-- than the real Hp, because the latter was advanced to deal with +-- the worst-case branch of the code, and we may be in a better-case +-- branch. In that case, move the real Hp *back* and retract some +-- ticky allocation count. +-- +-- It *does not* deal with high-water-mark adjustment. That's done by +-- functions which allocate heap. +adjustHpBackwards + = do { hp_usg <- getHpUsage + ; let rHp = realHp hp_usg + vHp = virtHp hp_usg + adjust_words = vHp -rHp + ; new_hp <- getHpRelOffset vHp + + ; emit (if adjust_words == 0 + then mkNop + else mkAssign hpReg new_hp) -- Generates nothing when vHp==rHp + + ; tickyAllocHeap False adjust_words -- ...ditto + + ; setRealHp vHp + } + + +------------------------------------------------------------------------- +-- Making calls: directCall and slowCall +------------------------------------------------------------------------- + +-- General plan is: +-- - we'll make *one* fast call, either to the function itself +-- (directCall) or to stg_ap_<pat>_fast (slowCall) +-- Any left-over arguments will be pushed on the stack, +-- +-- e.g. 
Sp[old+8] = arg1 +-- Sp[old+16] = arg2 +-- Sp[old+32] = stg_ap_pp_info +-- R2 = arg3 +-- R3 = arg4 +-- call f() return to Nothing updfr_off: 32 + + +directCall :: Convention -> CLabel -> RepArity -> [StgArg] -> FCode ReturnKind +-- (directCall f n args) +-- calls f(arg1, ..., argn), and applies the result to the remaining args +-- The function f has arity n, and there are guaranteed at least n args +-- Both arity and args include void args +directCall conv lbl arity stg_args + = do { argreps <- getArgRepsAmodes stg_args + ; direct_call "directCall" conv lbl arity argreps } + + +slowCall :: CmmExpr -> [StgArg] -> FCode ReturnKind +-- (slowCall fun args) applies fun to args, returning the results to Sequel +slowCall fun stg_args + = do dflags <- getDynFlags + argsreps <- getArgRepsAmodes stg_args + let (rts_fun, arity) = slowCallPattern (map fst argsreps) + + (r, slow_code) <- getCodeR $ do + r <- direct_call "slow_call" NativeNodeCall + (mkRtsApFastLabel rts_fun) arity ((P,Just fun):argsreps) + emitComment $ mkFastString ("slow_call for " ++ + showSDoc dflags (ppr fun) ++ + " with pat " ++ unpackFS rts_fun) + return r + + -- Note [avoid intermediate PAPs] + let n_args = length stg_args + if n_args > arity && optLevel dflags >= 2 + then do + funv <- (CmmReg . CmmLocal) `fmap` assignTemp fun + fun_iptr <- (CmmReg . CmmLocal) `fmap` + assignTemp (closureInfoPtr dflags (cmmUntag dflags funv)) + + -- ToDo: we could do slightly better here by reusing the + -- continuation from the slow call, which we have in r. + -- Also we'd like to push the continuation on the stack + -- before the branch, so that we only get one copy of the + -- code that saves all the live variables across the + -- call, but that might need some improvements to the + -- special case in the stack layout code to handle this + -- (see Note [diamond proc point]). 
+ + fast_code <- getCode $ + emitCall (NativeNodeCall, NativeReturn) + (entryCode dflags fun_iptr) + (nonVArgs ((P,Just funv):argsreps)) + + slow_lbl <- newBlockId + fast_lbl <- newBlockId + is_tagged_lbl <- newBlockId + end_lbl <- newBlockId + + let correct_arity = cmmEqWord dflags (funInfoArity dflags fun_iptr) + (mkIntExpr dflags n_args) + + tscope <- getTickScope + emit (mkCbranch (cmmIsTagged dflags funv) + is_tagged_lbl slow_lbl (Just True) + <*> mkLabel is_tagged_lbl tscope + <*> mkCbranch correct_arity fast_lbl slow_lbl (Just True) + <*> mkLabel fast_lbl tscope + <*> fast_code + <*> mkBranch end_lbl + <*> mkLabel slow_lbl tscope + <*> slow_code + <*> mkLabel end_lbl tscope) + return r + + else do + emit slow_code + return r + + +-- Note [avoid intermediate PAPs] +-- +-- A slow call which needs multiple generic apply patterns will be +-- almost guaranteed to create one or more intermediate PAPs when +-- applied to a function that takes the correct number of arguments. +-- We try to avoid this situation by generating code to test whether +-- we are calling a function with the correct number of arguments +-- first, i.e.: +-- +-- if (TAG(f) != 0) { // f is not a thunk +-- if (f->info.arity == n) { +-- ... make a fast call to f ... +-- } +-- } +-- ... otherwise make the slow call ... +-- +-- We *only* do this when the call requires multiple generic apply +-- functions, which requires pushing extra stack frames and probably +-- results in intermediate PAPs. (I say probably, because it might be +-- that we're over-applying a function, but that seems even less +-- likely). +-- +-- This very rarely applies, but if it does happen in an inner loop it +-- can have a severe impact on performance (#6084). + + +-------------- +direct_call :: String + -> Convention -- e.g. 
NativeNodeCall or NativeDirectCall + -> CLabel -> RepArity + -> [(ArgRep,Maybe CmmExpr)] -> FCode ReturnKind +direct_call caller call_conv lbl arity args + | debugIsOn && args `lengthLessThan` real_arity -- Too few args + = do -- Caller should ensure that there enough args! + pprPanic "direct_call" $ + text caller <+> ppr arity <+> + ppr lbl <+> ppr (length args) <+> + ppr (map snd args) <+> ppr (map fst args) + + | null rest_args -- Precisely the right number of arguments + = emitCall (call_conv, NativeReturn) target (nonVArgs args) + + | otherwise -- Note [over-saturated calls] + = do dflags <- getDynFlags + emitCallWithExtraStack (call_conv, NativeReturn) + target + (nonVArgs fast_args) + (nonVArgs (stack_args dflags)) + where + target = CmmLit (CmmLabel lbl) + (fast_args, rest_args) = splitAt real_arity args + stack_args dflags = slowArgs dflags rest_args + real_arity = case call_conv of + NativeNodeCall -> arity+1 + _ -> arity + + +-- When constructing calls, it is easier to keep the ArgReps and the +-- CmmExprs zipped together. However, a void argument has no +-- representation, so we need to use Maybe CmmExpr (the alternative of +-- using zeroCLit or even undefined would work, but would be ugly). +-- +getArgRepsAmodes :: [StgArg] -> FCode [(ArgRep, Maybe CmmExpr)] +getArgRepsAmodes = mapM getArgRepAmode + where getArgRepAmode arg + | V <- rep = return (V, Nothing) + | otherwise = do expr <- getArgAmode (NonVoid arg) + return (rep, Just expr) + where rep = toArgRep (argPrimRep arg) + +nonVArgs :: [(ArgRep, Maybe CmmExpr)] -> [CmmExpr] +nonVArgs [] = [] +nonVArgs ((_,Nothing) : args) = nonVArgs args +nonVArgs ((_,Just arg) : args) = arg : nonVArgs args + +{- +Note [over-saturated calls] + +The natural thing to do for an over-saturated call would be to call +the function with the correct number of arguments, and then apply the +remaining arguments to the value returned, e.g. 
+ + f a b c d (where f has arity 2) + --> + r = call f(a,b) + call r(c,d) + +but this entails + - saving c and d on the stack + - making a continuation info table + - at the continuation, loading c and d off the stack into regs + - finally, call r + +Note that since there are a fixed number of different r's +(e.g. stg_ap_pp_fast), we can also pre-compile continuations +that correspond to each of them, rather than generating a fresh +one for each over-saturated call. + +Not only does this generate much less code, it is faster too. We will +generate something like: + +Sp[old+16] = c +Sp[old+24] = d +Sp[old+32] = stg_ap_pp_info +call f(a,b) -- usual calling convention + +For the purposes of the CmmCall node, we count this extra stack as +just more arguments that we are passing on the stack (cml_args). +-} + +-- | 'slowArgs' takes a list of function arguments and prepares them for +-- pushing on the stack for "extra" arguments to a function which requires +-- fewer arguments than we currently have. 
+slowArgs :: DynFlags -> [(ArgRep, Maybe CmmExpr)] -> [(ArgRep, Maybe CmmExpr)] +slowArgs _ [] = [] +slowArgs dflags args -- careful: reps contains voids (V), but args does not + | gopt Opt_SccProfilingOn dflags + = save_cccs ++ this_pat ++ slowArgs dflags rest_args + | otherwise = this_pat ++ slowArgs dflags rest_args + where + (arg_pat, n) = slowCallPattern (map fst args) + (call_args, rest_args) = splitAt n args + + stg_ap_pat = mkCmmRetInfoLabel rtsUnitId arg_pat + this_pat = (N, Just (mkLblExpr stg_ap_pat)) : call_args + save_cccs = [(N, Just (mkLblExpr save_cccs_lbl)), (N, Just cccsExpr)] + save_cccs_lbl = mkCmmRetInfoLabel rtsUnitId (fsLit "stg_restore_cccs") + +------------------------------------------------------------------------- +---- Laying out objects on the heap and stack +------------------------------------------------------------------------- + +-- The heap always grows upwards, so hpRel is easy to compute +hpRel :: VirtualHpOffset -- virtual offset of Hp + -> VirtualHpOffset -- virtual offset of The Thing + -> WordOff -- integer word offset +hpRel hp off = off - hp + +getHpRelOffset :: VirtualHpOffset -> FCode CmmExpr +-- See Note [Virtual and real heap pointers] in GHC.StgToCmm.Monad +getHpRelOffset virtual_offset + = do dflags <- getDynFlags + hp_usg <- getHpUsage + return (cmmRegOffW dflags hpReg (hpRel (realHp hp_usg) virtual_offset)) + +data FieldOffOrPadding a + = FieldOff (NonVoid a) -- Something that needs an offset. + ByteOff -- Offset in bytes. + | Padding ByteOff -- Length of padding in bytes. + ByteOff -- Offset in bytes. + +-- | Used to tell the various @mkVirtHeapOffsets@ functions what kind +-- of header the object has. This will be accounted for in the +-- offsets of the fields returned. 
+data ClosureHeader + = NoHeader + | StdHeader + | ThunkHeader + +mkVirtHeapOffsetsWithPadding + :: DynFlags + -> ClosureHeader -- What kind of header to account for + -> [NonVoid (PrimRep, a)] -- Things to make offsets for + -> ( WordOff -- Total number of words allocated + , WordOff -- Number of words allocated for *pointers* + , [FieldOffOrPadding a] -- Either an offset or padding. + ) + +-- Things with their offsets from start of object in order of +-- increasing offset; BUT THIS MAY BE DIFFERENT TO INPUT ORDER +-- First in list gets lowest offset, which is initial offset + 1. +-- +-- mkVirtHeapOffsetsWithPadding always returns boxed things with smaller offsets +-- than the unboxed things + +mkVirtHeapOffsetsWithPadding dflags header things = + ASSERT(not (any (isVoidRep . fst . fromNonVoid) things)) + ( tot_wds + , bytesToWordsRoundUp dflags bytes_of_ptrs + , concat (ptrs_w_offsets ++ non_ptrs_w_offsets) ++ final_pad + ) + where + hdr_words = case header of + NoHeader -> 0 + StdHeader -> fixedHdrSizeW dflags + ThunkHeader -> thunkHdrSize dflags + hdr_bytes = wordsToBytes dflags hdr_words + + (ptrs, non_ptrs) = partition (isGcPtrRep . fst . fromNonVoid) things + + (bytes_of_ptrs, ptrs_w_offsets) = + mapAccumL computeOffset 0 ptrs + (tot_bytes, non_ptrs_w_offsets) = + mapAccumL computeOffset bytes_of_ptrs non_ptrs + + tot_wds = bytesToWordsRoundUp dflags tot_bytes + + final_pad_size = tot_wds * word_size - tot_bytes + final_pad + | final_pad_size > 0 = [(Padding final_pad_size + (hdr_bytes + tot_bytes))] + | otherwise = [] + + word_size = wORD_SIZE dflags + + computeOffset bytes_so_far nv_thing = + (new_bytes_so_far, with_padding field_off) + where + (rep, thing) = fromNonVoid nv_thing + + -- Size of the field in bytes. + !sizeB = primRepSizeB dflags rep + + -- Align the start offset (eg, 2-byte value should be 2-byte aligned). + -- But not more than to a word. 
+ !align = min word_size sizeB + !start = roundUpTo bytes_so_far align + !padding = start - bytes_so_far + + -- Final offset is: + -- size of header + bytes_so_far + padding + !final_offset = hdr_bytes + bytes_so_far + padding + !new_bytes_so_far = start + sizeB + field_off = FieldOff (NonVoid thing) final_offset + + with_padding field_off + | padding == 0 = [field_off] + | otherwise = [ Padding padding (hdr_bytes + bytes_so_far) + , field_off + ] + + +mkVirtHeapOffsets + :: DynFlags + -> ClosureHeader -- What kind of header to account for + -> [NonVoid (PrimRep,a)] -- Things to make offsets for + -> (WordOff, -- _Total_ number of words allocated + WordOff, -- Number of words allocated for *pointers* + [(NonVoid a, ByteOff)]) +mkVirtHeapOffsets dflags header things = + ( tot_wds + , ptr_wds + , [ (field, offset) | (FieldOff field offset) <- things_offsets ] + ) + where + (tot_wds, ptr_wds, things_offsets) = + mkVirtHeapOffsetsWithPadding dflags header things + +-- | Just like mkVirtHeapOffsets, but for constructors +mkVirtConstrOffsets + :: DynFlags -> [NonVoid (PrimRep, a)] + -> (WordOff, WordOff, [(NonVoid a, ByteOff)]) +mkVirtConstrOffsets dflags = mkVirtHeapOffsets dflags StdHeader + +-- | Just like mkVirtConstrOffsets, but used when we don't have the actual +-- arguments. Useful when e.g. generating info tables; we just need to know +-- sizes of pointer and non-pointer fields. 
+mkVirtConstrSizes :: DynFlags -> [NonVoid PrimRep] -> (WordOff, WordOff) +mkVirtConstrSizes dflags field_reps + = (tot_wds, ptr_wds) + where + (tot_wds, ptr_wds, _) = + mkVirtConstrOffsets dflags + (map (\nv_rep -> NonVoid (fromNonVoid nv_rep, ())) field_reps) + +------------------------------------------------------------------------- +-- +-- Making argument descriptors +-- +-- An argument descriptor describes the layout of args on the stack, +-- both for * GC (stack-layout) purposes, and +-- * saving/restoring registers when a heap-check fails +-- +-- Void arguments aren't important, therefore (contrast constructSlowCall) +-- +------------------------------------------------------------------------- + +-- bring in ARG_P, ARG_N, etc. +#include "../includes/rts/storage/FunTypes.h" + +mkArgDescr :: DynFlags -> [Id] -> ArgDescr +mkArgDescr dflags args + = let arg_bits = argBits dflags arg_reps + arg_reps = filter isNonV (map idArgRep args) + -- Getting rid of voids eases matching of standard patterns + in case stdPattern arg_reps of + Just spec_id -> ArgSpec spec_id + Nothing -> ArgGen arg_bits + +argBits :: DynFlags -> [ArgRep] -> [Bool] -- True for non-ptr, False for ptr +argBits _ [] = [] +argBits dflags (P : args) = False : argBits dflags args +argBits dflags (arg : args) = take (argRepSizeW dflags arg) (repeat True) + ++ argBits dflags args + +---------------------- +stdPattern :: [ArgRep] -> Maybe Int +stdPattern reps + = case reps of + [] -> Just ARG_NONE -- just void args, probably + [N] -> Just ARG_N + [P] -> Just ARG_P + [F] -> Just ARG_F + [D] -> Just ARG_D + [L] -> Just ARG_L + [V16] -> Just ARG_V16 + [V32] -> Just ARG_V32 + [V64] -> Just ARG_V64 + + [N,N] -> Just ARG_NN + [N,P] -> Just ARG_NP + [P,N] -> Just ARG_PN + [P,P] -> Just ARG_PP + + [N,N,N] -> Just ARG_NNN + [N,N,P] -> Just ARG_NNP + [N,P,N] -> Just ARG_NPN + [N,P,P] -> Just ARG_NPP + [P,N,N] -> Just ARG_PNN + [P,N,P] -> Just ARG_PNP + [P,P,N] -> Just ARG_PPN + [P,P,P] -> Just ARG_PPP + + 
[P,P,P,P] -> Just ARG_PPPP + [P,P,P,P,P] -> Just ARG_PPPPP + [P,P,P,P,P,P] -> Just ARG_PPPPPP + + _ -> Nothing + +------------------------------------------------------------------------- +-- +-- Generating the info table and code for a closure +-- +------------------------------------------------------------------------- + +-- Here we make an info table of type 'CmmInfo'. The concrete +-- representation as a list of 'CmmAddr' is handled later +-- in the pipeline by 'cmmToRawCmm'. +-- When loading the free variables, a function closure pointer may be tagged, +-- so we must take it into account. + +emitClosureProcAndInfoTable :: Bool -- top-level? + -> Id -- name of the closure + -> LambdaFormInfo + -> CmmInfoTable + -> [NonVoid Id] -- incoming arguments + -> ((Int, LocalReg, [LocalReg]) -> FCode ()) -- function body + -> FCode () +emitClosureProcAndInfoTable top_lvl bndr lf_info info_tbl args body + = do { dflags <- getDynFlags + -- Bind the binder itself, but only if it's not a top-level + -- binding. We need non-top let-bindings to refer to the + -- top-level binding, which this binding would incorrectly shadow. + ; node <- if top_lvl then return $ idToReg dflags (NonVoid bndr) + else bindToReg (NonVoid bndr) lf_info + ; let node_points = nodeMustPointToIt dflags lf_info + ; arg_regs <- bindArgsToRegs args + ; let args' = if node_points then (node : arg_regs) else arg_regs + conv = if nodeMustPointToIt dflags lf_info then NativeNodeCall + else NativeDirectCall + (offset, _, _) = mkCallEntry dflags conv args' [] + ; emitClosureAndInfoTable info_tbl conv args' $ body (offset, node, arg_regs) + } + +-- Data constructors need closures, but not with all the argument handling +-- needed for functions. The shared part goes here. 
+emitClosureAndInfoTable :: + CmmInfoTable -> Convention -> [LocalReg] -> FCode () -> FCode () +emitClosureAndInfoTable info_tbl conv args body + = do { (_, blks) <- getCodeScoped body + ; let entry_lbl = toEntryLbl (cit_lbl info_tbl) + ; emitProcWithConvention conv (Just info_tbl) entry_lbl args blks + } diff --git a/compiler/GHC/StgToCmm/Monad.hs b/compiler/GHC/StgToCmm/Monad.hs new file mode 100644 index 0000000000..716cbdab78 --- /dev/null +++ b/compiler/GHC/StgToCmm/Monad.hs @@ -0,0 +1,861 @@ +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE GADTs #-} + +----------------------------------------------------------------------------- +-- +-- Monad for Stg to C-- code generation +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Monad ( + FCode, -- type + + initC, runC, fixC, + newUnique, + + emitLabel, + + emit, emitDecl, + emitProcWithConvention, emitProcWithStackFrame, + emitOutOfLine, emitAssign, emitStore, + emitComment, emitTick, emitUnwind, + + getCmm, aGraphToGraph, + getCodeR, getCode, getCodeScoped, getHeapUsage, + + mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto, + mkCmmIfThenElse', mkCmmIfThen', mkCmmIfGoto', + + mkCall, mkCmmCall, + + forkClosureBody, forkLneBody, forkAlts, forkAltPair, codeOnly, + + ConTagZ, + + Sequel(..), ReturnKind(..), + withSequel, getSequel, + + setTickyCtrLabel, getTickyCtrLabel, + tickScope, getTickScope, + + withUpdFrameOff, getUpdFrameOff, initUpdFrameOff, + + HeapUsage(..), VirtualHpOffset, initHpUsage, + getHpUsage, setHpUsage, heapHWM, + setVirtHp, getVirtHp, setRealHp, + + getModuleName, + + -- ideally we wouldn't export these, but some other modules access internal state + getState, setState, getSelfLoop, withSelfLoop, getInfoDown, getDynFlags, getThisPackage, + + -- more localised access to monad state + CgIdInfo(..), + getBinds, setBinds, + + -- out of general friendliness, we also export ... 
+ CgInfoDownwards(..), CgState(..) -- non-abstract + ) where + +import GhcPrelude hiding( sequence, succ ) + +import Cmm +import GHC.StgToCmm.Closure +import DynFlags +import Hoopl.Collections +import MkGraph +import BlockId +import CLabel +import SMRep +import Module +import Id +import VarEnv +import OrdList +import BasicTypes( ConTagZ ) +import Unique +import UniqSupply +import FastString +import Outputable +import Util + +import Control.Monad +import Data.List + + + +-------------------------------------------------------- +-- The FCode monad and its types +-- +-- FCode is the monad plumbed through the Stg->Cmm code generator, and +-- the Cmm parser. It contains the following things: +-- +-- - A writer monad, collecting: +-- - code for the current function, in the form of a CmmAGraph. +-- The function "emit" appends more code to this. +-- - the top-level CmmDecls accumulated so far +-- +-- - A state monad with: +-- - the local bindings in scope +-- - the current heap usage +-- - a UniqSupply +-- +-- - A reader monad, for CgInfoDownwards, containing +-- - DynFlags, +-- - the current Module +-- - the update-frame offset +-- - the ticky counter label +-- - the Sequel (the continuation to return to) +-- - the self-recursive tail call information + +-------------------------------------------------------- + +newtype FCode a = FCode { doFCode :: CgInfoDownwards -> CgState -> (a, CgState) } + deriving (Functor) + +instance Applicative FCode where + pure val = FCode (\_info_down state -> (val, state)) + {-# INLINE pure #-} + (<*>) = ap + +instance Monad FCode where + FCode m >>= k = FCode $ + \info_down state -> + case m info_down state of + (m_result, new_state) -> + case k m_result of + FCode kcode -> kcode info_down new_state + {-# INLINE (>>=) #-} + +instance MonadUnique FCode where + getUniqueSupplyM = cgs_uniqs <$> getState + getUniqueM = FCode $ \_ st -> + let (u, us') = takeUniqFromSupply (cgs_uniqs st) + in (u, st { cgs_uniqs = us' }) + +initC :: IO CgState 
+initC = do { uniqs <- mkSplitUniqSupply 'c' + ; return (initCgState uniqs) } + +runC :: DynFlags -> Module -> CgState -> FCode a -> (a,CgState) +runC dflags mod st fcode = doFCode fcode (initCgInfoDown dflags mod) st + +fixC :: (a -> FCode a) -> FCode a +fixC fcode = FCode $ + \info_down state -> let (v, s) = doFCode (fcode v) info_down state + in (v, s) + +-------------------------------------------------------- +-- The code generator environment +-------------------------------------------------------- + +-- This monadery has some information that it only passes +-- *downwards*, as well as some ``state'' which is modified +-- as we go along. + +data CgInfoDownwards -- information only passed *downwards* by the monad + = MkCgInfoDown { + cgd_dflags :: DynFlags, + cgd_mod :: Module, -- Module being compiled + cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame + cgd_ticky :: CLabel, -- Current destination for ticky counts + cgd_sequel :: Sequel, -- What to do at end of basic block + cgd_self_loop :: Maybe SelfLoopInfo,-- Which tail calls can be compiled + -- as local jumps? See Note + -- [Self-recursive tail calls] in + -- GHC.StgToCmm.Expr + cgd_tick_scope:: CmmTickScope -- Tick scope for new blocks & ticks + } + +type CgBindings = IdEnv CgIdInfo + +data CgIdInfo + = CgIdInfo + { cg_id :: Id -- Id that this is the info for + , cg_lf :: LambdaFormInfo + , cg_loc :: CgLoc -- CmmExpr for the *tagged* value + } + +instance Outputable CgIdInfo where + ppr (CgIdInfo { cg_id = id, cg_loc = loc }) + = ppr id <+> text "-->" <+> ppr loc + +-- Sequel tells what to do with the result of this expression +data Sequel + = Return -- Return result(s) to continuation found on the stack. + + | AssignTo + [LocalReg] -- Put result(s) in these regs and fall through + -- NB: no void arguments here + -- + Bool -- Should we adjust the heap pointer back to + -- recover space that's unused on this path? + -- We need to do this only if the expression + -- may allocate (e.g. 
it's a foreign call or + -- allocating primOp) + +instance Outputable Sequel where + ppr Return = text "Return" + ppr (AssignTo regs b) = text "AssignTo" <+> ppr regs <+> ppr b + +-- See Note [sharing continuations] below +data ReturnKind + = AssignedDirectly + | ReturnedTo BlockId ByteOff + +-- Note [sharing continuations] +-- +-- ReturnKind says how the expression being compiled returned its +-- results: either by assigning directly to the registers specified +-- by the Sequel, or by returning to a continuation that does the +-- assignments. The point of this is we might be able to re-use the +-- continuation in a subsequent heap-check. Consider: +-- +-- case f x of z +-- True -> <True code> +-- False -> <False code> +-- +-- Naively we would generate +-- +-- R2 = x -- argument to f +-- Sp[young(L1)] = L1 +-- call f returns to L1 +-- L1: +-- z = R1 +-- if (z & 1) then Ltrue else Lfalse +-- Ltrue: +-- Hp = Hp + 24 +-- if (Hp > HpLim) then L4 else L7 +-- L4: +-- HpAlloc = 24 +-- goto L5 +-- L5: +-- R1 = z +-- Sp[young(L6)] = L6 +-- call stg_gc_unpt_r1 returns to L6 +-- L6: +-- z = R1 +-- goto L1 +-- L7: +-- <True code> +-- Lfalse: +-- <False code> +-- +-- We want the gc call in L4 to return to L1, and discard L6. Note +-- that not only can we share L1 and L6, but the assignment of the +-- return address in L4 is unnecessary because the return address for +-- L1 is already on the stack. We used to catch the sharing of L1 and +-- L6 in the common-block-eliminator, but not the unnecessary return +-- address assignment. +-- +-- Since this case is so common I decided to make it more explicit and +-- robust by programming the sharing directly, rather than relying on +-- the common-block eliminator to catch it. This makes +-- common-block-elimination an optional optimisation, and furthermore +-- generates less code in the first place that we have to subsequently +-- clean up. 
+-- +-- There are some rarer cases of common blocks that we don't catch +-- this way, but that's ok. Common-block-elimination is still available +-- to catch them when optimisation is enabled. Some examples are: +-- +-- - when both the True and False branches do a heap check, we +-- can share the heap-check failure code L4a and maybe L4 +-- +-- - in a case-of-case, there might be multiple continuations that +-- we can common up. +-- +-- It is always safe to use AssignedDirectly. Expressions that jump +-- to the continuation from multiple places (e.g. case expressions) +-- fall back to AssignedDirectly. +-- + + +initCgInfoDown :: DynFlags -> Module -> CgInfoDownwards +initCgInfoDown dflags mod + = MkCgInfoDown { cgd_dflags = dflags + , cgd_mod = mod + , cgd_updfr_off = initUpdFrameOff dflags + , cgd_ticky = mkTopTickyCtrLabel + , cgd_sequel = initSequel + , cgd_self_loop = Nothing + , cgd_tick_scope= GlobalScope } + +initSequel :: Sequel +initSequel = Return + +initUpdFrameOff :: DynFlags -> UpdFrameOffset +initUpdFrameOff dflags = widthInBytes (wordWidth dflags) -- space for the RA + + +-------------------------------------------------------- +-- The code generator state +-------------------------------------------------------- + +data CgState + = MkCgState { + cgs_stmts :: CmmAGraph, -- Current procedure + + cgs_tops :: OrdList CmmDecl, + -- Other procedures and data blocks in this compilation unit + -- Both are ordered only so that we can + -- reduce forward references, when it's easy to do so + + cgs_binds :: CgBindings, + + cgs_hp_usg :: HeapUsage, + + cgs_uniqs :: UniqSupply } + +data HeapUsage -- See Note [Virtual and real heap pointers] + = HeapUsage { + virtHp :: VirtualHpOffset, -- Virtual offset of highest-allocated word + -- Incremented whenever we allocate + realHp :: VirtualHpOffset -- realHp: Virtual offset of real heap ptr + -- Used in instruction addressing modes + } + +type VirtualHpOffset = WordOff + + +{- Note [Virtual and real heap pointers] 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The code generator can allocate one or more objects contiguously, performing +one heap check to cover allocation of all the objects at once. Let's call +this little chunk of heap space an "allocation chunk". The code generator +will emit code to + * Perform a heap-exhaustion check + * Move the heap pointer to the end of the allocation chunk + * Allocate multiple objects within the chunk + +The code generator uses VirtualHpOffsets to address words within a +single allocation chunk; these start at one and increase positively. +The first word of the chunk has VirtualHpOffset=1, the second has +VirtualHpOffset=2, and so on. + + * The field realHp tracks (the VirtualHpOffset) where the real Hp + register is pointing. Typically it'll be pointing to the end of the + allocation chunk. + + * The field virtHp gives the VirtualHpOffset of the highest-allocated + word so far. It starts at zero (meaning no word has been allocated), + and increases whenever an object is allocated. + +The difference between realHp and virtHp gives the offset from the +real Hp register of a particular word in the allocation chunk. This +is what getHpRelOffset does. Since the returned offset is relative +to the real Hp register, it is valid only until you change the real +Hp register. (Changing virtHp doesn't matter.) +-} + + +initCgState :: UniqSupply -> CgState +initCgState uniqs + = MkCgState { cgs_stmts = mkNop + , cgs_tops = nilOL + , cgs_binds = emptyVarEnv + , cgs_hp_usg = initHpUsage + , cgs_uniqs = uniqs } + +stateIncUsage :: CgState -> CgState -> CgState +-- @stateIncUsage e1 e2@ incorporates in e1 +-- the heap high water mark found in e2. 
+stateIncUsage s1 s2@(MkCgState { cgs_hp_usg = hp_usg }) + = s1 { cgs_hp_usg = cgs_hp_usg s1 `maxHpHw` virtHp hp_usg } + `addCodeBlocksFrom` s2 + +addCodeBlocksFrom :: CgState -> CgState -> CgState +-- Add code blocks from the latter to the former +-- (The cgs_stmts will often be empty, but not always; see codeOnly) +s1 `addCodeBlocksFrom` s2 + = s1 { cgs_stmts = cgs_stmts s1 MkGraph.<*> cgs_stmts s2, + cgs_tops = cgs_tops s1 `appOL` cgs_tops s2 } + + +-- The heap high water mark is the larger of virtHp and hwHp. The latter +-- only records the high water marks of forked-off branches, so to find the +-- heap high water mark you have to take the max of virtHp and hwHp. Remember, +-- virtHp never retreats! +-- +-- Note Jan 04: ok, so why do we only look at the virtual Hp?? + +heapHWM :: HeapUsage -> VirtualHpOffset +heapHWM = virtHp + +initHpUsage :: HeapUsage +initHpUsage = HeapUsage { virtHp = 0, realHp = 0 } + +maxHpHw :: HeapUsage -> VirtualHpOffset -> HeapUsage +hp_usg `maxHpHw` hw = hp_usg { virtHp = virtHp hp_usg `max` hw } + +-------------------------------------------------------- +-- Operators for getting and setting the state and "info_down". 
+-------------------------------------------------------- + +getState :: FCode CgState +getState = FCode $ \_info_down state -> (state, state) + +setState :: CgState -> FCode () +setState state = FCode $ \_info_down _ -> ((), state) + +getHpUsage :: FCode HeapUsage +getHpUsage = do + state <- getState + return $ cgs_hp_usg state + +setHpUsage :: HeapUsage -> FCode () +setHpUsage new_hp_usg = do + state <- getState + setState $ state {cgs_hp_usg = new_hp_usg} + +setVirtHp :: VirtualHpOffset -> FCode () +setVirtHp new_virtHp + = do { hp_usage <- getHpUsage + ; setHpUsage (hp_usage {virtHp = new_virtHp}) } + +getVirtHp :: FCode VirtualHpOffset +getVirtHp + = do { hp_usage <- getHpUsage + ; return (virtHp hp_usage) } + +setRealHp :: VirtualHpOffset -> FCode () +setRealHp new_realHp + = do { hp_usage <- getHpUsage + ; setHpUsage (hp_usage {realHp = new_realHp}) } + +getBinds :: FCode CgBindings +getBinds = do + state <- getState + return $ cgs_binds state + +setBinds :: CgBindings -> FCode () +setBinds new_binds = do + state <- getState + setState $ state {cgs_binds = new_binds} + +withState :: FCode a -> CgState -> FCode (a,CgState) +withState (FCode fcode) newstate = FCode $ \info_down state -> + case fcode info_down newstate of + (retval, state2) -> ((retval,state2), state) + +newUniqSupply :: FCode UniqSupply +newUniqSupply = do + state <- getState + let (us1, us2) = splitUniqSupply (cgs_uniqs state) + setState $ state { cgs_uniqs = us1 } + return us2 + +newUnique :: FCode Unique +newUnique = do + state <- getState + let (u,us') = takeUniqFromSupply (cgs_uniqs state) + setState $ state { cgs_uniqs = us' } + return u + +------------------ +getInfoDown :: FCode CgInfoDownwards +getInfoDown = FCode $ \info_down state -> (info_down,state) + +getSelfLoop :: FCode (Maybe SelfLoopInfo) +getSelfLoop = do + info_down <- getInfoDown + return $ cgd_self_loop info_down + +withSelfLoop :: SelfLoopInfo -> FCode a -> FCode a +withSelfLoop self_loop code = do + info_down <- 
getInfoDown + withInfoDown code (info_down {cgd_self_loop = Just self_loop}) + +instance HasDynFlags FCode where + getDynFlags = liftM cgd_dflags getInfoDown + +getThisPackage :: FCode UnitId +getThisPackage = liftM thisPackage getDynFlags + +withInfoDown :: FCode a -> CgInfoDownwards -> FCode a +withInfoDown (FCode fcode) info_down = FCode $ \_ state -> fcode info_down state + +-- ---------------------------------------------------------------------------- +-- Get the current module name + +getModuleName :: FCode Module +getModuleName = do { info <- getInfoDown; return (cgd_mod info) } + +-- ---------------------------------------------------------------------------- +-- Get/set the end-of-block info + +withSequel :: Sequel -> FCode a -> FCode a +withSequel sequel code + = do { info <- getInfoDown + ; withInfoDown code (info {cgd_sequel = sequel, cgd_self_loop = Nothing }) } + +getSequel :: FCode Sequel +getSequel = do { info <- getInfoDown + ; return (cgd_sequel info) } + +-- ---------------------------------------------------------------------------- +-- Get/set the size of the update frame + +-- We keep track of the size of the update frame so that we +-- can set the stack pointer to the proper address on return +-- (or tail call) from the closure. +-- There should be at most one update frame for each closure. +-- Note: I'm including the size of the original return address +-- in the size of the update frame -- hence the default case on `get'. 
+ +withUpdFrameOff :: UpdFrameOffset -> FCode a -> FCode a +withUpdFrameOff size code + = do { info <- getInfoDown + ; withInfoDown code (info {cgd_updfr_off = size }) } + +getUpdFrameOff :: FCode UpdFrameOffset +getUpdFrameOff + = do { info <- getInfoDown + ; return $ cgd_updfr_off info } + +-- ---------------------------------------------------------------------------- +-- Get/set the current ticky counter label + +getTickyCtrLabel :: FCode CLabel +getTickyCtrLabel = do + info <- getInfoDown + return (cgd_ticky info) + +setTickyCtrLabel :: CLabel -> FCode a -> FCode a +setTickyCtrLabel ticky code = do + info <- getInfoDown + withInfoDown code (info {cgd_ticky = ticky}) + +-- ---------------------------------------------------------------------------- +-- Manage tick scopes + +-- | The current tick scope. We will assign this to generated blocks. +getTickScope :: FCode CmmTickScope +getTickScope = do + info <- getInfoDown + return (cgd_tick_scope info) + +-- | Places blocks generated by the given code into a fresh +-- (sub-)scope. This will make sure that Cmm annotations in our scope +-- will apply to the Cmm blocks generated therein - but not the other +-- way around. +tickScope :: FCode a -> FCode a +tickScope code = do + info <- getInfoDown + if debugLevel (cgd_dflags info) == 0 then code else do + u <- newUnique + let scope' = SubScope u (cgd_tick_scope info) + withInfoDown code info{ cgd_tick_scope = scope' } + + +-------------------------------------------------------- +-- Forking +-------------------------------------------------------- + +forkClosureBody :: FCode () -> FCode () +-- forkClosureBody compiles body_code in environment where: +-- - sequel, update stack frame and self loop info are +-- set to fresh values +-- - state is set to a fresh value, except for local bindings +-- that are passed in unchanged. It's up to the enclosed code to +-- re-bind the free variables to a field of the closure. 
+ +forkClosureBody body_code + = do { dflags <- getDynFlags + ; info <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let body_info_down = info { cgd_sequel = initSequel + , cgd_updfr_off = initUpdFrameOff dflags + , cgd_self_loop = Nothing } + fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } + ((),fork_state_out) = doFCode body_code body_info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out } + +forkLneBody :: FCode a -> FCode a +-- 'forkLneBody' takes a body of let-no-escape binding and compiles +-- it in the *current* environment, returning the graph thus constructed. +-- +-- The current environment is passed on completely unchanged to +-- the successor. In particular, any heap usage from the enclosed +-- code is discarded; it should deal with its own heap consumption. +forkLneBody body_code + = do { info_down <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state } + (result, fork_state_out) = doFCode body_code info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out + ; return result } + +codeOnly :: FCode () -> FCode () +-- Emit any code from the inner thing into the outer thing +-- Do not affect anything else in the outer state +-- Used in almost-circular code to prevent false loop dependencies +codeOnly body_code + = do { info_down <- getInfoDown + ; us <- newUniqSupply + ; state <- getState + ; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state + , cgs_hp_usg = cgs_hp_usg state } + ((), fork_state_out) = doFCode body_code info_down fork_state_in + ; setState $ state `addCodeBlocksFrom` fork_state_out } + +forkAlts :: [FCode a] -> FCode [a] +-- (forkAlts' bs d) takes fcodes 'bs' for the branches of a 'case', and +-- an fcode for the default case 'd', and compiles each in the current +-- environment. 
-- Most common use of 'forkAlts'; having this helper function avoids +-- accidental use of fallible pattern-matches in @do@-notation
+-- It initialises the heap usage to zeros, and passes on an unchanged +-- heap usage. +-- +-- It is usually a prelude to performing a GC check, so everything must +-- be in a tidy and consistent state. +-- +-- Note the slightly subtle fixed point behaviour needed here + +getHeapUsage :: (VirtualHpOffset -> FCode a) -> FCode a +getHeapUsage fcode + = do { info_down <- getInfoDown + ; state <- getState + ; let fstate_in = state { cgs_hp_usg = initHpUsage } + (r, fstate_out) = doFCode (fcode hp_hw) info_down fstate_in + hp_hw = heapHWM (cgs_hp_usg fstate_out) -- Loop here! + + ; setState $ fstate_out { cgs_hp_usg = cgs_hp_usg state } + ; return r } + +-- ---------------------------------------------------------------------------- +-- Combinators for emitting code + +emitCgStmt :: CgStmt -> FCode () +emitCgStmt stmt + = do { state <- getState + ; setState $ state { cgs_stmts = cgs_stmts state `snocOL` stmt } + } + +emitLabel :: BlockId -> FCode () +emitLabel id = do tscope <- getTickScope + emitCgStmt (CgLabel id tscope) + +emitComment :: FastString -> FCode () +emitComment s + | debugIsOn = emitCgStmt (CgStmt (CmmComment s)) + | otherwise = return () + +emitTick :: CmmTickish -> FCode () +emitTick = emitCgStmt . CgStmt . 
CmmTick + +emitUnwind :: [(GlobalReg, Maybe CmmExpr)] -> FCode () +emitUnwind regs = do + dflags <- getDynFlags + when (debugLevel dflags > 0) $ do + emitCgStmt $ CgStmt $ CmmUnwind regs + +emitAssign :: CmmReg -> CmmExpr -> FCode () +emitAssign l r = emitCgStmt (CgStmt (CmmAssign l r)) + +emitStore :: CmmExpr -> CmmExpr -> FCode () +emitStore l r = emitCgStmt (CgStmt (CmmStore l r)) + +emit :: CmmAGraph -> FCode () +emit ag + = do { state <- getState + ; setState $ state { cgs_stmts = cgs_stmts state MkGraph.<*> ag } } + +emitDecl :: CmmDecl -> FCode () +emitDecl decl + = do { state <- getState + ; setState $ state { cgs_tops = cgs_tops state `snocOL` decl } } + +emitOutOfLine :: BlockId -> CmmAGraphScoped -> FCode () +emitOutOfLine l (stmts, tscope) = emitCgStmt (CgFork l stmts tscope) + +emitProcWithStackFrame + :: Convention -- entry convention + -> Maybe CmmInfoTable -- info table? + -> CLabel -- label for the proc + -> [CmmFormal] -- stack frame + -> [CmmFormal] -- arguments + -> CmmAGraphScoped -- code + -> Bool -- do stack layout? 
+ -> FCode () + +emitProcWithStackFrame _conv mb_info lbl _stk_args [] blocks False + = do { dflags <- getDynFlags + ; emitProc mb_info lbl [] blocks (widthInBytes (wordWidth dflags)) False + } +emitProcWithStackFrame conv mb_info lbl stk_args args (graph, tscope) True + -- do layout + = do { dflags <- getDynFlags + ; let (offset, live, entry) = mkCallEntry dflags conv args stk_args + graph' = entry MkGraph.<*> graph + ; emitProc mb_info lbl live (graph', tscope) offset True + } +emitProcWithStackFrame _ _ _ _ _ _ _ = panic "emitProcWithStackFrame" + +emitProcWithConvention :: Convention -> Maybe CmmInfoTable -> CLabel + -> [CmmFormal] + -> CmmAGraphScoped + -> FCode () +emitProcWithConvention conv mb_info lbl args blocks + = emitProcWithStackFrame conv mb_info lbl [] args blocks True + +emitProc :: Maybe CmmInfoTable -> CLabel -> [GlobalReg] -> CmmAGraphScoped + -> Int -> Bool -> FCode () +emitProc mb_info lbl live blocks offset do_layout + = do { dflags <- getDynFlags + ; l <- newBlockId + ; let + blks :: CmmGraph + blks = labelAGraph l blocks + + infos | Just info <- mb_info = mapSingleton (g_entry blks) info + | otherwise = mapEmpty + + sinfo = StackInfo { arg_space = offset + , updfr_space = Just (initUpdFrameOff dflags) + , do_layout = do_layout } + + tinfo = TopInfo { info_tbls = infos + , stack_info=sinfo} + + proc_block = CmmProc tinfo lbl live blks + + ; state <- getState + ; setState $ state { cgs_tops = cgs_tops state `snocOL` proc_block } } + +getCmm :: FCode () -> FCode CmmGroup +-- Get all the CmmTops (there should be no stmts) +-- Return a single Cmm which may be split from other Cmms by +-- object splitting (at a later stage) +getCmm code + = do { state1 <- getState + ; ((), state2) <- withState code (state1 { cgs_tops = nilOL }) + ; setState $ state2 { cgs_tops = cgs_tops state1 } + ; return (fromOL (cgs_tops state2)) } + + +mkCmmIfThenElse :: CmmExpr -> CmmAGraph -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThenElse e tbranch fbranch = 
mkCmmIfThenElse' e tbranch fbranch Nothing + +mkCmmIfThenElse' :: CmmExpr -> CmmAGraph -> CmmAGraph + -> Maybe Bool -> FCode CmmAGraph +mkCmmIfThenElse' e tbranch fbranch likely = do + tscp <- getTickScope + endif <- newBlockId + tid <- newBlockId + fid <- newBlockId + + let + (test, then_, else_, likely') = case likely of + Just False | Just e' <- maybeInvertCmmExpr e + -- currently NCG doesn't know about likely + -- annotations. We manually switch then and + -- else branch so the likely false branch + -- becomes a fallthrough. + -> (e', fbranch, tbranch, Just True) + _ -> (e, tbranch, fbranch, likely) + + return $ catAGraphs [ mkCbranch test tid fid likely' + , mkLabel tid tscp, then_, mkBranch endif + , mkLabel fid tscp, else_, mkLabel endif tscp ] + +mkCmmIfGoto :: CmmExpr -> BlockId -> FCode CmmAGraph +mkCmmIfGoto e tid = mkCmmIfGoto' e tid Nothing + +mkCmmIfGoto' :: CmmExpr -> BlockId -> Maybe Bool -> FCode CmmAGraph +mkCmmIfGoto' e tid l = do + endif <- newBlockId + tscp <- getTickScope + return $ catAGraphs [ mkCbranch e tid endif l, mkLabel endif tscp ] + +mkCmmIfThen :: CmmExpr -> CmmAGraph -> FCode CmmAGraph +mkCmmIfThen e tbranch = mkCmmIfThen' e tbranch Nothing + +mkCmmIfThen' :: CmmExpr -> CmmAGraph -> Maybe Bool -> FCode CmmAGraph +mkCmmIfThen' e tbranch l = do + endif <- newBlockId + tid <- newBlockId + tscp <- getTickScope + return $ catAGraphs [ mkCbranch e tid endif l + , mkLabel tid tscp, tbranch, mkLabel endif tscp ] + +mkCall :: CmmExpr -> (Convention, Convention) -> [CmmFormal] -> [CmmExpr] + -> UpdFrameOffset -> [CmmExpr] -> FCode CmmAGraph +mkCall f (callConv, retConv) results actuals updfr_off extra_stack = do + dflags <- getDynFlags + k <- newBlockId + tscp <- getTickScope + let area = Young k + (off, _, copyin) = copyInOflow dflags retConv area results [] + copyout = mkCallReturnsTo dflags f callConv actuals k off updfr_off extra_stack + return $ catAGraphs [copyout, mkLabel k tscp, copyin] + +mkCmmCall :: CmmExpr -> [CmmFormal] -> 
[CmmExpr] -> UpdFrameOffset + -> FCode CmmAGraph +mkCmmCall f results actuals updfr_off + = mkCall f (NativeDirectCall, NativeReturn) results actuals updfr_off [] + + +-- ---------------------------------------------------------------------------- +-- turn CmmAGraph into CmmGraph, for making a new proc. + +aGraphToGraph :: CmmAGraphScoped -> FCode CmmGraph +aGraphToGraph stmts + = do { l <- newBlockId + ; return (labelAGraph l stmts) } diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs new file mode 100644 index 0000000000..dc69a51916 --- /dev/null +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -0,0 +1,2622 @@ +{-# LANGUAGE CPP #-} +-- emitPrimOp is quite large +{-# OPTIONS_GHC -fmax-pmcheck-iterations=4000000 #-} + +---------------------------------------------------------------------------- +-- +-- Stg to C--: primitive operations +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Prim ( + cgOpApp, + cgPrimOp, -- internal(ish), used by cgCase to get code for a + -- comparison without also turning it into a Bool. 
+ shouldInlinePrimOp + ) where + +#include "HsVersions.h" + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Layout +import GHC.StgToCmm.Foreign +import GHC.StgToCmm.Env +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Prof ( costCentreFrom ) + +import DynFlags +import GHC.Platform +import BasicTypes +import BlockId +import MkGraph +import StgSyn +import Cmm +import Type ( Type, tyConAppTyCon ) +import TyCon +import CLabel +import CmmUtils +import PrimOp +import SMRep +import FastString +import Outputable +import Util +import Data.Maybe + +import Data.Bits ((.&.), bit) +import Control.Monad (liftM, when, unless) + +------------------------------------------------------------------------ +-- Primitive operations and foreign calls +------------------------------------------------------------------------ + +{- Note [Foreign call results] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A foreign call always returns an unboxed tuple of results, one +of which is the state token. This seems to happen even for pure +calls. + +Even if we returned a single result for pure calls, it'd still be +right to wrap it in a singleton unboxed tuple, because the result +might be a Haskell closure pointer, we don't want to evaluate it. -} + +---------------------------------- +cgOpApp :: StgOp -- The op + -> [StgArg] -- Arguments + -> Type -- Result type (always an unboxed tuple) + -> FCode ReturnKind + +-- Foreign calls +cgOpApp (StgFCallOp fcall ty) stg_args res_ty + = cgForeignCall fcall ty stg_args res_ty + -- Note [Foreign call results] + +-- tagToEnum# is special: we need to pull the constructor +-- out of the table, and perform an appropriate return. 
-- | Interpret the argument as an unsigned value, assuming the value +-- is given in two's complement form in the given width.
-- inline implementation. This might happen e.g. if there's enough +-- static information, such as statically known arguments, to emit a +-- more efficient implementation inline.
+shouldInlinePrimOp :: DynFlags + -> PrimOp -- ^ The primop + -> [CmmExpr] -- ^ The primop arguments + -> Maybe ([LocalReg] -> FCode ()) + +shouldInlinePrimOp dflags NewByteArrayOp_Char [(CmmLit (CmmInt n w))] + | asUnsigned w n <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> doNewByteArrayOp res (fromInteger n) + +shouldInlinePrimOp dflags NewArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (arrPtrsRep dflags (fromInteger n)) mkMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags) + , (mkIntExpr dflags (nonHdrSizeW (arrPtrsRep dflags (fromInteger n))), + fixedHdrSize dflags + oFFSET_StgMutArrPtrs_size dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopyArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopyMutableArrayArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopyMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> 
emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneArray mkMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags NewSmallArrayOp [(CmmLit (CmmInt n w)), init] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> + doNewArrayOp res (smallArrPtrsRep (fromInteger n)) mkSMAP_DIRTY_infoLabel + [ (mkIntExpr dflags (fromInteger n), + fixedHdrSize dflags + oFFSET_StgSmallMutArrPtrs_ptrs dflags) + ] + (fromInteger n) init + +shouldInlinePrimOp _ CopySmallArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp _ CopySmallMutableArrayOp + [src, src_off, dst, dst_off, (CmmLit (CmmInt n _))] = + Just $ \ [] -> doCopySmallMutableArrayOp src src_off dst dst_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags CloneSmallMutableArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags FreezeSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags 
(asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_FROZEN_CLEAN_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags ThawSmallArrayOp [src, src_off, (CmmLit (CmmInt n w))] + | wordsToBytes dflags (asUnsigned w n) <= fromIntegral (maxInlineAllocSize dflags) = + Just $ \ [res] -> emitCloneSmallArray mkSMAP_DIRTY_infoLabel res src src_off (fromInteger n) + +shouldInlinePrimOp dflags primop args + | primOpOutOfLine primop = Nothing + | otherwise = Just $ \ regs -> emitPrimOp dflags regs primop args + +-- TODO: Several primops, such as 'copyArray#', only have an inline +-- implementation (below) but could possibly have both an inline +-- implementation and an out-of-line implementation, just like +-- 'newArray#'. This would lower the amount of code generated, +-- hopefully without a performance impact (needs to be measured). + +--------------------------------------------------- +cgPrimOp :: [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [StgArg] -- arguments + -> FCode () + +cgPrimOp results op args + = do dflags <- getDynFlags + arg_exprs <- getNonVoidArgAmodes args + emitPrimOp dflags results op arg_exprs + + +------------------------------------------------------------------------ +-- Emitting code for a primop +------------------------------------------------------------------------ + +emitPrimOp :: DynFlags + -> [LocalReg] -- where to put the results + -> PrimOp -- the op + -> [CmmExpr] -- arguments + -> FCode () + +-- First we handle various awkward cases specially. The remaining +-- easy cases are then handled by translateOp, defined below. + +emitPrimOp _ [res] ParOp [arg] + = + -- for now, just implement this in a C function + -- later, we might want to inline it. 
+ emitCCall + [(res,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), (arg,AddrHint)] + +emitPrimOp dflags [res] SparkOp [arg] + = do + -- returns the value of arg in res. We're going to therefore + -- refer to arg twice (once to pass to newSpark(), and once to + -- assign to res), so put it in a temporary. + tmp <- assignTemp arg + tmp2 <- newTemp (bWord dflags) + emitCCall + [(tmp2,NoHint)] + (CmmLit (CmmLabel (mkForeignLabel (fsLit "newSpark") Nothing ForeignLabelInExternalPackage IsFunction))) + [(baseExpr, AddrHint), ((CmmReg (CmmLocal tmp)), AddrHint)] + emitAssign (CmmLocal res) (CmmReg (CmmLocal tmp)) + +emitPrimOp dflags [res] GetCCSOfOp [arg] + = emitAssign (CmmLocal res) val + where + val + | gopt Opt_SccProfilingOn dflags = costCentreFrom dflags (cmmUntag dflags arg) + | otherwise = CmmLit (zeroCLit dflags) + +emitPrimOp _ [res] GetCurrentCCSOp [_dummy_arg] + = emitAssign (CmmLocal res) cccsExpr + +emitPrimOp _ [res] MyThreadIdOp [] + = emitAssign (CmmLocal res) currentTSOExpr + +emitPrimOp dflags [res] ReadMutVarOp [mutv] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) + +emitPrimOp dflags res@[] WriteMutVarOp [mutv,var] + = do -- Without this write barrier, other CPUs may see this pointer before + -- the writes for the closure it points to have occurred. 
+ emitPrimCall res MO_WriteBarrier [] + emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var + emitCCall + [{-no results-}] + (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) + [(baseExpr, AddrHint), (mutv,AddrHint)] + +-- #define sizzeofByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofByteArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +-- #define sizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] SizeofMutableByteArrayOp [arg] + = emitPrimOp dflags [res] SizeofByteArrayOp [arg] + +-- #define getSizzeofMutableByteArrayzh(r,a) \ +-- r = ((StgArrBytes *)(a))->bytes +emitPrimOp dflags [res] GetSizeofMutableByteArrayOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + + +-- #define touchzh(o) /* nothing */ +emitPrimOp _ res@[] TouchOp args@[_arg] + = do emitPrimCall res MO_Touch args + +-- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a) +emitPrimOp dflags [res] ByteArrayContents_Char [arg] + = emitAssign (CmmLocal res) (cmmOffsetB dflags arg (arrWordsHdrSize dflags)) + +-- #define stableNameToIntzh(r,s) (r = ((StgStableName *)s)->sn) +emitPrimOp dflags [res] StableNameToIntOp [arg] + = emitAssign (CmmLocal res) (cmmLoadIndexW dflags arg (fixedHdrSizeW dflags) (bWord dflags)) + +emitPrimOp dflags [res] ReallyUnsafePtrEqualityOp [arg1,arg2] + = emitAssign (CmmLocal res) (CmmMachOp (mo_wordEq dflags) [arg1,arg2]) + +-- #define addrToHValuezh(r,a) r=(P_)a +emitPrimOp _ [res] AddrToAnyOp [arg] + = emitAssign (CmmLocal res) arg + +-- #define hvalueToAddrzh(r, a) r=(W_)a +emitPrimOp _ [res] AnyToAddrOp [arg] + = emitAssign (CmmLocal res) arg + +{- Freezing arrays-of-ptrs requires changing an info table, for the + benefit of the generational collector. It needs to scavenge mutable + objects, even if they are in old space. 
When they become immutable, + they can be removed from this scavenge list. -} + +-- #define unsafeFreezzeArrayzh(r,a) +-- { +-- SET_INFO((StgClosure *)a,&stg_MUT_ARR_PTRS_FROZEN_DIRTY_info); +-- r = a; +-- } +emitPrimOp _ [res] UnsafeFreezeArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeArrayArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] +emitPrimOp _ [res] UnsafeFreezeSmallArrayOp [arg] + = emit $ catAGraphs + [ setInfo arg (CmmLit (CmmLabel mkSMAP_FROZEN_DIRTY_infoLabel)), + mkAssign (CmmLocal res) arg ] + +-- #define unsafeFreezzeByteArrayzh(r,a) r=(a) +emitPrimOp _ [res] UnsafeFreezeByteArrayOp [arg] + = emitAssign (CmmLocal res) arg + +-- Reading/writing pointer arrays + +emitPrimOp _ [res] ReadArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayOp [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayOp [obj,ix,v] = doWritePtrArrayOp obj ix v + +emitPrimOp _ [res] IndexArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] IndexArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableByteArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_ArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [res] ReadArrayArrayOp_MutableArrayArray [obj,ix] = doReadPtrArrayOp res obj ix +emitPrimOp _ [] WriteArrayArrayOp_ByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableByteArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_ArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v +emitPrimOp _ [] WriteArrayArrayOp_MutableArrayArray [obj,ix,v] = doWritePtrArrayOp obj ix v + 
+emitPrimOp _ [res] ReadSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [res] IndexSmallArrayOp [obj,ix] = doReadSmallPtrArrayOp res obj ix +emitPrimOp _ [] WriteSmallArrayOp [obj,ix,v] = doWriteSmallPtrArrayOp obj ix v + +-- Getting the size of pointer arrays + +emitPrimOp dflags [res] SizeofArrayOp [arg] + = emit $ mkAssign (CmmLocal res) (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofMutableArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] +emitPrimOp dflags [res] SizeofMutableArrayArrayOp [arg] + = emitPrimOp dflags [res] SizeofArrayOp [arg] + +emitPrimOp dflags [res] SizeofSmallArrayOp [arg] = + emit $ mkAssign (CmmLocal res) + (cmmLoadIndexW dflags arg + (fixedHdrSizeW dflags + bytesToWordsRoundUp dflags (oFFSET_StgSmallMutArrPtrs_ptrs dflags)) + (bWord dflags)) +emitPrimOp dflags [res] SizeofSmallMutableArrayOp [arg] = + emitPrimOp dflags [res] SizeofSmallArrayOp [arg] + +-- IndexXXXoffAddr + +emitPrimOp dflags res IndexOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res IndexOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res IndexOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexOffAddrOp_Int8 
args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res IndexOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- ReadXXXoffAddr, which are identical, for our purposes, to IndexXXXoffAddr. + +emitPrimOp dflags res ReadOffAddrOp_Char args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_WideChar args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadOffAddrOp_Int args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Word args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Addr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadOffAddrOp_Float args = doIndexOffAddrOp Nothing f32 res args +emitPrimOp _ res ReadOffAddrOp_Double args = doIndexOffAddrOp Nothing f64 res args +emitPrimOp dflags res ReadOffAddrOp_StablePtr args = doIndexOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadOffAddrOp_Int8 args = doIndexOffAddrOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Int16 args = doIndexOffAddrOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Int32 args = doIndexOffAddrOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res 
ReadOffAddrOp_Int64 args = doIndexOffAddrOp Nothing b64 res args +emitPrimOp dflags res ReadOffAddrOp_Word8 args = doIndexOffAddrOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadOffAddrOp_Word16 args = doIndexOffAddrOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadOffAddrOp_Word32 args = doIndexOffAddrOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadOffAddrOp_Word64 args = doIndexOffAddrOp Nothing b64 res args + +-- IndexXXXArray + +emitPrimOp dflags res IndexByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res IndexByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res IndexByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res IndexByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res IndexByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res IndexByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res IndexByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res IndexByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp 
dflags res IndexByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res IndexByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- ReadXXXArray, identical to IndexXXXArray. + +emitPrimOp dflags res ReadByteArrayOp_Char args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_WideChar args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp dflags res ReadByteArrayOp_Int args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Word args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Addr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res ReadByteArrayOp_Float args = doIndexByteArrayOp Nothing f32 res args +emitPrimOp _ res ReadByteArrayOp_Double args = doIndexByteArrayOp Nothing f64 res args +emitPrimOp dflags res ReadByteArrayOp_StablePtr args = doIndexByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res ReadByteArrayOp_Int8 args = doIndexByteArrayOp (Just (mo_s_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Int16 args = doIndexByteArrayOp (Just (mo_s_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Int32 args = doIndexByteArrayOp (Just (mo_s_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Int64 args = doIndexByteArrayOp Nothing b64 res args +emitPrimOp dflags res ReadByteArrayOp_Word8 args = doIndexByteArrayOp (Just (mo_u_8ToWord dflags)) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word16 args = doIndexByteArrayOp (Just (mo_u_16ToWord dflags)) b16 res args +emitPrimOp dflags res ReadByteArrayOp_Word32 args = doIndexByteArrayOp (Just (mo_u_32ToWord dflags)) b32 res args +emitPrimOp _ res ReadByteArrayOp_Word64 args = doIndexByteArrayOp Nothing b64 res args + +-- IndexWord8ArrayAsXXX + +emitPrimOp dflags res IndexByteArrayOp_Word8AsChar args 
= doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res IndexByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res IndexByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- ReadInt8ArrayAsXXX, identical to IndexInt8ArrayAsXXX + +emitPrimOp dflags res ReadByteArrayOp_Word8AsChar args = doIndexByteArrayOpAs (Just (mo_u_8ToWord dflags)) b8 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWideChar args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args 
+emitPrimOp dflags res ReadByteArrayOp_Word8AsWord args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsAddr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsFloat args = doIndexByteArrayOpAs Nothing f32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsDouble args = doIndexByteArrayOpAs Nothing f64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsStablePtr args = doIndexByteArrayOpAs Nothing (bWord dflags) b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt16 args = doIndexByteArrayOpAs (Just (mo_s_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsInt32 args = doIndexByteArrayOpAs (Just (mo_s_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsInt64 args = doIndexByteArrayOpAs Nothing b64 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord16 args = doIndexByteArrayOpAs (Just (mo_u_16ToWord dflags)) b16 b8 res args +emitPrimOp dflags res ReadByteArrayOp_Word8AsWord32 args = doIndexByteArrayOpAs (Just (mo_u_32ToWord dflags)) b32 b8 res args +emitPrimOp _ res ReadByteArrayOp_Word8AsWord64 args = doIndexByteArrayOpAs Nothing b64 b8 res args + +-- WriteXXXoffAddr + +emitPrimOp dflags res WriteOffAddrOp_Char args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_WideChar args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteOffAddrOp_Int args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Word args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Addr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteOffAddrOp_Float args = doWriteOffAddrOp Nothing f32 res args +emitPrimOp _ res WriteOffAddrOp_Double args = doWriteOffAddrOp Nothing f64 res args +emitPrimOp dflags res 
WriteOffAddrOp_StablePtr args = doWriteOffAddrOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteOffAddrOp_Int8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Int16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Int32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Int64 args = doWriteOffAddrOp Nothing b64 res args +emitPrimOp dflags res WriteOffAddrOp_Word8 args = doWriteOffAddrOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteOffAddrOp_Word16 args = doWriteOffAddrOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteOffAddrOp_Word32 args = doWriteOffAddrOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteOffAddrOp_Word64 args = doWriteOffAddrOp Nothing b64 res args + +-- WriteXXXArray + +emitPrimOp dflags res WriteByteArrayOp_Char args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_WideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp dflags res WriteByteArrayOp_Int args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Word args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Addr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp _ res WriteByteArrayOp_Float args = doWriteByteArrayOp Nothing f32 res args +emitPrimOp _ res WriteByteArrayOp_Double args = doWriteByteArrayOp Nothing f64 res args +emitPrimOp dflags res WriteByteArrayOp_StablePtr args = doWriteByteArrayOp Nothing (bWord dflags) res args +emitPrimOp dflags res WriteByteArrayOp_Int8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Int16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res 
WriteByteArrayOp_Int32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Int64 args = doWriteByteArrayOp Nothing b64 res args +emitPrimOp dflags res WriteByteArrayOp_Word8 args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b16 res args +emitPrimOp dflags res WriteByteArrayOp_Word32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b32 res args +emitPrimOp _ res WriteByteArrayOp_Word64 args = doWriteByteArrayOp Nothing b64 res args + +-- WriteInt8ArrayAsXXX + +emitPrimOp dflags res WriteByteArrayOp_Word8AsChar args = doWriteByteArrayOp (Just (mo_WordTo8 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWideChar args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsAddr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsFloat args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsDouble args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsStablePtr args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsInt32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsInt64 args = doWriteByteArrayOp Nothing b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord16 args = doWriteByteArrayOp (Just (mo_WordTo16 dflags)) b8 res args +emitPrimOp dflags res WriteByteArrayOp_Word8AsWord32 args = doWriteByteArrayOp (Just (mo_WordTo32 dflags)) 
b8 res args +emitPrimOp _ res WriteByteArrayOp_Word8AsWord64 args = doWriteByteArrayOp Nothing b8 res args + +-- Copying and setting byte arrays +emitPrimOp _ [] CopyByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyMutableByteArrayOp [src,src_off,dst,dst_off,n] = + doCopyMutableByteArrayOp src src_off dst dst_off n +emitPrimOp _ [] CopyByteArrayToAddrOp [src,src_off,dst,n] = + doCopyByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyMutableByteArrayToAddrOp [src,src_off,dst,n] = + doCopyMutableByteArrayToAddrOp src src_off dst n +emitPrimOp _ [] CopyAddrToByteArrayOp [src,dst,dst_off,n] = + doCopyAddrToByteArrayOp src dst dst_off n +emitPrimOp _ [] SetByteArrayOp [ba,off,len,c] = + doSetByteArrayOp ba off len c + +-- Comparing byte arrays +emitPrimOp _ [res] CompareByteArraysOp [ba1,ba1_off,ba2,ba2_off,n] = + doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n + +emitPrimOp _ [res] BSwap16Op [w] = emitBSwapCall res w W16 +emitPrimOp _ [res] BSwap32Op [w] = emitBSwapCall res w W32 +emitPrimOp _ [res] BSwap64Op [w] = emitBSwapCall res w W64 +emitPrimOp dflags [res] BSwapOp [w] = emitBSwapCall res w (wordWidth dflags) + +emitPrimOp _ [res] BRev8Op [w] = emitBRevCall res w W8 +emitPrimOp _ [res] BRev16Op [w] = emitBRevCall res w W16 +emitPrimOp _ [res] BRev32Op [w] = emitBRevCall res w W32 +emitPrimOp _ [res] BRev64Op [w] = emitBRevCall res w W64 +emitPrimOp dflags [res] BRevOp [w] = emitBRevCall res w (wordWidth dflags) + +-- Population count +emitPrimOp _ [res] PopCnt8Op [w] = emitPopCntCall res w W8 +emitPrimOp _ [res] PopCnt16Op [w] = emitPopCntCall res w W16 +emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32 +emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64 +emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags) + +-- Parallel bit deposit +emitPrimOp _ [res] Pdep8Op [src, mask] = emitPdepCall res src mask W8 +emitPrimOp _ [res] Pdep16Op [src, mask] = 
emitPdepCall res src mask W16 +emitPrimOp _ [res] Pdep32Op [src, mask] = emitPdepCall res src mask W32 +emitPrimOp _ [res] Pdep64Op [src, mask] = emitPdepCall res src mask W64 +emitPrimOp dflags [res] PdepOp [src, mask] = emitPdepCall res src mask (wordWidth dflags) + +-- Parallel bit extract +emitPrimOp _ [res] Pext8Op [src, mask] = emitPextCall res src mask W8 +emitPrimOp _ [res] Pext16Op [src, mask] = emitPextCall res src mask W16 +emitPrimOp _ [res] Pext32Op [src, mask] = emitPextCall res src mask W32 +emitPrimOp _ [res] Pext64Op [src, mask] = emitPextCall res src mask W64 +emitPrimOp dflags [res] PextOp [src, mask] = emitPextCall res src mask (wordWidth dflags) + +-- count leading zeros +emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8 +emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16 +emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32 +emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64 +emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags) + +-- count trailing zeros +emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8 +emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16 +emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32 +emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64 +emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags) + +-- Unsigned int to floating point conversions +emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res] + (MO_UF_Conv W32) [w] +emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res] + (MO_UF_Conv W64) [w] + +-- SIMD primops +emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do + checkVecCompatibility dflags vcat n w + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecPackOp 
vcat n w) es = do + checkVecCompatibility dflags vcat n w + when (es `lengthIsNot` n) $ + panic "emitPrimOp: VecPackOp has wrong number of arguments" + doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros es res + where + zeros :: CmmExpr + zeros = CmmLit $ CmmVec (replicate n zero) + + zero :: CmmLit + zero = case vcat of + IntVec -> CmmInt 0 w + WordVec -> CmmInt 0 w + FloatVec -> CmmFloat 0 w + + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecUnpackOp vcat n w) [arg] = do + checkVecCompatibility dflags vcat n w + when (res `lengthIsNot` n) $ + panic "emitPrimOp: VecUnpackOp has wrong number of results" + doVecUnpackOp (vecElemProjectCast dflags vcat w) ty arg res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags [res] (VecInsertOp vcat n w) [v,e,i] = do + checkVecCompatibility dflags vcat n w + doVecInsertOp (vecElemInjectCast dflags vcat w) ty v e i res + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecReadOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecWriteOffAddrOp vcat n 
w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecVmmType vcat n w + +emitPrimOp dflags res (VecIndexScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexByteArrayOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarByteArrayOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteByteArrayOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecIndexScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecReadScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doIndexOffAddrOpAs Nothing vecty ty res args + where + vecty :: CmmType + vecty = vecVmmType vcat n w + + ty :: CmmType + ty = vecCmmCat vcat w + +emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do + checkVecCompatibility dflags vcat n w + doWriteOffAddrOp Nothing ty res args + where + ty :: CmmType + ty = vecCmmCat vcat w + +-- Prefetch +emitPrimOp _ [] PrefetchByteArrayOp3 args = doPrefetchByteArrayOp 3 args +emitPrimOp _ [] PrefetchMutableByteArrayOp3 args = doPrefetchMutableByteArrayOp 3 args +emitPrimOp _ [] PrefetchAddrOp3 args = doPrefetchAddrOp 3 args +emitPrimOp _ [] PrefetchValueOp3 args = doPrefetchValueOp 3 args + +emitPrimOp _ [] PrefetchByteArrayOp2 args = doPrefetchByteArrayOp 2 args +emitPrimOp _ [] PrefetchMutableByteArrayOp2 
args = doPrefetchMutableByteArrayOp 2 args +emitPrimOp _ [] PrefetchAddrOp2 args = doPrefetchAddrOp 2 args +emitPrimOp _ [] PrefetchValueOp2 args = doPrefetchValueOp 2 args + +emitPrimOp _ [] PrefetchByteArrayOp1 args = doPrefetchByteArrayOp 1 args +emitPrimOp _ [] PrefetchMutableByteArrayOp1 args = doPrefetchMutableByteArrayOp 1 args +emitPrimOp _ [] PrefetchAddrOp1 args = doPrefetchAddrOp 1 args +emitPrimOp _ [] PrefetchValueOp1 args = doPrefetchValueOp 1 args + +emitPrimOp _ [] PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 args +emitPrimOp _ [] PrefetchMutableByteArrayOp0 args = doPrefetchMutableByteArrayOp 0 args +emitPrimOp _ [] PrefetchAddrOp0 args = doPrefetchAddrOp 0 args +emitPrimOp _ [] PrefetchValueOp0 args = doPrefetchValueOp 0 args + +-- Atomic read-modify-write +emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Add mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Sub mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_And mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Nand mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Or mba ix (bWord dflags) n +emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] = + doAtomicRMW res AMO_Xor mba ix (bWord dflags) n +emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] = + doAtomicReadByteArray res mba ix (bWord dflags) +emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] = + doAtomicWriteByteArray mba ix (bWord dflags) val +emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] = + doCasByteArray res mba ix (bWord dflags) old new + +-- The rest just translate straightforwardly +emitPrimOp dflags [res] op [arg] + | nopOp op + = emitAssign (CmmLocal res) arg + + | Just (mop,rep) <- narrowOp op + = 
emitAssign (CmmLocal res) $ + CmmMachOp (mop rep (wordWidth dflags)) [CmmMachOp (mop (wordWidth dflags) rep) [arg]] + +emitPrimOp dflags r@[res] op args + | Just prim <- callishOp op + = do emitPrimCall r prim args + + | Just mop <- translateOp dflags op + = let stmt = mkAssign (CmmLocal res) (CmmMachOp mop args) in + emit stmt + +emitPrimOp dflags results op args + = case callishPrimOpSupported dflags op args of + Left op -> emit $ mkUnsafeCall (PrimTarget op) results args + Right gen -> gen results args + +-- Note [QuotRem optimization] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- `quot` and `rem` with constant divisor can be implemented with fast bit-ops +-- (shift, .&.). +-- +-- Currently we only support optimization (performed in CmmOpt) when the +-- constant is a power of 2. #9041 tracks the implementation of the general +-- optimization. +-- +-- `quotRem` can be optimized in the same way. However as it returns two values, +-- it is implemented as a "callish" primop which is harder to match and +-- to transform later on. For simplicity, the current implementation detects cases +-- that can be optimized (see `quotRemCanBeOptimized`) and converts STG quotRem +-- primop into two CMM quot and rem primops. 
+ +type GenericOp = [CmmFormal] -> [CmmActual] -> FCode () + +callishPrimOpSupported :: DynFlags -> PrimOp -> [CmmExpr] -> Either CallishMachOp GenericOp +callishPrimOpSupported dflags op args + = case op of + IntQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericIntQuotRemOp (wordWidth dflags)) + + Int8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W8) + | otherwise -> Right (genericIntQuotRemOp W8) + + Int16QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_S_QuotRem W16) + | otherwise -> Right (genericIntQuotRemOp W16) + + + WordQuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem (wordWidth dflags)) + | otherwise + -> Right (genericWordQuotRemOp (wordWidth dflags)) + + WordQuotRem2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_U_QuotRem2 (wordWidth dflags)) + | otherwise -> Right (genericWordQuotRem2Op dflags) + + Word8QuotRemOp | ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W8) + | otherwise -> Right (genericWordQuotRemOp W8) + + Word16QuotRemOp| ncg && (x86ish || ppc) + , not quotRemCanBeOptimized + -> Left (MO_U_QuotRem W16) + | otherwise -> Right (genericWordQuotRemOp W16) + + WordAdd2Op | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_Add2 (wordWidth dflags)) + | otherwise -> Right genericWordAdd2Op + + WordAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddWordC (wordWidth dflags)) + | otherwise -> Right genericWordAddCOp + + WordSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubWordC (wordWidth dflags)) + | otherwise -> Right genericWordSubCOp + + IntAddCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_AddIntC (wordWidth dflags)) + | otherwise -> Right genericIntAddCOp + + IntSubCOp | (ncg && (x86ish || ppc)) + || llvm -> Left (MO_SubIntC (wordWidth dflags)) + | otherwise -> Right genericIntSubCOp + + 
WordMul2Op | ncg && (x86ish || ppc) + || llvm -> Left (MO_U_Mul2 (wordWidth dflags)) + | otherwise -> Right genericWordMul2Op + FloatFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F32_Fabs + | otherwise -> Right $ genericFabsOp W32 + DoubleFabsOp | (ncg && x86ish || ppc) + || llvm -> Left MO_F64_Fabs + | otherwise -> Right $ genericFabsOp W64 + + _ -> pprPanic "emitPrimOp: can't translate PrimOp " (ppr op) + where + -- See Note [QuotRem optimization] + quotRemCanBeOptimized = case args of + [_, CmmLit (CmmInt n _) ] -> isJust (exactLog2 n) + _ -> False + + ncg = case hscTarget dflags of + HscAsm -> True + _ -> False + llvm = case hscTarget dflags of + HscLlvm -> True + _ -> False + x86ish = case platformArch (targetPlatform dflags) of + ArchX86 -> True + ArchX86_64 -> True + _ -> False + ppc = case platformArch (targetPlatform dflags) of + ArchPPC -> True + ArchPPC_64 _ -> True + _ -> False + +genericIntQuotRemOp :: Width -> GenericOp +genericIntQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_S_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_S_Rem width) [arg_x, arg_y]) +genericIntQuotRemOp _ _ _ = panic "genericIntQuotRemOp" + +genericWordQuotRemOp :: Width -> GenericOp +genericWordQuotRemOp width [res_q, res_r] [arg_x, arg_y] + = emit $ mkAssign (CmmLocal res_q) + (CmmMachOp (MO_U_Quot width) [arg_x, arg_y]) <*> + mkAssign (CmmLocal res_r) + (CmmMachOp (MO_U_Rem width) [arg_x, arg_y]) +genericWordQuotRemOp _ _ _ = panic "genericWordQuotRemOp" + +genericWordQuotRem2Op :: DynFlags -> GenericOp +genericWordQuotRem2Op dflags [res_q, res_r] [arg_x_high, arg_x_low, arg_y] + = emit =<< f (widthInBits (wordWidth dflags)) zero arg_x_high arg_x_low + where ty = cmmExprType dflags arg_x_high + shl x i = CmmMachOp (MO_Shl (wordWidth dflags)) [x, i] + shr x i = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, i] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + ge x y = CmmMachOp (MO_U_Ge 
(wordWidth dflags)) [x, y] + ne x y = CmmMachOp (MO_Ne (wordWidth dflags)) [x, y] + minus x y = CmmMachOp (MO_Sub (wordWidth dflags)) [x, y] + times x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y] + zero = lit 0 + one = lit 1 + negone = lit (fromIntegral (widthInBits (wordWidth dflags)) - 1) + lit i = CmmLit (CmmInt i (wordWidth dflags)) + + f :: Int -> CmmExpr -> CmmExpr -> CmmExpr -> FCode CmmAGraph + f 0 acc high _ = return (mkAssign (CmmLocal res_q) acc <*> + mkAssign (CmmLocal res_r) high) + f i acc high low = + do roverflowedBit <- newTemp ty + rhigh' <- newTemp ty + rhigh'' <- newTemp ty + rlow' <- newTemp ty + risge <- newTemp ty + racc' <- newTemp ty + let high' = CmmReg (CmmLocal rhigh') + isge = CmmReg (CmmLocal risge) + overflowedBit = CmmReg (CmmLocal roverflowedBit) + let this = catAGraphs + [mkAssign (CmmLocal roverflowedBit) + (shr high negone), + mkAssign (CmmLocal rhigh') + (or (shl high one) (shr low negone)), + mkAssign (CmmLocal rlow') + (shl low one), + mkAssign (CmmLocal risge) + (or (overflowedBit `ne` zero) + (high' `ge` arg_y)), + mkAssign (CmmLocal rhigh'') + (high' `minus` (arg_y `times` isge)), + mkAssign (CmmLocal racc') + (or (shl acc one) isge)] + rest <- f (i - 1) (CmmReg (CmmLocal racc')) + (CmmReg (CmmLocal rhigh'')) + (CmmReg (CmmLocal rlow')) + return (this <*> rest) +genericWordQuotRem2Op _ _ _ = panic "genericWordQuotRem2Op" + +genericWordAdd2Op :: GenericOp +genericWordAdd2Op [res_h, res_l] [arg_x, arg_y] + = do dflags <- getDynFlags + r1 <- newTemp (cmmExprType dflags arg_x) + r2 <- newTemp (cmmExprType dflags arg_x) + let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww] + toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww] + bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm] + add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y] + or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y] + hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags))) + (wordWidth dflags)) + hwm = 
CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags)) + emit $ catAGraphs + [mkAssign (CmmLocal r1) + (add (bottomHalf arg_x) (bottomHalf arg_y)), + mkAssign (CmmLocal r2) + (add (topHalf (CmmReg (CmmLocal r1))) + (add (topHalf arg_x) (topHalf arg_y))), + mkAssign (CmmLocal res_h) + (topHalf (CmmReg (CmmLocal r2))), + mkAssign (CmmLocal res_l) + (or (toTopHalf (CmmReg (CmmLocal r2))) + (bottomHalf (CmmReg (CmmLocal r1))))] +genericWordAdd2Op _ _ = panic "genericWordAdd2Op" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a + b@: +-- +-- @ +-- c = a&b | (a|b)&~r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordAddCOp :: GenericOp +genericWordAddCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [aa,bb], + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordOr dflags) [aa,bb], + CmmMachOp (mo_wordNot dflags) [CmmReg (CmmLocal res_r)] + ] + ], + mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1) + ] + ] +genericWordAddCOp _ _ = panic "genericWordAddCOp" + +-- | Implements branchless recovery of the carry flag @c@ by checking the +-- leftmost bits of both inputs @a@ and @b@ and result @r = a - b@: +-- +-- @ +-- c = ~a&b | (~a|b)&r +-- @ +-- +-- https://brodowsky.it-sky.net/2015/04/02/how-to-recover-the-carry-bit/ +genericWordSubCOp :: GenericOp +genericWordSubCOp [res_r, res_c] [aa, bb] + = do dflags <- getDynFlags + emit $ catAGraphs [ + mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]), + mkAssign (CmmLocal res_c) $ + CmmMachOp (mo_wordUShr dflags) [ + CmmMachOp (mo_wordOr dflags) [ + CmmMachOp (mo_wordAnd dflags) [ + CmmMachOp (mo_wordNot dflags) [aa], + bb + ], + CmmMachOp (mo_wordAnd dflags) 
                                     [
                CmmMachOp (mo_wordOr dflags) [
                  CmmMachOp (mo_wordNot dflags) [aa],
                  bb
                ],
                CmmReg (CmmLocal res_r)
              ]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericWordSubCOp _ _ = panic "genericWordSubCOp"

-- | Signed add with overflow flag: res_r = aa + bb, res_c = 1 on signed
-- overflow and 0 otherwise (derived without branches, see comment below).
genericIntAddCOp :: GenericOp
genericIntAddCOp [res_r, res_c] [aa, bb]
{-
   With some bit-twiddling, we can define int{Add,Sub}Czh portably in
   C, and without needing any comparisons.  This may not be the
   fastest way to do it - if you have better code, please send it! --SDM

   Return : r = a + b,  c = 0 if no overflow, 1 on overflow.

   We currently don't make use of the r value if c is != 0 (i.e.
   overflow), we just convert to big integers and try again.  This
   could be improved by making r and c the correct values for
   plugging into a new J#.

   { r = ((I_)(a)) + ((I_)(b));                                 \
     c = ((StgWord)(~(((I_)(a))^((I_)(b))) & (((I_)(a))^r)))    \
         >> (BITS_IN (I_) - 1);                                 \
   }
   Wading through the mass of bracketry, it seems to reduce to:
   c = ( (~(a^b)) & (a^r) ) >>unsigned (BITS_IN(I_)-1)

-}
 = do dflags <- getDynFlags
      emit $ catAGraphs [
        mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordAdd dflags) [aa,bb]),
        mkAssign (CmmLocal res_c) $
          CmmMachOp (mo_wordUShr dflags) [
            CmmMachOp (mo_wordAnd dflags) [
              CmmMachOp (mo_wordNot dflags) [CmmMachOp (mo_wordXor dflags) [aa,bb]],
              CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericIntAddCOp _ _ = panic "genericIntAddCOp"

-- | Signed subtract with overflow flag; same scheme as 'genericIntAddCOp'.
genericIntSubCOp :: GenericOp
genericIntSubCOp [res_r, res_c] [aa, bb]
{- Similarly:
   #define subIntCzh(r,c,a,b)                                   \
   { r = ((I_)(a)) - ((I_)(b));                                 \
     c = ((StgWord)((((I_)(a))^((I_)(b))) & (((I_)(a))^r)))     \
         >> (BITS_IN (I_) - 1);                                 \
   }

   c =  ((a^b) & (a^r)) >>unsigned (BITS_IN(I_)-1)
-}
 = do dflags <- getDynFlags
      emit $ catAGraphs [
        mkAssign (CmmLocal res_r) (CmmMachOp (mo_wordSub dflags) [aa,bb]),
        mkAssign (CmmLocal res_c) $
          CmmMachOp (mo_wordUShr dflags) [
            CmmMachOp (mo_wordAnd dflags) [
              CmmMachOp (mo_wordXor dflags) [aa,bb],
              CmmMachOp (mo_wordXor dflags) [aa, CmmReg (CmmLocal res_r)]
            ],
            mkIntExpr dflags (wORD_SIZE_IN_BITS dflags - 1)
          ]
        ]
genericIntSubCOp _ _ = panic "genericIntSubCOp"

-- | Generic double-word multiply: res_h:res_l = arg_x * arg_y, computed
-- with four half-word partial products so it works on any word width.
genericWordMul2Op :: GenericOp
genericWordMul2Op [res_h, res_l] [arg_x, arg_y]
 = do dflags <- getDynFlags
      let t = cmmExprType dflags arg_x
      xlyl <- liftM CmmLocal $ newTemp t
      xlyh <- liftM CmmLocal $ newTemp t
      xhyl <- liftM CmmLocal $ newTemp t
      r    <- liftM CmmLocal $ newTemp t
      -- This generic implementation is very simple and slow. We might
      -- well be able to do better, but for now this at least works.
      let topHalf x = CmmMachOp (MO_U_Shr (wordWidth dflags)) [x, hww]
          toTopHalf x = CmmMachOp (MO_Shl (wordWidth dflags)) [x, hww]
          bottomHalf x = CmmMachOp (MO_And (wordWidth dflags)) [x, hwm]
          add x y = CmmMachOp (MO_Add (wordWidth dflags)) [x, y]
          sum = foldl1 add
          mul x y = CmmMachOp (MO_Mul (wordWidth dflags)) [x, y]
          or x y = CmmMachOp (MO_Or (wordWidth dflags)) [x, y]
          -- hww = half-word width in bits; hwm = half-word mask
          hww = CmmLit (CmmInt (fromIntegral (widthInBits (halfWordWidth dflags)))
                        (wordWidth dflags))
          hwm = CmmLit (CmmInt (halfWordMask dflags) (wordWidth dflags))
      emit $ catAGraphs
             [mkAssign xlyl
                  (mul (bottomHalf arg_x) (bottomHalf arg_y)),
              mkAssign xlyh
                  (mul (bottomHalf arg_x) (topHalf arg_y)),
              mkAssign xhyl
                  (mul (topHalf arg_x) (bottomHalf arg_y)),
              mkAssign r
                  (sum [topHalf (CmmReg xlyl),
                        bottomHalf (CmmReg xhyl),
                        bottomHalf (CmmReg xlyh)]),
              mkAssign (CmmLocal res_l)
                  (or (bottomHalf (CmmReg xlyl))
                      (toTopHalf (CmmReg r))),
              mkAssign (CmmLocal res_h)
                  (sum [mul (topHalf arg_x) (topHalf arg_y),
                        topHalf (CmmReg xhyl),
                        topHalf (CmmReg xlyh),
                        topHalf (CmmReg r)])]
genericWordMul2Op _ _ = panic "genericWordMul2Op"

-- This replicates what we had in libraries/base/GHC/Float.hs:
--
--    abs x    | x == 0    = 0 -- handles (-0.0)
--             | x >  0    = x
--             | otherwise = negateFloat x
genericFabsOp :: Width -> GenericOp
genericFabsOp w [res_r] [aa]
 = do dflags <- getDynFlags
      let zero   = CmmLit (CmmFloat 0 w)

          eq x y = CmmMachOp (MO_F_Eq w) [x, y]
          gt x y = CmmMachOp (MO_F_Gt w) [x, y]

          neg x  = CmmMachOp (MO_F_Neg w) [x]

          g1 = catAGraphs [mkAssign (CmmLocal res_r) zero]
          g2 = catAGraphs [mkAssign (CmmLocal res_r) aa]

      res_t <- CmmLocal <$> newTemp (cmmExprType dflags aa)
      let g3 = catAGraphs [mkAssign res_t aa,
                           mkAssign (CmmLocal res_r) (neg (CmmReg res_t))]

      g4 <- mkCmmIfThenElse (gt aa zero) g2 g3

      emit =<< mkCmmIfThenElse (eq aa zero) g1 g4

genericFabsOp _ _ _ = panic "genericFabsOp"

-- These PrimOps are NOPs in Cmm

nopOp :: PrimOp -> Bool
nopOp Int2WordOp     = True
nopOp Word2IntOp     = True
nopOp Int2AddrOp     = True
nopOp Addr2IntOp     = True
nopOp ChrOp          = True  -- Int# and Char# are rep'd the same
nopOp OrdOp          = True
nopOp _              = False

-- These PrimOps turn into double casts

narrowOp :: PrimOp -> Maybe (Width -> Width -> MachOp, Width)
narrowOp Narrow8IntOp   = Just (MO_SS_Conv, W8)
narrowOp Narrow16IntOp  = Just (MO_SS_Conv, W16)
narrowOp Narrow32IntOp  = Just (MO_SS_Conv, W32)
narrowOp Narrow8WordOp  = Just (MO_UU_Conv, W8)
narrowOp Narrow16WordOp = Just (MO_UU_Conv, W16)
narrowOp Narrow32WordOp = Just (MO_UU_Conv, W32)
narrowOp _              = Nothing

-- Native word signless ops

-- | Maps a 'PrimOp' directly to a single 'MachOp' where one exists;
-- 'Nothing' for ops needing custom code (see the other tables in this file).
translateOp :: DynFlags -> PrimOp -> Maybe MachOp
translateOp dflags IntAddOp       = Just (mo_wordAdd dflags)
translateOp dflags IntSubOp       = Just (mo_wordSub dflags)
translateOp dflags WordAddOp      = Just (mo_wordAdd dflags)
translateOp dflags WordSubOp      = Just (mo_wordSub dflags)
translateOp dflags AddrAddOp      = Just (mo_wordAdd dflags)
translateOp dflags AddrSubOp      = Just (mo_wordSub dflags)

translateOp dflags IntEqOp        = Just (mo_wordEq dflags)
translateOp dflags IntNeOp        = Just (mo_wordNe dflags)
translateOp dflags WordEqOp       = Just (mo_wordEq dflags)
translateOp dflags WordNeOp       = Just (mo_wordNe dflags)
translateOp dflags AddrEqOp       = Just (mo_wordEq dflags)
translateOp dflags AddrNeOp       = Just (mo_wordNe dflags)

translateOp dflags AndOp          = Just (mo_wordAnd dflags)
translateOp dflags OrOp           = Just (mo_wordOr dflags)
translateOp dflags XorOp          = Just (mo_wordXor dflags)
translateOp dflags NotOp          = Just (mo_wordNot dflags)
translateOp dflags SllOp          = Just (mo_wordShl dflags)
translateOp dflags SrlOp          = Just (mo_wordUShr dflags)

translateOp dflags AddrRemOp      = Just (mo_wordURem dflags)

-- Native word signed ops

translateOp dflags IntMulOp        = Just (mo_wordMul dflags)
translateOp dflags IntMulMayOfloOp = Just (MO_S_MulMayOflo (wordWidth dflags))
translateOp dflags IntQuotOp       = Just (mo_wordSQuot dflags)
translateOp dflags IntRemOp        = Just (mo_wordSRem dflags)
translateOp dflags IntNegOp        = Just (mo_wordSNeg dflags)


translateOp dflags IntGeOp        = Just (mo_wordSGe dflags)
translateOp dflags IntLeOp        = Just (mo_wordSLe dflags)
translateOp dflags IntGtOp        = Just (mo_wordSGt dflags)
translateOp dflags IntLtOp        = Just (mo_wordSLt dflags)

translateOp dflags AndIOp         = Just (mo_wordAnd dflags)
translateOp dflags OrIOp          = Just (mo_wordOr dflags)
translateOp dflags XorIOp         = Just (mo_wordXor dflags)
translateOp dflags NotIOp         = Just (mo_wordNot dflags)
translateOp dflags ISllOp         = Just (mo_wordShl dflags)
translateOp dflags ISraOp         = Just (mo_wordSShr dflags)
translateOp dflags ISrlOp         = Just (mo_wordUShr dflags)

-- Native word unsigned ops

translateOp dflags WordGeOp       = Just (mo_wordUGe dflags)
translateOp dflags WordLeOp       = Just (mo_wordULe dflags)
translateOp dflags WordGtOp       = Just (mo_wordUGt dflags)
translateOp dflags WordLtOp       = Just (mo_wordULt dflags)

translateOp dflags WordMulOp      = Just (mo_wordMul dflags)
translateOp dflags WordQuotOp     = Just (mo_wordUQuot dflags)
translateOp dflags WordRemOp      = Just (mo_wordURem dflags)

translateOp dflags AddrGeOp       = Just (mo_wordUGe dflags)
translateOp dflags AddrLeOp       = Just (mo_wordULe dflags)
translateOp dflags AddrGtOp       = Just (mo_wordUGt dflags)
translateOp dflags AddrLtOp       = Just (mo_wordULt dflags)

-- Int8# signed ops

translateOp dflags Int8Extend     = Just (MO_SS_Conv W8 (wordWidth dflags))
translateOp dflags Int8Narrow     = Just (MO_SS_Conv (wordWidth dflags) W8)
translateOp _      Int8NegOp      = Just (MO_S_Neg W8)
translateOp _      Int8AddOp      = Just (MO_Add W8)
translateOp _      Int8SubOp      = Just (MO_Sub W8)
translateOp _      Int8MulOp      = Just (MO_Mul W8)
translateOp _      Int8QuotOp     = Just (MO_S_Quot W8)
translateOp _      Int8RemOp      = Just (MO_S_Rem W8)

translateOp _      Int8EqOp       = Just (MO_Eq W8)
translateOp _      Int8GeOp       = Just (MO_S_Ge W8)
translateOp _      Int8GtOp       = Just (MO_S_Gt W8)
translateOp _      Int8LeOp       = Just (MO_S_Le W8)
translateOp _      Int8LtOp       = Just (MO_S_Lt W8)
translateOp _      Int8NeOp       = Just (MO_Ne W8)

-- Word8# unsigned ops

translateOp dflags Word8Extend    = Just (MO_UU_Conv W8 (wordWidth dflags))
translateOp dflags Word8Narrow    = Just (MO_UU_Conv (wordWidth dflags) W8)
translateOp _      Word8NotOp     = Just (MO_Not W8)
translateOp _      Word8AddOp     = Just (MO_Add W8)
translateOp _      Word8SubOp     = Just (MO_Sub W8)
translateOp _      Word8MulOp     = Just (MO_Mul W8)
translateOp _      Word8QuotOp    = Just (MO_U_Quot W8)
translateOp _      Word8RemOp     = Just (MO_U_Rem W8)

translateOp _      Word8EqOp      = Just (MO_Eq W8)
translateOp _      Word8GeOp      = Just (MO_U_Ge W8)
translateOp _      Word8GtOp      = Just (MO_U_Gt W8)
translateOp _      Word8LeOp      = Just (MO_U_Le W8)
translateOp _      Word8LtOp      = Just (MO_U_Lt W8)
translateOp _      Word8NeOp      = Just (MO_Ne W8)

-- Int16# signed ops

translateOp dflags Int16Extend    = Just (MO_SS_Conv W16 (wordWidth dflags))
translateOp dflags Int16Narrow    = Just (MO_SS_Conv (wordWidth dflags) W16)
translateOp _      Int16NegOp     = Just (MO_S_Neg W16)
translateOp _      Int16AddOp     = Just (MO_Add W16)
translateOp _      Int16SubOp     = Just (MO_Sub W16)
translateOp _      Int16MulOp     = Just (MO_Mul W16)
translateOp _      Int16QuotOp    = Just (MO_S_Quot W16)
translateOp _      Int16RemOp     = Just (MO_S_Rem W16)

translateOp _      Int16EqOp      = Just (MO_Eq W16)
translateOp _      Int16GeOp      = Just (MO_S_Ge W16)
translateOp _      Int16GtOp      = Just (MO_S_Gt W16)
translateOp _      Int16LeOp      = Just (MO_S_Le W16)
translateOp _      Int16LtOp      = Just (MO_S_Lt W16)
translateOp _      Int16NeOp      = Just (MO_Ne W16)

-- Word16# unsigned ops

translateOp dflags Word16Extend   = Just (MO_UU_Conv W16 (wordWidth dflags))
translateOp dflags Word16Narrow   = Just (MO_UU_Conv (wordWidth dflags) W16)
translateOp _      Word16NotOp    = Just (MO_Not W16)
translateOp _      Word16AddOp    = Just (MO_Add W16)
translateOp _      Word16SubOp    = Just (MO_Sub W16)
translateOp _      Word16MulOp    = Just (MO_Mul W16)
translateOp _      Word16QuotOp   = Just (MO_U_Quot W16)
translateOp _      Word16RemOp    = Just (MO_U_Rem W16)

translateOp _      Word16EqOp     = Just (MO_Eq W16)
translateOp _      Word16GeOp     = Just (MO_U_Ge W16)
translateOp _      Word16GtOp     = Just (MO_U_Gt W16)
translateOp _      Word16LeOp     = Just (MO_U_Le W16)
translateOp _      Word16LtOp     = Just (MO_U_Lt W16)
translateOp _      Word16NeOp     = Just (MO_Ne W16)

-- Char# ops

translateOp dflags CharEqOp       = Just (MO_Eq (wordWidth dflags))
translateOp dflags CharNeOp       = Just (MO_Ne (wordWidth dflags))
translateOp dflags CharGeOp       = Just (MO_U_Ge (wordWidth dflags))
translateOp dflags CharLeOp       = Just (MO_U_Le (wordWidth dflags))
translateOp dflags CharGtOp       = Just (MO_U_Gt (wordWidth dflags))
translateOp dflags CharLtOp       = Just (MO_U_Lt (wordWidth dflags))

-- Double ops

translateOp _      DoubleEqOp     = Just (MO_F_Eq W64)
translateOp _      DoubleNeOp     = Just (MO_F_Ne W64)
translateOp _      DoubleGeOp     = Just (MO_F_Ge W64)
translateOp _      DoubleLeOp     = Just (MO_F_Le W64)
translateOp _      DoubleGtOp     = Just (MO_F_Gt W64)
translateOp _      DoubleLtOp     = Just (MO_F_Lt W64)

translateOp _      DoubleAddOp    = Just (MO_F_Add W64)
translateOp _      DoubleSubOp    = Just (MO_F_Sub W64)
translateOp _      DoubleMulOp    = Just (MO_F_Mul W64)
translateOp _      DoubleDivOp    = Just (MO_F_Quot W64)
translateOp _      DoubleNegOp    = Just (MO_F_Neg W64)

-- Float ops

translateOp _      FloatEqOp      = Just (MO_F_Eq W32)
translateOp _      FloatNeOp      = Just (MO_F_Ne W32)
translateOp _      FloatGeOp      = Just (MO_F_Ge W32)
translateOp _      FloatLeOp      = Just (MO_F_Le W32)
translateOp _      FloatGtOp      = Just (MO_F_Gt W32)
translateOp _      FloatLtOp      = Just (MO_F_Lt W32)

translateOp _      FloatAddOp     = Just (MO_F_Add W32)
translateOp _      FloatSubOp     = Just (MO_F_Sub W32)
translateOp _      FloatMulOp     = Just (MO_F_Mul W32)
translateOp _      FloatDivOp     = Just (MO_F_Quot W32)
translateOp _      FloatNegOp     = Just (MO_F_Neg W32)

-- Vector ops

translateOp _ (VecAddOp FloatVec n w)  = Just (MO_VF_Add n w)
translateOp _ (VecSubOp FloatVec n w)  = Just (MO_VF_Sub n w)
translateOp _ (VecMulOp FloatVec n w)  = Just (MO_VF_Mul n w)
translateOp _ (VecDivOp FloatVec n w)  = Just (MO_VF_Quot n w)
translateOp _ (VecNegOp FloatVec n w)  = Just (MO_VF_Neg n w)

translateOp _ (VecAddOp IntVec n w)    = Just (MO_V_Add n w)
translateOp _ (VecSubOp IntVec n w)    = Just (MO_V_Sub n w)
translateOp _ (VecMulOp IntVec n w)    = Just (MO_V_Mul n w)
translateOp _ (VecQuotOp IntVec n w)   = Just (MO_VS_Quot n w)
translateOp _ (VecRemOp IntVec n w)    = Just (MO_VS_Rem n w)
translateOp _ (VecNegOp IntVec n w)    = Just (MO_VS_Neg n w)

translateOp _ (VecAddOp WordVec n w)   = Just (MO_V_Add n w)
translateOp _ (VecSubOp WordVec n w)   = Just (MO_V_Sub n w)
translateOp _ (VecMulOp WordVec n w)   = Just (MO_V_Mul n w)
translateOp _ (VecQuotOp WordVec n w)  = Just (MO_VU_Quot n w)
translateOp _ (VecRemOp WordVec n w)   = Just (MO_VU_Rem n w)

-- Conversions

translateOp dflags Int2DoubleOp   = Just (MO_SF_Conv (wordWidth dflags) W64)
translateOp dflags Double2IntOp   = Just (MO_FS_Conv W64 (wordWidth dflags))

translateOp dflags Int2FloatOp    = Just (MO_SF_Conv (wordWidth dflags) W32)
translateOp dflags Float2IntOp    = Just (MO_FS_Conv W32 (wordWidth dflags))

translateOp _      Float2DoubleOp = Just (MO_FF_Conv W32 W64)
translateOp _      Double2FloatOp = Just (MO_FF_Conv W64 W32)

-- Word comparisons masquerading as more exotic things.
-- (These "same" ops all compile to plain word equality on the pointers.)

translateOp dflags SameMutVarOp           = Just (mo_wordEq dflags)
translateOp dflags SameMVarOp             = Just (mo_wordEq dflags)
translateOp dflags SameMutableArrayOp     = Just (mo_wordEq dflags)
translateOp dflags SameMutableByteArrayOp = Just (mo_wordEq dflags)
translateOp dflags SameMutableArrayArrayOp= Just (mo_wordEq dflags)
translateOp dflags SameSmallMutableArrayOp= Just (mo_wordEq dflags)
translateOp dflags SameTVarOp             = Just (mo_wordEq dflags)
translateOp dflags EqStablePtrOp          = Just (mo_wordEq dflags)
-- See Note [Comparing stable names]
translateOp dflags EqStableNameOp         = Just (mo_wordEq dflags)

translateOp _ _ = Nothing

-- Note [Comparing stable names]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- A StableName# is actually a pointer to a stable name object (SNO)
-- containing an index into the stable name table (SNT). We
-- used to compare StableName#s by following the pointers to the
-- SNOs and checking whether they held the same SNT indices. However,
-- this is not necessary: there is a one-to-one correspondence
-- between SNOs and entries in the SNT, so simple pointer equality
-- does the trick.

-- These primops are implemented by CallishMachOps, because they sometimes
-- turn into foreign calls depending on the backend.
-- | Floating-point primops that map to a 'CallishMachOp': the backend
-- decides whether each becomes an instruction or a foreign call.
-- Returns 'Nothing' for every other primop.
callishOp :: PrimOp -> Maybe CallishMachOp
callishOp op = case op of
  -- Double# (F64) operations
  DoublePowerOp  -> Just MO_F64_Pwr
  DoubleSinOp    -> Just MO_F64_Sin
  DoubleCosOp    -> Just MO_F64_Cos
  DoubleTanOp    -> Just MO_F64_Tan
  DoubleSinhOp   -> Just MO_F64_Sinh
  DoubleCoshOp   -> Just MO_F64_Cosh
  DoubleTanhOp   -> Just MO_F64_Tanh
  DoubleAsinOp   -> Just MO_F64_Asin
  DoubleAcosOp   -> Just MO_F64_Acos
  DoubleAtanOp   -> Just MO_F64_Atan
  DoubleAsinhOp  -> Just MO_F64_Asinh
  DoubleAcoshOp  -> Just MO_F64_Acosh
  DoubleAtanhOp  -> Just MO_F64_Atanh
  DoubleLogOp    -> Just MO_F64_Log
  DoubleLog1POp  -> Just MO_F64_Log1P
  DoubleExpOp    -> Just MO_F64_Exp
  DoubleExpM1Op  -> Just MO_F64_ExpM1
  DoubleSqrtOp   -> Just MO_F64_Sqrt

  -- Float# (F32) operations
  FloatPowerOp   -> Just MO_F32_Pwr
  FloatSinOp     -> Just MO_F32_Sin
  FloatCosOp     -> Just MO_F32_Cos
  FloatTanOp     -> Just MO_F32_Tan
  FloatSinhOp    -> Just MO_F32_Sinh
  FloatCoshOp    -> Just MO_F32_Cosh
  FloatTanhOp    -> Just MO_F32_Tanh
  FloatAsinOp    -> Just MO_F32_Asin
  FloatAcosOp    -> Just MO_F32_Acos
  FloatAtanOp    -> Just MO_F32_Atan
  FloatAsinhOp   -> Just MO_F32_Asinh
  FloatAcoshOp   -> Just MO_F32_Acosh
  FloatAtanhOp   -> Just MO_F32_Atanh
  FloatLogOp     -> Just MO_F32_Log
  FloatLog1POp   -> Just MO_F32_Log1P
  FloatExpOp     -> Just MO_F32_Exp
  FloatExpM1Op   -> Just MO_F32_ExpM1
  FloatSqrtOp    -> Just MO_F32_Sqrt

  _              -> Nothing

------------------------------------------------------------------------------
-- Helpers for translating various minor variants of array indexing.
-- | Read an element of type @rep@ at @addr + idx@ (no header offset;
-- raw 'Addr#' access).
doIndexOffAddrOp :: Maybe MachOp
                 -> CmmType
                 -> [LocalReg]
                 -> [CmmExpr]
                 -> FCode ()
doIndexOffAddrOp maybe_post_read_cast rep [res] [addr,idx]
   = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr rep idx
doIndexOffAddrOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexOffAddrOp"

-- | As 'doIndexOffAddrOp', but the index is scaled by @idx_rep@ rather
-- than by the element type @rep@.
doIndexOffAddrOpAs :: Maybe MachOp
                   -> CmmType
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doIndexOffAddrOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
   = mkBasicIndexedRead 0 maybe_post_read_cast rep res addr idx_rep idx
doIndexOffAddrOpAs _ _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexOffAddrOpAs"

-- | Read an element from a ByteArray#, skipping the array header.
doIndexByteArrayOp :: Maybe MachOp
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doIndexByteArrayOp maybe_post_read_cast rep [res] [addr,idx]
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr rep idx
doIndexByteArrayOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexByteArrayOp"

-- | As 'doIndexByteArrayOp', but with a separate index-scaling type.
doIndexByteArrayOpAs :: Maybe MachOp
                     -> CmmType
                     -> CmmType
                     -> [LocalReg]
                     -> [CmmExpr]
                     -> FCode ()
doIndexByteArrayOpAs maybe_post_read_cast rep idx_rep [res] [addr,idx]
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrWordsHdrSize dflags) maybe_post_read_cast rep res addr idx_rep idx
doIndexByteArrayOpAs _ _ _ _ _
   = panic "GHC.StgToCmm.Prim: doIndexByteArrayOpAs"

-- | Read a GC-managed pointer out of an Array#, skipping the ptr-array header.
doReadPtrArrayOp :: LocalReg
                 -> CmmExpr
                 -> CmmExpr
                 -> FCode ()
doReadPtrArrayOp res addr idx
   = do dflags <- getDynFlags
        mkBasicIndexedRead (arrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr (gcWord dflags) idx

-- | Write @val@ at @addr + idx@ (raw 'Addr#' access, no header offset).
doWriteOffAddrOp :: Maybe MachOp
                 -> CmmType
                 -> [LocalReg]
                 -> [CmmExpr]
                 -> FCode ()
doWriteOffAddrOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
   = mkBasicIndexedWrite 0 maybe_pre_write_cast addr idx_ty idx val
doWriteOffAddrOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doWriteOffAddrOp"

-- | Write an element into a MutableByteArray#, skipping the array header.
doWriteByteArrayOp :: Maybe MachOp
                   -> CmmType
                   -> [LocalReg]
                   -> [CmmExpr]
                   -> FCode ()
doWriteByteArrayOp maybe_pre_write_cast idx_ty [] [addr,idx,val]
   = do dflags <- getDynFlags
        mkBasicIndexedWrite (arrWordsHdrSize dflags) maybe_pre_write_cast addr idx_ty idx val
doWriteByteArrayOp _ _ _ _
   = panic "GHC.StgToCmm.Prim: doWriteByteArrayOp"

-- | Write a pointer into a MutableArray#: emits a write barrier, performs
-- the store, marks the array dirty, and sets the card-table byte for the
-- written index.
doWritePtrArrayOp :: CmmExpr
                  -> CmmExpr
                  -> CmmExpr
                  -> FCode ()
doWritePtrArrayOp addr idx val
  = do dflags <- getDynFlags
       let ty = cmmExprType dflags val
       -- This write barrier is to ensure that the heap writes to the object
       -- referred to by val have happened before we write val into the array.
       -- See #12469 for details.
       emitPrimCall [] MO_WriteBarrier []
       mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val
       emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
       -- the write barrier.  We must write a byte into the mark table:
       -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
       emit $ mkStore (
         cmmOffsetExpr dflags
          (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags))
                         (loadArrPtrsSize dflags addr))
          (CmmMachOp (mo_wordUShr dflags) [idx,
                                           mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)])
         ) (CmmLit (CmmInt 1 W8))

-- | Load the @ptrs@ field (element count) of a StgMutArrPtrs closure.
loadArrPtrsSize :: DynFlags -> CmmExpr -> CmmExpr
loadArrPtrsSize dflags addr = CmmLoad (cmmOffsetB dflags addr off) (bWord dflags)
 where off = fixedHdrSize dflags + oFFSET_StgMutArrPtrs_ptrs dflags

mkBasicIndexedRead :: ByteOff      -- Initial offset in bytes
                   -> Maybe MachOp -- Optional result cast
                   -> CmmType      -- Type of element we are accessing
                   -> LocalReg     -- Destination
                   -> CmmExpr      -- Base address
                   -> CmmType      -- Type of element by which we are indexing
                   -> CmmExpr      -- Index
                   -> FCode ()
mkBasicIndexedRead off Nothing ty res base idx_ty idx
   = do dflags <- getDynFlags
        emitAssign (CmmLocal res) (cmmLoadIndexOffExpr dflags off ty base idx_ty idx)
mkBasicIndexedRead off (Just cast) ty res base idx_ty idx
   = do dflags <- getDynFlags
        emitAssign (CmmLocal res) (CmmMachOp cast [
                                   cmmLoadIndexOffExpr dflags off ty base idx_ty idx])

mkBasicIndexedWrite :: ByteOff      -- Initial offset in bytes
                    -> Maybe MachOp -- Optional value cast
                    -> CmmExpr      -- Base address
                    -> CmmType      -- Type of element by which we are indexing
                    -> CmmExpr      -- Index
                    -> CmmExpr      -- Value to write
                    -> FCode ()
mkBasicIndexedWrite off Nothing base idx_ty idx val
   = do dflags <- getDynFlags
        emitStore (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) val
mkBasicIndexedWrite off (Just cast) base idx_ty idx val
   -- apply the cast to the value, then delegate to the no-cast case
   = mkBasicIndexedWrite off Nothing base idx_ty idx (CmmMachOp cast [val])

-- ----------------------------------------------------------------------------
-- Misc utils

cmmIndexOffExpr :: DynFlags
                -> ByteOff  -- Initial offset in bytes
                -> Width    -- Width of element by which we are indexing
                -> CmmExpr  -- Base address
                -> CmmExpr  -- Index
                -> CmmExpr
cmmIndexOffExpr dflags off width base idx
   = cmmIndexExpr dflags width (cmmOffsetB dflags base off) idx

cmmLoadIndexOffExpr :: DynFlags
                    -> ByteOff  -- Initial offset in bytes
                    -> CmmType  -- Type of element we are accessing
                    -> CmmExpr  -- Base address
                    -> CmmType  -- Type of element by which we are indexing
                    -> CmmExpr  -- Index
                    -> CmmExpr
cmmLoadIndexOffExpr dflags off ty base idx_ty idx
   = CmmLoad (cmmIndexOffExpr dflags off (typeWidth idx_ty) base idx) ty

-- | Overwrite a closure's info pointer (a plain store to the closure address).
setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr

------------------------------------------------------------------------------
-- Helpers for translating vector primops.
-- | The Cmm type of a vector: @n@ elements of the category/width's type.
vecVmmType :: PrimOpVecCat -> Length -> Width -> CmmType
vecVmmType pocat n w = vec n (vecCmmCat pocat w)

-- | Element type for each vector category: bits for Int/Word, float for Float.
vecCmmCat :: PrimOpVecCat -> Width -> CmmType
vecCmmCat IntVec   = cmmBits
vecCmmCat WordVec  = cmmBits
vecCmmCat FloatVec = cmmFloat

-- | Cast (if any) applied to a scalar before inserting it into a vector:
-- sub-word Int/Word elements are narrowed from the native word.
vecElemInjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
vecElemInjectCast _      FloatVec _   =  Nothing
vecElemInjectCast dflags IntVec   W8  =  Just (mo_WordTo8  dflags)
vecElemInjectCast dflags IntVec   W16 =  Just (mo_WordTo16 dflags)
vecElemInjectCast dflags IntVec   W32 =  Just (mo_WordTo32 dflags)
vecElemInjectCast _      IntVec   W64 =  Nothing
vecElemInjectCast dflags WordVec  W8  =  Just (mo_WordTo8  dflags)
vecElemInjectCast dflags WordVec  W16 =  Just (mo_WordTo16 dflags)
vecElemInjectCast dflags WordVec  W32 =  Just (mo_WordTo32 dflags)
vecElemInjectCast _      WordVec  W64 =  Nothing
vecElemInjectCast _      _        _   =  Nothing

-- | Cast (if any) applied to a scalar extracted from a vector: sub-word
-- elements are widened to the native word (sign- or zero-extended).
vecElemProjectCast :: DynFlags -> PrimOpVecCat -> Width -> Maybe MachOp
vecElemProjectCast _      FloatVec _   =  Nothing
vecElemProjectCast dflags IntVec   W8  =  Just (mo_s_8ToWord  dflags)
vecElemProjectCast dflags IntVec   W16 =  Just (mo_s_16ToWord dflags)
vecElemProjectCast dflags IntVec   W32 =  Just (mo_s_32ToWord dflags)
vecElemProjectCast _      IntVec   W64 =  Nothing
vecElemProjectCast dflags WordVec  W8  =  Just (mo_u_8ToWord  dflags)
vecElemProjectCast dflags WordVec  W16 =  Just (mo_u_16ToWord dflags)
vecElemProjectCast dflags WordVec  W32 =  Just (mo_u_32ToWord dflags)
vecElemProjectCast _      WordVec  W64 =  Nothing
vecElemProjectCast _      _        _   =  Nothing


-- NOTE [SIMD Design for the future]
-- Check to make sure that we can generate code for the specified vector type
-- given the current set of dynamic flags.
-- Currently these checks are specific to x86 and x86_64 architecture.
-- This should be fixed!
-- In particular,
-- 1) Add better support for other architectures! (this may require a redesign)
-- 2) Decouple design choices from LLVM's pseudo SIMD model!
-- The high level LLVM naive rep makes per CPU family SIMD generation is own
-- optimization problem, and hides important differences in eg ARM vs x86_64 simd
-- 3) Depending on the architecture, the SIMD registers may also support general
--    computations on Float/Double/Word/Int scalars, but currently on
--    for example x86_64, we always put Word/Int (or sized) in GPR
--    (general purpose) registers. Would relaxing that allow for
--    useful optimization opportunities?
--      Phrased differently, it is worth experimenting with supporting
--    different register mapping strategies than we currently have, especially if
--    someday we want SIMD to be a first class denizen in GHC along with scalar
--    values!
--      The current design with respect to register mapping of scalars could
--    very well be the best,but exploring the design space and doing careful
--    measurments is the only only way to validate that.
--      In some next generation CPU ISAs, notably RISC V, the SIMD extension
--    includes  support for a sort of run time CPU dependent vectorization parameter,
--    where a loop may act upon a single scalar each iteration OR some 2,4,8 ...
--    element chunk! Time will tell if that direction sees wide adoption,
--    but it is from that context that unifying our handling of simd and scalars
--    may benefit. It is not likely to benefit current architectures, though
--    it may very well be a design perspective that helps guide improving the NCG.


-- | Reject (via 'sorry') vector primops that the current backend/ISA flags
-- cannot compile: requires the LLVM backend, plus the SSE/AVX level matching
-- the vector's total width.
checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
checkVecCompatibility dflags vcat l w = do
    when (hscTarget dflags /= HscLlvm) $ do
        sorry $ unlines ["SIMD vector instructions require the LLVM back-end."
                        ,"Please use -fllvm."]
    check vecWidth vcat l w
  where
    check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
    check W128 FloatVec 4 W32 | not (isSseEnabled dflags) =
        sorry $ "128-bit wide single-precision floating point " ++
                "SIMD vector instructions require at least -msse."
    check W128 _ _ _ | not (isSse2Enabled dflags) =
        sorry $ "128-bit wide integer and double precision " ++
                "SIMD vector instructions require at least -msse2."
    check W256 FloatVec _ _ | not (isAvxEnabled dflags) =
        sorry $ "256-bit wide floating point " ++
                "SIMD vector instructions require at least -mavx."
    check W256 _ _ _ | not (isAvx2Enabled dflags) =
        sorry $ "256-bit wide integer " ++
                "SIMD vector instructions require at least -mavx2."
    check W512 _ _ _ | not (isAvx512fEnabled dflags) =
        sorry $ "512-bit wide " ++
                "SIMD vector instructions require -mavx512f."
    check _ _ _ _ = return ()

    vecWidth = typeWidth (vecVmmType vcat l w)

------------------------------------------------------------------------------
-- Helpers for translating vector packing and unpacking.

-- | Insert scalar elements one by one into an initial vector @z@, assigning
-- the final vector to @res@.
doVecPackOp :: Maybe MachOp  -- Cast from element to vector component
            -> CmmType       -- Type of vector
            -> CmmExpr       -- Initial vector
            -> [CmmExpr]     -- Elements
            -> CmmFormal     -- Destination for result
            -> FCode ()
doVecPackOp maybe_pre_write_cast ty z es res = do
    dst <- newTemp ty
    emitAssign (CmmLocal dst) z
    vecPack dst es 0
  where
    vecPack :: CmmFormal -> [CmmExpr] -> Int -> FCode ()
    vecPack src [] _ =
        -- all elements inserted; copy the accumulated vector to the result
        emitAssign (CmmLocal res) (CmmReg (CmmLocal src))

    vecPack src (e : es) i = do
        dst <- newTemp ty
        if isFloatType (vecElemType ty)
          then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
                                          [CmmReg (CmmLocal src), cast e, iLit])
          else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
                                          [CmmReg (CmmLocal src), cast e, iLit])
        vecPack dst es (i + 1)
      where
        -- vector indices are always 32-bits
        iLit = CmmLit (CmmInt (toInteger i) W32)

    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_pre_write_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

-- | Extract every element of vector @e@ into the result registers, in order.
doVecUnpackOp :: Maybe MachOp  -- Cast from vector component to element result
              -> CmmType       -- Type of vector
              -> CmmExpr       -- Vector
              -> [CmmFormal]   -- Element results
              -> FCode ()
doVecUnpackOp maybe_post_read_cast ty e res =
    vecUnpack res 0
  where
    vecUnpack :: [CmmFormal] -> Int -> FCode ()
    vecUnpack [] _ =
        return ()

    vecUnpack (r : rs) i = do
        if isFloatType (vecElemType ty)
          then emitAssign (CmmLocal r) (cast (CmmMachOp (MO_VF_Extract len wid)
                                             [e, iLit]))
          else emitAssign (CmmLocal r) (cast (CmmMachOp (MO_V_Extract len wid)
                                             [e, iLit]))
        vecUnpack rs (i + 1)
      where
        -- vector indices are always 32-bits
        iLit = CmmLit (CmmInt (toInteger i) W32)

    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_post_read_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

-- | Insert one element into @src@ at a runtime index, assigning to @res@.
doVecInsertOp :: Maybe MachOp  -- Cast from element to vector component
              -> CmmType       -- Vector type
              -> CmmExpr       -- Source vector
              -> CmmExpr       -- Element
              -> CmmExpr       -- Index at which to insert element
              -> CmmFormal     -- Destination for result
              -> FCode ()
doVecInsertOp maybe_pre_write_cast ty src e idx res = do
    dflags <- getDynFlags
    -- vector indices are always 32-bits
    let idx' :: CmmExpr
        idx' = CmmMachOp (MO_SS_Conv (wordWidth dflags) W32) [idx]
    if isFloatType (vecElemType ty)
      then emitAssign (CmmLocal res) (CmmMachOp (MO_VF_Insert len wid) [src, cast e, idx'])
      else emitAssign (CmmLocal res) (CmmMachOp (MO_V_Insert len wid) [src, cast e, idx'])
  where
    cast :: CmmExpr -> CmmExpr
    cast val = case maybe_pre_write_cast of
                 Nothing   -> val
                 Just cast -> CmmMachOp cast [val]

    len :: Length
    len = vecLength ty

    wid :: Width
    wid = typeWidth (vecElemType ty)

------------------------------------------------------------------------------
-- Helpers for translating prefetching.


-- | Translate byte array prefetch operations into proper primcalls.
doPrefetchByteArrayOp :: Int          -- ^ locality level 0-3
                      -> [CmmExpr]    -- ^ [array, byte index]
                      -> FCode ()
doPrefetchByteArrayOp locality  [addr,idx]
   = do dflags <- getDynFlags
        mkBasicPrefetch locality (arrWordsHdrSize dflags)  addr idx
doPrefetchByteArrayOp _ _
   = panic "GHC.StgToCmm.Prim: doPrefetchByteArrayOp"

-- | Translate mutable byte array prefetch operations into proper primcalls.
doPrefetchMutableByteArrayOp :: Int        -- ^ locality level 0-3
                             -> [CmmExpr]  -- ^ [array, byte index]
                             -> FCode ()
doPrefetchMutableByteArrayOp locality  [addr,idx]
   = do dflags <- getDynFlags
        mkBasicPrefetch locality (arrWordsHdrSize dflags)  addr idx
doPrefetchMutableByteArrayOp _ _
   -- Fixed: this fall-through previously panicked with the message
   -- "doPrefetchByteArrayOp" (copy-paste from the function above),
   -- misreporting which primop translation failed.
   = panic "GHC.StgToCmm.Prim: doPrefetchMutableByteArrayOp"

-- | Translate address prefetch operations into proper primcalls.
doPrefetchAddrOp :: Int          -- ^ locality level 0-3
                 -> [CmmExpr]    -- ^ [address, byte index]
                 -> FCode ()
doPrefetchAddrOp locality   [addr,idx]
   = mkBasicPrefetch locality 0  addr idx
doPrefetchAddrOp _ _
   = panic "GHC.StgToCmm.Prim: doPrefetchAddrOp"

-- | Translate value prefetch operations into proper primcalls.
doPrefetchValueOp :: Int         -- ^ locality level 0-3
                  -> [CmmExpr]   -- ^ [address]
                  -> FCode ()
doPrefetchValueOp  locality   [addr]
  =  do dflags <- getDynFlags
        mkBasicPrefetch locality 0 addr  (CmmLit (CmmInt 0 (wordWidth dflags)))
doPrefetchValueOp _ _
  = panic "GHC.StgToCmm.Prim: doPrefetchValueOp"

-- | helper to generate prefetch primcalls
mkBasicPrefetch :: Int          -- Locality level 0-3
                -> ByteOff      -- Initial offset in bytes
                -> CmmExpr      -- Base address
                -> CmmExpr      -- Index
                -> FCode ()
mkBasicPrefetch locality off base idx
   = do dflags <- getDynFlags
        emitPrimCall [] (MO_Prefetch_Data locality) [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx]
        return ()

-- ----------------------------------------------------------------------------
-- Allocating byte arrays

-- | Takes a register to return the newly allocated array in and the
-- size of the new array in bytes. Allocates a new
-- 'MutableByteArray#'.
-- | Allocate a new 'MutableByteArray#' of @n@ bytes and assign the closure
-- pointer to @res_r@.  The byte count is stored in the StgArrBytes header.
doNewByteArrayOp :: CmmFormal -> ByteOff -> FCode ()
doNewByteArrayOp res_r n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr mkArrWords_infoLabel
        rep = arrWordsRep dflags n

    -- Ticky accounting: header size, payload size, zero slop.
    tickyAllocPrim (mkIntExpr dflags (arrWordsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgArrBytes_bytes dflags)
                     ]

    emit $ mkAssign (CmmLocal res_r) base

-- ----------------------------------------------------------------------------
-- Comparing byte arrays

-- | @compareByteArrays#@: compare @n@ bytes of two byte arrays starting at
-- the given offsets, memcmp(3)-style, writing the result into @res@.
doCompareByteArraysOp :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                     -> FCode ()
doCompareByteArraysOp res ba1 ba1_off ba2 ba2_off n = do
    dflags <- getDynFlags
    ba1_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba1 (arrWordsHdrSize dflags)) ba1_off
    ba2_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba2 (arrWordsHdrSize dflags)) ba2_off

    -- short-cut in case of equal pointers avoiding a costly
    -- subroutine call to the memcmp(3) routine; the Cmm logic below
    -- results in assembly code being generated for
    --
    --   cmpPrefix10 :: ByteArray# -> ByteArray# -> Int#
    --   cmpPrefix10 ba1 ba2 = compareByteArrays# ba1 0# ba2 0# 10#
    --
    -- that looks like
    --
    --          leaq 16(%r14),%rax
    --          leaq 16(%rsi),%rbx
    --          xorl %ecx,%ecx
    --          cmpq %rbx,%rax
    --          je l_ptr_eq
    --
    --          ; NB: the common case (unequal pointers) falls-through
    --          ; the conditional jump, and therefore matches the
    --          ; usual static branch prediction convention of modern cpus
    --
    --          subq $8,%rsp
    --          movq %rbx,%rsi
    --          movq %rax,%rdi
    --          movl $10,%edx
    --          xorl %eax,%eax
    --          call memcmp
    --          addq $8,%rsp
    --          movslq %eax,%rax
    --          movq %rax,%rcx
    --  l_ptr_eq:
    --          movq %rcx,%rbx
    --          jmp *(%rbp)

    l_ptr_eq <- newBlockId
    l_ptr_ne <- newBlockId

    -- res is pre-set to 0 (the "equal" answer); only the not-equal branch
    -- overwrites it with memcmp's result.
    emit (mkAssign (CmmLocal res) (zeroExpr dflags))
    emit (mkCbranch (cmmEqWord dflags ba1_p ba2_p)
                    l_ptr_eq l_ptr_ne (Just False))

    emitLabel l_ptr_ne
    emitMemcmpCall res ba1_p ba2_p n 1

    emitLabel l_ptr_eq

-- ----------------------------------------------------------------------------
-- Copying byte arrays

-- | Takes a source 'ByteArray#', an offset in the source array, a
-- destination 'MutableByteArray#', an offset into the destination
-- array, and the number of bytes to copy.  Copies the given number of
-- bytes from the source array to the destination array.
doCopyByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                  -> FCode ()
doCopyByteArrayOp = emitCopyByteArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes align =
        emitMemcpyCall dst_p src_p bytes align

-- | Takes a source 'MutableByteArray#', an offset in the source
-- array, a destination 'MutableByteArray#', an offset into the
-- destination array, and the number of bytes to copy.  Copies the
-- given number of bytes from the source array to the destination
-- array.
doCopyMutableByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                         -> FCode ()
doCopyMutableByteArrayOp = emitCopyByteArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes align = do
        dflags <- getDynFlags
        -- Emit a runtime pointer-equality test: same array => memmove
        -- (overlap-safe), different arrays => memcpy (faster).
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p bytes align)
            (getCode $ emitMemcpyCall dst_p src_p bytes align)
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the byte-array copy primops: computes payload
-- pointers past the StgArrBytes header, derives the best provable
-- alignment, and hands off to the supplied copy strategy.
emitCopyByteArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                      -> Alignment -> FCode ())
                  -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                  -> FCode ()
emitCopyByteArray copy src src_off dst dst_off n = do
    dflags <- getDynFlags
    let byteArrayAlignment = wordAlignment dflags
        srcOffAlignment = cmmExprAlignment src_off
        dstOffAlignment = cmmExprAlignment dst_off
        align = minimum [byteArrayAlignment, srcOffAlignment, dstOffAlignment]
    dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
    src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
    copy src dst dst_p src_p n align

-- | Takes a source 'ByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy.  Copies the given
-- number of bytes from the source array to the destination memory region.
doCopyByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
doCopyByteArrayToAddrOp src src_off dst_p bytes = do
    -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
    dflags <- getDynFlags
    src_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags src (arrWordsHdrSize dflags)) src_off
    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)

-- | Takes a source 'MutableByteArray#', an offset in the source array, a
-- destination 'Addr#', and the number of bytes to copy.  Copies the given
-- number of bytes from the source array to the destination memory region.
doCopyMutableByteArrayToAddrOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                               -> FCode ()
-- Same code as the immutable version: an Addr# destination can never
-- alias a heap-allocated byte array, so plain memcpy remains safe.
doCopyMutableByteArrayToAddrOp = doCopyByteArrayToAddrOp

-- | Takes a source 'Addr#', a destination 'MutableByteArray#', an offset into
-- the destination array, and the number of bytes to copy.  Copies the given
-- number of bytes from the source memory region to the destination array.
doCopyAddrToByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
    -- Use memcpy (we are allowed to assume the arrays aren't overlapping)
    dflags <- getDynFlags
    dst_p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags dst (arrWordsHdrSize dflags)) dst_off
    emitMemcpyCall dst_p src_p bytes (mkAlignment 1)


-- ----------------------------------------------------------------------------
-- Setting byte arrays

-- | Takes a 'MutableByteArray#', an offset into the array, a length,
-- and a byte, and sets each of the selected bytes in the array to the
-- character.
doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                 -> FCode ()
doSetByteArrayOp ba off len c = do
    dflags <- getDynFlags

    let byteArrayAlignment = wordAlignment dflags -- known since BA is allocated on heap
        offsetAlignment = cmmExprAlignment off
        align = min byteArrayAlignment offsetAlignment

    p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
    emitMemsetCall p c len align

-- ----------------------------------------------------------------------------
-- Allocating arrays

-- | Allocate a new array.
doNewArrayOp :: CmmFormal             -- ^ return register
             -> SMRep                 -- ^ representation of the array
             -> CLabel                -- ^ info pointer
             -> [(CmmExpr, ByteOff)]  -- ^ header payload
             -> WordOff               -- ^ array size
             -> CmmExpr               -- ^ initial element
             -> FCode ()
doNewArrayOp res_r rep info payload n init = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info

    tickyAllocPrim (mkIntExpr dflags (hdrSize dflags rep))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    base <- allocHeapClosure rep info_ptr cccsExpr payload

    -- Keep the closure address in a temporary so the element stores
    -- below all index off a stable register.
    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    -- Initialise all elements of the array
    let mkOff off = cmmOffsetW dflags (CmmReg arr) (hdrSizeW dflags rep + off)
        initialization = [ mkStore (mkOff off) init | off <- [0.. n - 1] ]
    emit (catAGraphs initialization)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- ----------------------------------------------------------------------------
-- Copying pointer arrays

-- EZY: This code has an unusually high amount of assignTemp calls, seen
-- nowhere else in the code generator. This is mostly because these
-- "primitive" ops result in a surprisingly large amount of code.  It
-- will likely be worthwhile to optimize what is emitted here, so that
-- our optimization passes don't waste time repeatedly optimizing the
-- same bits of code.

-- More closely imitates 'assignTemp' from the old code generator, which
-- returns a CmmExpr rather than a LocalReg.
assignTempE :: CmmExpr -> FCode CmmExpr
assignTempE e = do
    t <- assignTemp e
    return (CmmReg (CmmLocal t))

-- | Takes a source 'Array#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy.  Copies the given number of
-- elements from the source array to the destination array.
doCopyArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
              -> FCode ()
doCopyArrayOp = emitCopyArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes =
        do dflags <- getDynFlags
           emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
               (wordAlignment dflags)


-- | Takes a source 'MutableArray#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy.  Copies the given number of
-- elements from the source array to the destination array.
doCopyMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                     -> FCode ()
doCopyMutableArrayOp = emitCopyArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes = do
        dflags <- getDynFlags
        -- Runtime aliasing test: same array => memmove, else memcpy.
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
            (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the (large) pointer-array copy primops: dirties
-- the destination header, computes payload pointers, runs the supplied
-- copy strategy, then marks the affected card-table entries.
emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
                  -> FCode ())  -- ^ copy function
              -> CmmExpr        -- ^ source array
              -> CmmExpr        -- ^ offset in source array
              -> CmmExpr        -- ^ destination array
              -> CmmExpr        -- ^ offset in destination array
              -> WordOff        -- ^ number of elements to copy
              -> FCode ()
emitCopyArray copy src0 src_off dst0 dst_off0 n =
    when (n /= 0) $ do
        dflags <- getDynFlags

        -- Passed as arguments (be careful)
        src     <- assignTempE src0
        dst     <- assignTempE dst0
        dst_off <- assignTempE dst_off0

        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))

        dst_elems_p <- assignTempE $ cmmOffsetB dflags dst
                       (arrPtrsHdrSize dflags)
        dst_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p dst_off
        src_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags src (arrPtrsHdrSize dflags)) src_off
        let bytes = wordsToBytes dflags n

        copy src dst dst_p src_p bytes

        -- The base address of the destination card table
        dst_cards_p <- assignTempE $ cmmOffsetExprW dflags dst_elems_p
                       (loadArrPtrsSize dflags dst)

        emitSetCards dst_off dst_cards_p n

-- | Small-array analogue of 'doCopyArrayOp' (no card table involved).
doCopySmallArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                   -> FCode ()
doCopySmallArrayOp = emitCopySmallArray copy
  where
    -- Copy data (we assume the arrays aren't overlapping since
    -- they're of different types)
    copy _src _dst dst_p src_p bytes =
        do dflags <- getDynFlags
           emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
               (wordAlignment dflags)


-- | Small-array analogue of 'doCopyMutableArrayOp'.
doCopySmallMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> WordOff
                          -> FCode ()
doCopySmallMutableArrayOp = emitCopySmallArray copy
  where
    -- The only time the memory might overlap is when the two arrays
    -- we were provided are the same array!
    -- TODO: Optimize branch for common case of no aliasing.
    copy src dst dst_p src_p bytes = do
        dflags <- getDynFlags
        (moveCall, cpyCall) <- forkAltPair
            (getCode $ emitMemmoveCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
            (getCode $ emitMemcpyCall dst_p src_p (mkIntExpr dflags bytes)
             (wordAlignment dflags))
        emit =<< mkCmmIfThenElse (cmmEqWord dflags src dst) moveCall cpyCall

-- | Shared skeleton for the small pointer-array copy primops; like
-- 'emitCopyArray' but for SmallArray#, which has no card table to update.
emitCopySmallArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> ByteOff
                       -> FCode ())  -- ^ copy function
                   -> CmmExpr        -- ^ source array
                   -> CmmExpr        -- ^ offset in source array
                   -> CmmExpr        -- ^ destination array
                   -> CmmExpr        -- ^ offset in destination array
                   -> WordOff        -- ^ number of elements to copy
                   -> FCode ()
emitCopySmallArray copy src0 src_off dst0 dst_off n =
    when (n /= 0) $ do
        dflags <- getDynFlags

        -- Passed as arguments (be careful)
        src     <- assignTempE src0
        dst     <- assignTempE dst0

        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))

        dst_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags dst (smallArrPtrsHdrSize dflags)) dst_off
        src_p <- assignTempE $ cmmOffsetExprW dflags
                 (cmmOffsetB dflags src (smallArrPtrsHdrSize dflags)) src_off
        let bytes = wordsToBytes dflags n

        copy src dst dst_p src_p bytes

-- | Takes an info table label, a register to return the newly
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy.  Allocates a new array and
-- initializes it from the source array.
emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff
               -> FCode ()
emitCloneArray info_p res_r src src_off n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info_p
        rep = arrPtrsRep dflags n

    tickyAllocPrim (mkIntExpr dflags (arrPtrsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    -- Fill in the StgMutArrPtrs header: element count and total
    -- (elements + card table) size in words.
    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgMutArrPtrs_ptrs dflags)
                     , (mkIntExpr dflags (nonHdrSizeW rep),
                        hdr_size + oFFSET_StgMutArrPtrs_size dflags)
                     ]

    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr)
             (arrPtrsHdrSize dflags)
    src_p <- assignTempE $ cmmOffsetExprW dflags src
             (cmmAddWord dflags
              (mkIntExpr dflags (arrPtrsHdrSizeW dflags)) src_off)

    emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
        (wordAlignment dflags)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- | Takes an info table label, a register to return the newly
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy.  Allocates a new array and
-- initializes it from the source array.
emitCloneSmallArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> WordOff
                    -> FCode ()
emitCloneSmallArray info_p res_r src src_off n = do
    dflags <- getDynFlags

    let info_ptr = mkLblExpr info_p
        rep = smallArrPtrsRep n

    tickyAllocPrim (mkIntExpr dflags (smallArrPtrsHdrSize dflags))
        (mkIntExpr dflags (nonHdrSize dflags rep))
        (zeroExpr dflags)

    let hdr_size = fixedHdrSize dflags

    -- SmallArray# header only records the element count (no card table).
    base <- allocHeapClosure rep info_ptr cccsExpr
                     [ (mkIntExpr dflags n,
                        hdr_size + oFFSET_StgSmallMutArrPtrs_ptrs dflags)
                     ]

    arr <- CmmLocal `fmap` newTemp (bWord dflags)
    emit $ mkAssign arr base

    dst_p <- assignTempE $ cmmOffsetB dflags (CmmReg arr)
             (smallArrPtrsHdrSize dflags)
    src_p <- assignTempE $ cmmOffsetExprW dflags src
             (cmmAddWord dflags
              (mkIntExpr dflags (smallArrPtrsHdrSizeW dflags)) src_off)

    emitMemcpyCall dst_p src_p (mkIntExpr dflags (wordsToBytes dflags n))
        (wordAlignment dflags)

    emit $ mkAssign (CmmLocal res_r) (CmmReg arr)

-- | Takes an offset in the destination array, the base address of
-- the card table, and the number of elements affected (*not* the
-- number of cards).  The number of elements may not be zero.
-- Marks the relevant cards as dirty.
emitSetCards :: CmmExpr -> CmmExpr -> WordOff -> FCode ()
emitSetCards dst_start dst_cards_start n = do
    dflags <- getDynFlags
    start_card <- assignTempE $ cardCmm dflags dst_start
    -- Card index of the *last* affected element (hence the -1).
    let end_card = cardCmm dflags
                   (cmmSubWord dflags
                    (cmmAddWord dflags dst_start (mkIntExpr dflags n))
                    (mkIntExpr dflags 1))
    -- memset the inclusive card range [start_card .. end_card] to 1.
    emitMemsetCall (cmmAddWord dflags dst_cards_start start_card)
        (mkIntExpr dflags 1)
        (cmmAddWord dflags (cmmSubWord dflags end_card start_card) (mkIntExpr dflags 1))
        (mkAlignment 1) -- no alignment (1 byte)

-- Convert an element index to a card index
cardCmm :: DynFlags -> CmmExpr -> CmmExpr
cardCmm dflags i =
    cmmUShrWord dflags i (mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags))

------------------------------------------------------------------------------
-- SmallArray PrimOp implementations

-- | Read an element of a 'SmallMutableArray#' into @res@.
doReadSmallPtrArrayOp :: LocalReg
                      -> CmmExpr
                      -> CmmExpr
                      -> FCode ()
doReadSmallPtrArrayOp res addr idx = do
    dflags <- getDynFlags
    mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing (gcWord dflags) res addr
        (gcWord dflags) idx

-- | Write an element of a 'SmallMutableArray#' and dirty the closure.
doWriteSmallPtrArrayOp :: CmmExpr
                       -> CmmExpr
                       -> CmmExpr
                       -> FCode ()
doWriteSmallPtrArrayOp addr idx val = do
    dflags <- getDynFlags
    let ty = cmmExprType dflags val
    -- Write barrier must precede the store so other capabilities never
    -- observe the new pointer before the writes it depends on.
    emitPrimCall [] MO_WriteBarrier [] -- #12469
    mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val
    emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))

------------------------------------------------------------------------------
-- Atomic read-modify-write

-- | Emit an atomic modification to a byte array element. The result
-- reg contains that previous value of the element. Implies a full
-- memory barrier.
doAtomicRMW :: LocalReg      -- ^ Result reg
            -> AtomicMachOp  -- ^ Atomic op (e.g. add)
            -> CmmExpr       -- ^ MutableByteArray#
            -> CmmExpr       -- ^ Index
            -> CmmType       -- ^ Type of element by which we are indexing
            -> CmmExpr       -- ^ Op argument (e.g. amount to add)
            -> FCode ()
doAtomicRMW res amop mba idx idx_ty n = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ res ]
        (MO_AtomicRMW width amop)
        [ addr, n ]

-- | Emit an atomic read to a byte array that acts as a memory barrier.
doAtomicReadByteArray
    :: LocalReg  -- ^ Result reg
    -> CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> FCode ()
doAtomicReadByteArray res mba idx idx_ty = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ res ]
        (MO_AtomicRead width)
        [ addr ]

-- | Emit an atomic write to a byte array that acts as a memory barrier.
doAtomicWriteByteArray
    :: CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> CmmExpr   -- ^ Value to write
    -> FCode ()
doAtomicWriteByteArray mba idx idx_ty val = do
    dflags <- getDynFlags
    let width = typeWidth idx_ty
        addr  = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
                width mba idx
    emitPrimCall
        [ {- no results -} ]
        (MO_AtomicWrite width)
        [ addr, val ]

-- | Emit a compare-and-swap on a byte array element; @res@ receives the
-- value observed at the address before the operation.
doCasByteArray
    :: LocalReg  -- ^ Result reg
    -> CmmExpr   -- ^ MutableByteArray#
    -> CmmExpr   -- ^ Index
    -> CmmType   -- ^ Type of element by which we are indexing
    -> CmmExpr   -- ^ Old value
    -> CmmExpr   -- ^ New value
    -> FCode ()
doCasByteArray res mba idx idx_ty old new = do
    dflags <- getDynFlags
    let width = (typeWidth idx_ty)
        addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
               width mba idx
    emitPrimCall
        [ res ]
        (MO_Cmpxchg width)
        [ addr, old, new ]

------------------------------------------------------------------------------
-- Helpers for emitting function calls

-- | Emit a call to @memcpy@.
emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemcpyCall dst src n align = do
    emitPrimCall
        [ {-no results-} ]
        (MO_Memcpy (alignmentBytes align))
        [ dst, src, n ]

-- | Emit a call to @memmove@.
emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemmoveCall dst src n align = do
    emitPrimCall
        [ {- no results -} ]
        (MO_Memmove (alignmentBytes align))
        [ dst, src, n ]

-- | Emit a call to @memset@.  The second argument must fit inside an
-- unsigned char.
emitMemsetCall :: CmmExpr -> CmmExpr -> CmmExpr -> Alignment -> FCode ()
emitMemsetCall dst c n align = do
    emitPrimCall
        [ {- no results -} ]
        (MO_Memset (alignmentBytes align))
        [ dst, c, n ]

-- | Emit a call to @memcmp@, widening the 32-bit C result to a word
-- in @res@ when necessary.
emitMemcmpCall :: LocalReg -> CmmExpr -> CmmExpr -> CmmExpr -> Int -> FCode ()
emitMemcmpCall res ptr1 ptr2 n align = do
    -- 'MO_Memcmp' is assumed to return an 32bit 'CInt' because all
    -- code-gens currently call out to the @memcmp(3)@ C function.
    -- This was easier than moving the sign-extensions into
    -- all the code-gens.
    dflags <- getDynFlags
    let is32Bit = typeWidth (localRegType res) == W32

    cres <- if is32Bit
              then return res
              else newTemp b32

    emitPrimCall
        [ cres ]
        (MO_Memcmp align)
        [ ptr1, ptr2, n ]

    unless is32Bit $ do
      emit $ mkAssign (CmmLocal res)
                      (CmmMachOp
                         (mo_s_32ToWord dflags)
                         [(CmmReg (CmmLocal cres))])

-- | Emit a byte-swap primcall.
emitBSwapCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitBSwapCall res x width = do
    emitPrimCall
        [ res ]
        (MO_BSwap width)
        [ x ]

-- | Emit a bit-reverse primcall.
emitBRevCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitBRevCall res x width = do
    emitPrimCall
        [ res ]
        (MO_BRev width)
        [ x ]

-- | Emit a population-count primcall.
emitPopCntCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitPopCntCall res x width = do
    emitPrimCall
        [ res ]
        (MO_PopCnt width)
        [ x ]

-- | Emit a parallel-bit-deposit primcall.
emitPdepCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPdepCall res x y width = do
    emitPrimCall
        [ res ]
        (MO_Pdep width)
        [ x, y ]

-- | Emit a parallel-bit-extract primcall.
emitPextCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPextCall res x y width = do
    emitPrimCall
        [ res ]
        (MO_Pext width)
        [ x, y ]

-- | Emit a count-leading-zeros primcall.
emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitClzCall res x width = do
    emitPrimCall
        [ res ]
        (MO_Clz width)
        [ x ]

-- | Emit a count-trailing-zeros primcall.
emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitCtzCall res x width = do
    emitPrimCall
        [ res ]
        (MO_Ctz width)
        [ x ]
diff --git a/compiler/GHC/StgToCmm/Prof.hs b/compiler/GHC/StgToCmm/Prof.hs
new file mode 100644
index 0000000000..ce8ef61f17
--- /dev/null
+++ b/compiler/GHC/StgToCmm/Prof.hs
@@ -0,0 +1,360 @@
-----------------------------------------------------------------------------
--
-- Code generation for profiling
--
-- (c) The University of Glasgow 2004-2006
--
-----------------------------------------------------------------------------

module GHC.StgToCmm.Prof (
        initCostCentres, ccType, ccsType,
        mkCCostCentre, mkCCostCentreStack,

        -- Cost-centre Profiling
        dynProfHdr, profDynAlloc, profAlloc, staticProfHdr, initUpdFrameProf,
        enterCostCentreThunk, enterCostCentreFun,
        costCentreFrom,
        storeCurCCS,
        emitSetCCC,

        saveCurrentCostCentre, restoreCurrentCostCentre,

        -- Lag/drag/void stuff
        ldvEnter, ldvEnterClosure, ldvRecordCreate
  ) where

import GhcPrelude

import GHC.StgToCmm.Closure
import GHC.StgToCmm.Utils
import GHC.StgToCmm.Monad
import SMRep

import MkGraph
import Cmm
import CmmUtils
import CLabel

import CostCentre
import DynFlags
import FastString
import Module
import Outputable

import Control.Monad
import Data.Char (ord)

-----------------------------------------------------------------------------
--
-- Cost-centre-stack Profiling
--
-----------------------------------------------------------------------------

-- Expression representing the current cost centre stack
ccsType :: DynFlags -> CmmType -- Type of a cost-centre stack
ccsType = bWord

ccType :: DynFlags -> CmmType -- Type of a cost centre
ccType = bWord

storeCurCCS :: CmmExpr -> CmmAGraph
storeCurCCS e = mkAssign cccsReg e

mkCCostCentre :: CostCentre -> CmmLit
mkCCostCentre cc = CmmLabel (mkCCLabel cc)

mkCCostCentreStack :: CostCentreStack -> CmmLit
mkCCostCentreStack ccs = CmmLabel (mkCCSLabel ccs)

costCentreFrom :: DynFlags
               -> CmmExpr -- A closure pointer
               -> CmmExpr -- The cost centre from that closure
costCentreFrom dflags cl = CmmLoad (cmmOffsetB dflags cl (oFFSET_StgHeader_ccs dflags)) (ccsType dflags)

-- | The profiling header words in a static closure
staticProfHdr :: DynFlags -> CostCentreStack -> [CmmLit]
staticProfHdr dflags ccs
  = ifProfilingL dflags [mkCCostCentreStack ccs, staticLdvInit dflags]

-- | Profiling header words in a dynamic closure
dynProfHdr :: DynFlags -> CmmExpr -> [CmmExpr]
dynProfHdr dflags ccs = ifProfilingL dflags [ccs, dynLdvInit dflags]

-- | Initialise the profiling field of an update frame
initUpdFrameProf :: CmmExpr -> FCode ()
initUpdFrameProf frame
  = ifProfiling $        -- frame->header.prof.ccs = CCCS
    do dflags <- getDynFlags
       emitStore (cmmOffset dflags frame (oFFSET_StgHeader_ccs dflags)) cccsExpr
        -- frame->header.prof.hp.rs = NULL (or frame-header.prof.hp.ldvw = 0)
        -- is unnecessary because it is not used anyhow.

---------------------------------------------------------------------------
-- Saving and restoring the current cost centre
---------------------------------------------------------------------------

{-      Note [Saving the current cost centre]
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The current cost centre is like a global register.  Like other
global registers, it's a caller-saves one.  But consider
   case (f x) of (p,q) -> rhs
Since 'f' may set the cost centre, we must restore it
before resuming rhs.  So we want code like this:
        local_cc = CCC  -- save
        r = f( x )
        CCC = local_cc  -- restore
That is, we explicitly "save" the current cost centre in
a LocalReg, local_cc; and restore it after the call. The
C-- infrastructure will arrange to save local_cc across the
call.

The same goes for join points;
   let j x = join-stuff
   in blah-blah
We want this kind of code:
      local_cc = CCC  -- save
      blah-blah
   J:
      CCC = local_cc  -- restore
-}

saveCurrentCostCentre :: FCode (Maybe LocalReg)
        -- Returns Nothing if profiling is off
saveCurrentCostCentre
  = do dflags <- getDynFlags
       if not (gopt Opt_SccProfilingOn dflags)
           then return Nothing
           else do local_cc <- newTemp (ccType dflags)
                   emitAssign (CmmLocal local_cc) cccsExpr
                   return (Just local_cc)

restoreCurrentCostCentre :: Maybe LocalReg -> FCode ()
restoreCurrentCostCentre Nothing
  = return ()
restoreCurrentCostCentre (Just local_cc)
  = emit (storeCurCCS (CmmReg (CmmLocal local_cc)))


-------------------------------------------------------------------------------
-- Recording allocation in a cost centre
-------------------------------------------------------------------------------

-- | Record the allocation of a closure.  The CmmExpr is the cost
-- centre stack to which to attribute the allocation.
profDynAlloc :: SMRep -> CmmExpr -> FCode ()
profDynAlloc rep ccs
  = ifProfiling $
    do dflags <- getDynFlags
       profAlloc (mkIntExpr dflags (heapClosureSizeW dflags rep)) ccs

-- | Record the allocation of a closure (size is given by a CmmExpr)
-- The size must be in words, because the allocation counter in a CCS counts
-- in words.
profAlloc :: CmmExpr -> CmmExpr -> FCode ()
profAlloc words ccs
  = ifProfiling $
        do dflags <- getDynFlags
           let alloc_rep = rEP_CostCentreStack_mem_alloc dflags
           emit (addToMemE alloc_rep
                       (cmmOffsetB dflags ccs (oFFSET_CostCentreStack_mem_alloc dflags))
                       (CmmMachOp (MO_UU_Conv (wordWidth dflags) (typeWidth alloc_rep)) $
                         [CmmMachOp (mo_wordSub dflags) [words,
                                                         mkIntExpr dflags (profHdrSize dflags)]]))
                       -- subtract the "profiling overhead", which is the
                       -- profiling header in a closure.

-- -----------------------------------------------------------------------
-- Setting the current cost centre on entry to a closure

enterCostCentreThunk :: CmmExpr -> FCode ()
enterCostCentreThunk closure =
  ifProfiling $ do
      dflags <- getDynFlags
      emit $ storeCurCCS (costCentreFrom dflags closure)

enterCostCentreFun :: CostCentreStack -> CmmExpr -> FCode ()
enterCostCentreFun ccs closure =
  ifProfiling $ do
    if isCurrentCCS ccs
       then do dflags <- getDynFlags
               emitRtsCall rtsUnitId (fsLit "enterFunCCS")
                   [(baseExpr, AddrHint),
                    (costCentreFrom dflags closure, AddrHint)] False
       else return () -- top-level function, nothing to do

-- | Run the given code only when cost-centre profiling is enabled.
ifProfiling :: FCode () -> FCode ()
ifProfiling code
  = do dflags <- getDynFlags
       if gopt Opt_SccProfilingOn dflags
           then code
           else return ()

-- | Keep the list only when cost-centre profiling is enabled.
ifProfilingL :: DynFlags -> [a] -> [a]
ifProfilingL dflags xs
  | gopt Opt_SccProfilingOn dflags = xs
  | otherwise                      = []


---------------------------------------------------------------
--      Initialising Cost Centres & CCSs
---------------------------------------------------------------

initCostCentres :: CollectedCCs -> FCode ()
-- Emit the declarations
initCostCentres (local_CCs, singleton_CCSs)
  = do dflags <- getDynFlags
       when (gopt Opt_SccProfilingOn dflags) $
           do mapM_ emitCostCentreDecl local_CCs
              mapM_ emitCostCentreStackDecl singleton_CCSs


-- | Emit the static data for one CostCentre (mirrors struct CostCentre
-- in the RTS; fields are padded out with zeroes and linked at startup).
emitCostCentreDecl :: CostCentre -> FCode ()
emitCostCentreDecl cc = do
  { dflags <- getDynFlags
  ; let is_caf | isCafCC cc = mkIntCLit dflags (ord 'c') -- 'c' == is a CAF
               | otherwise  = zero dflags
                        -- NB. bytesFS: we want the UTF-8 bytes here (#5559)
  ; label <- newByteStringCLit (bytesFS $ costCentreUserNameFS cc)
  ; modl  <- newByteStringCLit (bytesFS $ Module.moduleNameFS
                                        $ Module.moduleName
                                        $ cc_mod cc)
  ; loc <- newByteStringCLit $ bytesFS $ mkFastString $
                   showPpr dflags (costCentreSrcSpan cc)
           -- XXX going via FastString to get UTF-8 encoding is silly
  ; let
     lits = [ zero dflags,  -- StgInt ccID,
              label,        -- char *label,
              modl,         -- char *module,
              loc,          -- char *srcloc,
              zero64,       -- StgWord64 mem_alloc
              zero dflags,  -- StgWord time_ticks
              is_caf,       -- StgInt is_caf
              zero dflags   -- struct _CostCentre *link
            ]
  ; emitDataLits (mkCCLabel cc) lits
  }

-- | Emit the static data for a singleton CostCentreStack.
emitCostCentreStackDecl :: CostCentreStack -> FCode ()
emitCostCentreStackDecl ccs
  = case maybeSingletonCCS ccs of
    Just cc ->
        do dflags <- getDynFlags
           let mk_lits cc = zero dflags :
                            mkCCostCentre cc :
                            replicate (sizeof_ccs_words dflags - 2) (zero dflags)
                -- Note: to avoid making any assumptions about how the
                -- C compiler (that compiles the RTS, in particular) does
                -- layouts of structs containing long-longs, simply
                -- pad out the struct with zero words until we hit the
                -- size of the overall struct (which we get via DerivedConstants.h)
           emitDataLits (mkCCSLabel ccs) (mk_lits cc)
    Nothing -> pprPanic "emitCostCentreStackDecl" (ppr ccs)

zero :: DynFlags -> CmmLit
zero dflags = mkIntCLit dflags 0
zero64 :: CmmLit
zero64 = CmmInt 0 W64

sizeof_ccs_words :: DynFlags -> Int
sizeof_ccs_words dflags
    -- round up to the next word.
  | ms == 0   = ws
  | otherwise = ws + 1
  where
   (ws,ms) = sIZEOF_CostCentreStack dflags `divMod` wORD_SIZE dflags

-- ---------------------------------------------------------------------------
-- Set the current cost centre stack

emitSetCCC :: CostCentre -> Bool -> Bool -> FCode ()
emitSetCCC cc tick push
 = do dflags <- getDynFlags
      if not (gopt Opt_SccProfilingOn dflags)
          then return ()
          else do tmp <- newTemp (ccsType dflags)
                  pushCostCentre tmp cccsExpr cc
                  when tick $ emit (bumpSccCount dflags (CmmReg (CmmLocal tmp)))
                  when push $ emit (storeCurCCS (CmmReg (CmmLocal tmp)))

-- | Call the RTS @pushCostCentre@ to push @cc@ onto @ccs@, result in @result@.
pushCostCentre :: LocalReg -> CmmExpr -> CostCentre -> FCode ()
pushCostCentre result ccs cc
  = emitRtsCallWithResult result AddrHint
        rtsUnitId
        (fsLit "pushCostCentre") [(ccs,AddrHint),
                                  (CmmLit (mkCCostCentre cc), AddrHint)]
        False

-- | Increment the scc_count field of the given CCS.
bumpSccCount :: DynFlags -> CmmExpr -> CmmAGraph
bumpSccCount dflags ccs
  = addToMem (rEP_CostCentreStack_scc_count dflags)
         (cmmOffsetB dflags ccs (oFFSET_CostCentreStack_scc_count dflags)) 1

-----------------------------------------------------------------------------
--
--              Lag/drag/void stuff
--
-----------------------------------------------------------------------------

--
-- Initial value for the LDV field in a static closure
--
staticLdvInit :: DynFlags -> CmmLit
staticLdvInit = zeroCLit

--
-- Initial value of the LDV field in a dynamic closure
--
dynLdvInit :: DynFlags -> CmmExpr
dynLdvInit dflags =     -- (era << LDV_SHIFT) | LDV_STATE_CREATE
  CmmMachOp (mo_wordOr dflags) [
      CmmMachOp (mo_wordShl dflags) [loadEra dflags, mkIntExpr dflags (lDV_SHIFT dflags)],
      CmmLit (mkWordCLit dflags (iLDV_STATE_CREATE dflags))
  ]

--
-- Initialise the LDV word of a new closure
--
ldvRecordCreate :: CmmExpr -> FCode ()
ldvRecordCreate closure = do
  dflags <- getDynFlags
  emit $ mkStore (ldvWord dflags closure) (dynLdvInit dflags)

--
-- | Called when a closure is entered, marks the closure as having
-- been "used".  The closure is not an "inherently used" one.  The
-- closure is not @IND@ because that is not considered for LDV profiling.
--
ldvEnterClosure :: ClosureInfo -> CmmReg -> FCode ()
ldvEnterClosure closure_info node_reg = do
    dflags <- getDynFlags
    let tag = funTag dflags closure_info
    -- don't forget to subtract node's tag
    ldvEnter (cmmOffsetB dflags (CmmReg node_reg) (-tag))

ldvEnter :: CmmExpr -> FCode ()
-- Argument is a closure pointer
ldvEnter cl_ptr = do
    dflags <- getDynFlags
    let -- don't forget to subtract node's tag
        ldv_wd = ldvWord dflags cl_ptr
        new_ldv_wd = cmmOrWord dflags (cmmAndWord dflags (CmmLoad ldv_wd (bWord dflags))
                                                  (CmmLit (mkWordCLit dflags (iLDV_CREATE_MASK dflags))))
                     (cmmOrWord dflags (loadEra dflags) (CmmLit (mkWordCLit dflags (iLDV_STATE_USE dflags))))
    ifProfiling $
         -- if (era > 0) {
         --    LDVW((c)) = (LDVW((c)) & LDV_CREATE_MASK) |
         --                era | LDV_STATE_USE }
        emit =<< mkCmmIfThenElse (CmmMachOp (mo_wordUGt dflags) [loadEra dflags, CmmLit (zeroCLit dflags)])
                     (mkStore ldv_wd new_ldv_wd)
                     mkNop

loadEra :: DynFlags -> CmmExpr
loadEra dflags = CmmMachOp (MO_UU_Conv (cIntWidth dflags) (wordWidth dflags))
    [CmmLoad (mkLblExpr (mkCmmDataLabel rtsUnitId (fsLit "era")))
             (cInt dflags)]

ldvWord :: DynFlags -> CmmExpr -> CmmExpr
-- Takes the address of a closure, and returns
-- the address of the LDV word in the closure
ldvWord dflags closure_ptr
    = cmmOffsetB dflags closure_ptr (oFFSET_StgHeader_ldvw dflags)
diff --git a/compiler/GHC/StgToCmm/Ticky.hs b/compiler/GHC/StgToCmm/Ticky.hs
new file mode 100644
index 0000000000..06ef520c0d
--- /dev/null
+++ b/compiler/GHC/StgToCmm/Ticky.hs
@@ -0,0 +1,682 @@
{-# LANGUAGE BangPatterns #-}

-----------------------------------------------------------------------------
--
-- Code generation for ticky-ticky profiling
--
-- (c) The University of
Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +{- OVERVIEW: ticky ticky profiling + +Please see +https://gitlab.haskell.org/ghc/ghc/wikis/debugging/ticky-ticky and also +edit it and the rest of this comment to keep them up-to-date if you +change ticky-ticky. Thanks! + + *** All allocation ticky numbers are in bytes. *** + +Some of the relevant source files: + + ***not necessarily an exhaustive list*** + + * some codeGen/ modules import this one + + * this module imports cmm/CLabel.hs to manage labels + + * cmm/CmmParse.y expands some macros using generators defined in + this module + + * includes/stg/Ticky.h declares all of the global counters + + * includes/rts/Ticky.h declares the C data type for an + STG-declaration's counters + + * some macros defined in includes/Cmm.h (and used within the RTS's + CMM code) update the global ticky counters + + * at the end of execution rts/Ticky.c generates the final report + +RTS -r<report-file> -RTS + +The rts/Ticky.c function that generates the report includes an +STG-declaration's ticky counters if + + * that declaration was entered, or + + * it was allocated (if -ticky-allocd) + +On either of those events, the counter is "registered" by adding it to +a linked list; cf the CMM generated by registerTickyCtr. + +Ticky-ticky profiling has evolved over many years. Many of the +counters from its most sophisticated days are no longer +active/accurate. As the RTS has changed, sometimes the ticky code for +relevant counters was not accordingly updated. Unfortunately, neither +were the comments. + +As of March 2013, there still exist deprecated code and comments in +the code generator as well as the RTS because: + + * I don't know what is out-of-date versus merely commented out for + momentary convenience, and + + * someone else might know how to repair it! 
+ +-} + +module GHC.StgToCmm.Ticky ( + withNewTickyCounterFun, + withNewTickyCounterLNE, + withNewTickyCounterThunk, + withNewTickyCounterStdThunk, + withNewTickyCounterCon, + + tickyDynAlloc, + tickyAllocHeap, + + tickyAllocPrim, + tickyAllocThunk, + tickyAllocPAP, + tickyHeapCheck, + tickyStackCheck, + + tickyUnknownCall, tickyDirectCall, + + tickyPushUpdateFrame, + tickyUpdateFrameOmitted, + + tickyEnterDynCon, + tickyEnterStaticCon, + tickyEnterViaNode, + + tickyEnterFun, + tickyEnterThunk, tickyEnterStdThunk, -- dynamic non-value + -- thunks only + tickyEnterLNE, + + tickyUpdateBhCaf, + tickyBlackHole, + tickyUnboxedTupleReturn, + tickyReturnOldCon, tickyReturnNewCon, + + tickyKnownCallTooFewArgs, tickyKnownCallExact, tickyKnownCallExtraArgs, + tickySlowCall, tickySlowCallPat, + ) where + +import GhcPrelude + +import GHC.StgToCmm.ArgRep ( slowCallPattern , toArgRep , argRepString ) +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Monad + +import StgSyn +import CmmExpr +import MkGraph +import CmmUtils +import CLabel +import SMRep + +import Module +import Name +import Id +import BasicTypes +import FastString +import Outputable +import Util + +import DynFlags + +-- Turgid imports for showTypeCategory +import PrelNames +import TcType +import Type +import TyCon + +import Data.Maybe +import qualified Data.Char +import Control.Monad ( when ) + +----------------------------------------------------------------------------- +-- +-- Ticky-ticky profiling +-- +----------------------------------------------------------------------------- + +data TickyClosureType + = TickyFun + Bool -- True <-> single entry + | TickyCon + | TickyThunk + Bool -- True <-> updateable + Bool -- True <-> standard thunk (AP or selector), has no entry counter + | TickyLNE + +withNewTickyCounterFun :: Bool -> Name -> [NonVoid Id] -> FCode a -> FCode a +withNewTickyCounterFun single_entry = withNewTickyCounter (TickyFun single_entry) + +withNewTickyCounterLNE :: Name -> 
[NonVoid Id] -> FCode a -> FCode a +withNewTickyCounterLNE nm args code = do + b <- tickyLNEIsOn + if not b then code else withNewTickyCounter TickyLNE nm args code + +thunkHasCounter :: Bool -> FCode Bool +thunkHasCounter isStatic = do + b <- tickyDynThunkIsOn + pure (not isStatic && b) + +withNewTickyCounterThunk + :: Bool -- ^ static + -> Bool -- ^ updateable + -> Name + -> FCode a + -> FCode a +withNewTickyCounterThunk isStatic isUpdatable name code = do + has_ctr <- thunkHasCounter isStatic + if not has_ctr + then code + else withNewTickyCounter (TickyThunk isUpdatable False) name [] code + +withNewTickyCounterStdThunk + :: Bool -- ^ updateable + -> Name + -> FCode a + -> FCode a +withNewTickyCounterStdThunk isUpdatable name code = do + has_ctr <- thunkHasCounter False + if not has_ctr + then code + else withNewTickyCounter (TickyThunk isUpdatable True) name [] code + +withNewTickyCounterCon + :: Name + -> FCode a + -> FCode a +withNewTickyCounterCon name code = do + has_ctr <- thunkHasCounter False + if not has_ctr + then code + else withNewTickyCounter TickyCon name [] code + +-- args does not include the void arguments +withNewTickyCounter :: TickyClosureType -> Name -> [NonVoid Id] -> FCode a -> FCode a +withNewTickyCounter cloType name args m = do + lbl <- emitTickyCounter cloType name args + setTickyCtrLabel lbl m + +emitTickyCounter :: TickyClosureType -> Name -> [NonVoid Id] -> FCode CLabel +emitTickyCounter cloType name args + = let ctr_lbl = mkRednCountsLabel name in + (>> return ctr_lbl) $ + ifTicky $ do + { dflags <- getDynFlags + ; parent <- getTickyCtrLabel + ; mod_name <- getModuleName + + -- When printing the name of a thing in a ticky file, we + -- want to give the module name even for *local* things. We + -- print just "x (M)" rather that "M.x" to distinguish them + -- from the global kind. 
+ ; let ppr_for_ticky_name :: SDoc + ppr_for_ticky_name = + let n = ppr name + ext = case cloType of + TickyFun single_entry -> parens $ hcat $ punctuate comma $ + [text "fun"] ++ [text "se"|single_entry] + TickyCon -> parens (text "con") + TickyThunk upd std -> parens $ hcat $ punctuate comma $ + [text "thk"] ++ [text "se"|not upd] ++ [text "std"|std] + TickyLNE | isInternalName name -> parens (text "LNE") + | otherwise -> panic "emitTickyCounter: how is this an external LNE?" + p = case hasHaskellName parent of + -- NB the default "top" ticky ctr does not + -- have a Haskell name + Just pname -> text "in" <+> ppr (nameUnique pname) + _ -> empty + in if isInternalName name + then n <+> parens (ppr mod_name) <+> ext <+> p + else n <+> ext <+> p + + ; fun_descr_lit <- newStringCLit $ showSDocDebug dflags ppr_for_ticky_name + ; arg_descr_lit <- newStringCLit $ map (showTypeCategory . idType . fromNonVoid) args + ; emitDataLits ctr_lbl + -- Must match layout of includes/rts/Ticky.h's StgEntCounter + -- + -- krc: note that all the fields are I32 now; some were I16 + -- before, but the code generator wasn't handling that + -- properly and it led to chaos, panic and disorder. + [ mkIntCLit dflags 0, -- registered? 
+ mkIntCLit dflags (length args), -- Arity + mkIntCLit dflags 0, -- Heap allocated for this thing + fun_descr_lit, + arg_descr_lit, + zeroCLit dflags, -- Entries into this thing + zeroCLit dflags, -- Heap allocated by this thing + zeroCLit dflags -- Link to next StgEntCounter + ] + } + +-- ----------------------------------------------------------------------------- +-- Ticky stack frames + +tickyPushUpdateFrame, tickyUpdateFrameOmitted :: FCode () +tickyPushUpdateFrame = ifTicky $ bumpTickyCounter (fsLit "UPDF_PUSHED_ctr") +tickyUpdateFrameOmitted = ifTicky $ bumpTickyCounter (fsLit "UPDF_OMITTED_ctr") + +-- ----------------------------------------------------------------------------- +-- Ticky entries + +-- NB the name-specific entries are only available for names that have +-- dedicated Cmm code. As far as I know, this just rules out +-- constructor thunks. For them, there is no CMM code block to put the +-- bump of name-specific ticky counter into. On the other hand, we can +-- still track allocation their allocation. 

tickyEnterDynCon, tickyEnterStaticCon, tickyEnterViaNode :: FCode ()
-- Bump the global entry counters for, respectively: dynamically
-- allocated constructors, static constructors, and entries made
-- indirectly "via node".
tickyEnterDynCon      = ifTicky $ bumpTickyCounter (fsLit "ENT_DYN_CON_ctr")
tickyEnterStaticCon   = ifTicky $ bumpTickyCounter (fsLit "ENT_STATIC_CON_ctr")
tickyEnterViaNode     = ifTicky $ bumpTickyCounter (fsLit "ENT_VIA_NODE_ctr")

-- | Called on entry to a thunk: bumps the appropriate global
-- static/dynamic single/many counter and, when this thunk has its own
-- StgEntCounter (cf thunkHasCounter), registers that counter and bumps
-- its entry count as well.
tickyEnterThunk :: ClosureInfo -> FCode ()
tickyEnterThunk cl_info
  = ifTicky $ do
    { bumpTickyCounter ctr
    ; has_ctr <- thunkHasCounter static
    ; when has_ctr $ do
        ticky_ctr_lbl <- getTickyCtrLabel
        registerTickyCtrAtEntryDyn ticky_ctr_lbl
        bumpTickyEntryCount ticky_ctr_lbl }
  where
    -- NOTE(review): despite its name, this flag comes from
    -- closureSingleEntry, i.e. it is True for *single-entry* closures;
    -- the SINGLE/MANY counter selection below relies on exactly that.
    -- Consider renaming to single_entry.
    updatable = closureSingleEntry cl_info
    static    = isStaticClosure cl_info

    ctr | static    = if updatable then fsLit "ENT_STATIC_THK_SINGLE_ctr"
                                   else fsLit "ENT_STATIC_THK_MANY_ctr"
        | otherwise = if updatable then fsLit "ENT_DYN_THK_SINGLE_ctr"
                                   else fsLit "ENT_DYN_THK_MANY_ctr"

-- | Standard (AP/selector) thunks share the ordinary thunk-entry
-- accounting; they never get a per-closure entry counter because
-- thunkHasCounter is consulted inside tickyEnterThunk.
tickyEnterStdThunk :: ClosureInfo -> FCode ()
tickyEnterStdThunk = tickyEnterThunk

-- | Bump the global black-hole update counter.
tickyBlackHole :: Bool{-updatable-} -> FCode ()
tickyBlackHole updatable
  = ifTicky (bumpTickyCounter ctr)
  where
    -- NOTE(review): the mapping looks inverted — the updatable case
    -- bumps the SINGLE_ENTRY counter and vice versa. Confirm against
    -- the counter definitions in includes/stg/Ticky.h before trusting
    -- these two numbers in a ticky report.
    ctr | updatable = (fsLit "UPD_BH_SINGLE_ENTRY_ctr")
        | otherwise = (fsLit "UPD_BH_UPDATABLE_ctr")

-- | Bump the global counter for CAF black-hole updates.
tickyUpdateBhCaf :: ClosureInfo -> FCode ()
tickyUpdateBhCaf cl_info
  = ifTicky (bumpTickyCounter ctr)
  where
    -- NOTE(review): same apparent inversion as in tickyBlackHole:
    -- update-required closures bump the SINGLE_ENTRY counter. Verify
    -- the intended meaning of these counters.
    ctr | closureUpdReqd cl_info = (fsLit "UPD_CAF_BH_SINGLE_ENTRY_ctr")
        | otherwise              = (fsLit "UPD_CAF_BH_UPDATABLE_ctr")

-- | Called on (direct) entry to a function: bumps the static/dynamic
-- direct-entry counter, registers the function's own StgEntCounter
-- (unconditionally for static closures; for dynamic ones only when
-- -ticky-allocd did not already register it at allocation time), and
-- bumps its entry count.
tickyEnterFun :: ClosureInfo -> FCode ()
tickyEnterFun cl_info = ifTicky $ do
  ctr_lbl <- getTickyCtrLabel

  if isStaticClosure cl_info
    then do bumpTickyCounter (fsLit "ENT_STATIC_FUN_DIRECT_ctr")
            registerTickyCtr ctr_lbl
    else do bumpTickyCounter (fsLit "ENT_DYN_FUN_DIRECT_ctr")
            registerTickyCtrAtEntryDyn ctr_lbl

  bumpTickyEntryCount ctr_lbl

-- | Called on entry to a let-no-escape binding: always bumps the
-- global LNE counter; the per-binding counter is maintained only when
-- -ticky-LNE is on.
tickyEnterLNE :: FCode ()
tickyEnterLNE = ifTicky $ do
  bumpTickyCounter (fsLit "ENT_LNE_ctr")
  ifTickyLNE $ do
    ctr_lbl <- getTickyCtrLabel
    registerTickyCtr ctr_lbl
    bumpTickyEntryCount ctr_lbl

-- needn't register a counter upon entry if
--
-- 1) it's for a dynamic closure, and
--
-- 2) -ticky-allocd is on
--
-- since the counter was registered already upon being alloc'd
registerTickyCtrAtEntryDyn :: CLabel -> FCode ()
registerTickyCtrAtEntryDyn ctr_lbl = do
  already_registered <- tickyAllocdIsOn
  when (not already_registered) $ registerTickyCtr ctr_lbl

registerTickyCtr :: CLabel -> FCode ()
-- Register a ticky counter, i.e. hook it onto the front of the global
-- ticky_entry_ctrs list (idempotent via the registeredp flag):
--   if ( ! f_ct.registeredp ) {
--     f_ct.link = ticky_entry_ctrs; /* hook this one onto the front of the list */
--     ticky_entry_ctrs = & (f_ct);  /* mark it as "registered" */
--     f_ct.registeredp = 1 }
registerTickyCtr ctr_lbl = do
  dflags <- getDynFlags
  let
    -- krc: code generator doesn't handle Not, so we test for Eq 0 instead
    test = CmmMachOp (MO_Eq (wordWidth dflags))
             [CmmLoad (CmmLit (cmmLabelOffB ctr_lbl
                                (oFFSET_StgEntCounter_registeredp dflags))) (bWord dflags),
              zeroExpr dflags]
    register_stmts
      = [ mkStore (CmmLit (cmmLabelOffB ctr_lbl (oFFSET_StgEntCounter_link dflags)))
                  (CmmLoad ticky_entry_ctrs (bWord dflags))
        , mkStore ticky_entry_ctrs (mkLblExpr ctr_lbl)
        , mkStore (CmmLit (cmmLabelOffB ctr_lbl
                            (oFFSET_StgEntCounter_registeredp dflags)))
                  (mkIntExpr dflags 1) ]
    -- Head of the RTS-owned linked list of registered counters.
    ticky_entry_ctrs = mkLblExpr (mkCmmDataLabel rtsUnitId (fsLit "ticky_entry_ctrs"))
  emit =<< mkCmmIfThen test (catAGraphs register_stmts)

-- Bump the counter and the return-arity histogram for a return to an
-- existing (old) or freshly allocated (new) constructor.
tickyReturnOldCon, tickyReturnNewCon :: RepArity -> FCode ()
tickyReturnOldCon arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_OLD_ctr")
                 ; bumpHistogram    (fsLit "RET_OLD_hst") arity }
tickyReturnNewCon arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_NEW_ctr")
                 ; bumpHistogram    (fsLit "RET_NEW_hst") arity }

-- Bump the counter and arity histogram for an unboxed-tuple return.
tickyUnboxedTupleReturn :: RepArity -> FCode ()
tickyUnboxedTupleReturn arity
  = ifTicky $ do { bumpTickyCounter (fsLit "RET_UNBOXED_TUP_ctr")
                 ; bumpHistogram    (fsLit "RET_UNBOXED_TUP_hst") arity }

-- -----------------------------------------------------------------------------
-- Ticky calls

-- Ticks at a *call site*:
-- Known call: exact-arity calls bump KNOWN_CALL_ctr; over-saturated
-- calls additionally record the slow-call pattern of the extra args.
tickyDirectCall :: RepArity -> [StgArg] -> FCode ()
tickyDirectCall arity args
  | args `lengthIs` arity = tickyKnownCallExact
  | otherwise = do tickyKnownCallExtraArgs
                   tickySlowCallPat (map argPrimRep (drop arity args))

tickyKnownCallTooFewArgs :: FCode ()
tickyKnownCallTooFewArgs = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_TOO_FEW_ARGS_ctr")

tickyKnownCallExact :: FCode ()
tickyKnownCallExact = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_ctr")

tickyKnownCallExtraArgs :: FCode ()
tickyKnownCallExtraArgs = ifTicky $ bumpTickyCounter (fsLit "KNOWN_CALL_EXTRA_ARGS_ctr")

tickyUnknownCall :: FCode ()
tickyUnknownCall = ifTicky $ bumpTickyCounter (fsLit "UNKNOWN_CALL_ctr")

-- Tick for the call pattern at a slow call site (i.e. in addition to
-- tickyUnknownCall, tickyKnownCallExtraArgs, etc.)
tickySlowCall :: LambdaFormInfo -> [StgArg] -> FCode ()
tickySlowCall _ [] = return ()
tickySlowCall lf_info args = do
  -- see Note [Ticky for slow calls]
  if isKnownFun lf_info
    then tickyKnownCallTooFewArgs
    else tickyUnknownCall
  tickySlowCallPat (map argPrimRep args)

-- Bump the counter for this call's argument pattern: the dedicated
-- SLOW_CALL_fast_<pattern>_ctr when the whole argument list matches a
-- built-in RTS apply pattern, otherwise VERY_SLOW_CALL_ctr.
tickySlowCallPat :: [PrimRep] -> FCode ()
tickySlowCallPat args = ifTicky $
  let argReps = map toArgRep args
      (_, n_matched) = slowCallPattern argReps
  in if n_matched > 0 && args `lengthIs` n_matched
     then bumpTickyLbl $ mkRtsSlowFastTickyCtrLabel $ concatMap (map Data.Char.toLower . argRepString) argReps
     else bumpTickyCounter $ fsLit "VERY_SLOW_CALL_ctr"

{-

Note [Ticky for slow calls]
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Terminology is unfortunately a bit mixed up for these calls. codeGen
uses "slow call" to refer to unknown calls and under-saturated known
calls.

Nowadays, though (ie as of the eval/apply paper), the significantly
slower calls are actually just a subset of these: the ones with no
built-in argument pattern (cf GHC.StgToCmm.ArgRep.slowCallPattern)

So for ticky profiling, we split slow calls into
"SLOW_CALL_fast_<pattern>_ctr" (those matching a built-in pattern) and
VERY_SLOW_CALL_ctr (those without a built-in pattern; these are very
bad for both space and time).

-}

-- -----------------------------------------------------------------------------
-- Ticky allocation

tickyDynAlloc :: Maybe Id -> SMRep -> LambdaFormInfo -> FCode ()
-- Called when doing a dynamic heap allocation; the LambdaFormInfo
-- used to distinguish between closure types
--
-- TODO what else to count while we're here?
tickyDynAlloc mb_id rep lf = ifTicky $ getDynFlags >>= \dflags ->
  let -- Allocation size in bytes (all ticky allocation numbers are bytes).
      bytes = wORD_SIZE dflags * heapClosureSizeW dflags rep

      -- Bump both the global byte total and the global event counter.
      countGlobal tot ctr = do
        bumpTickyCounterBy tot bytes
        bumpTickyCounter   ctr
      -- Bump the allocatee's own StgEntCounter (only with -ticky-allocd),
      -- registering it at allocation time.
      countSpecific = ifTickyAllocd $ case mb_id of
        Nothing -> return ()
        Just id -> do
          let ctr_lbl = mkRednCountsLabel (idName id)
          registerTickyCtr ctr_lbl
          bumpTickyAllocd ctr_lbl bytes

      -- TODO are we still tracking "good stuff" (_gds) versus
      -- administrative (_adm) versus slop (_slp)? I'm going with all _gds
      -- for now, since I currently know neither whether we do nor how to
      -- distinguish. NSF Mar 2013

  in case () of
    -- NOTE(review): the per-closure bump for constructors is guarded by
    -- ifTickyDynThunk, exactly as for thunks — confirm that gating con
    -- allocation counters on -ticky-dyn-thunk is intended.
    _ | isConRep rep   ->
        ifTickyDynThunk countSpecific >>
        countGlobal (fsLit "ALLOC_CON_gds") (fsLit "ALLOC_CON_ctr")
      | isThunkRep rep ->
        ifTickyDynThunk countSpecific >>
        if lfUpdatable lf
        then countGlobal (fsLit "ALLOC_THK_gds") (fsLit "ALLOC_UP_THK_ctr")
        else countGlobal (fsLit "ALLOC_THK_gds") (fsLit "ALLOC_SE_THK_ctr")
      | isFunRep rep   ->
        countSpecific >>
        countGlobal (fsLit "ALLOC_FUN_gds") (fsLit "ALLOC_FUN_ctr")
      | otherwise -> panic "How is this heap object not a con, thunk, or fun?"
+ + + +tickyAllocHeap :: + Bool -> -- is this a genuine allocation? As opposed to + -- GHC.StgToCmm.Layout.adjustHpBackwards + VirtualHpOffset -> FCode () +-- Called when doing a heap check [TICK_ALLOC_HEAP] +-- Must be lazy in the amount of allocation! +tickyAllocHeap genuine hp + = ifTicky $ + do { dflags <- getDynFlags + ; ticky_ctr <- getTickyCtrLabel + ; emit $ catAGraphs $ + -- only test hp from within the emit so that the monadic + -- computation itself is not strict in hp (cf knot in + -- GHC.StgToCmm.Monad.getHeapUsage) + if hp == 0 then [] + else let !bytes = wORD_SIZE dflags * hp in [ + -- Bump the allocation total in the closure's StgEntCounter + addToMem (rEP_StgEntCounter_allocs dflags) + (CmmLit (cmmLabelOffB ticky_ctr (oFFSET_StgEntCounter_allocs dflags))) + bytes, + -- Bump the global allocation total ALLOC_HEAP_tot + addToMemLbl (bWord dflags) + (mkCmmDataLabel rtsUnitId (fsLit "ALLOC_HEAP_tot")) + bytes, + -- Bump the global allocation counter ALLOC_HEAP_ctr + if not genuine then mkNop + else addToMemLbl (bWord dflags) + (mkCmmDataLabel rtsUnitId (fsLit "ALLOC_HEAP_ctr")) + 1 + ]} + + +-------------------------------------------------------------------------------- +-- these three are only called from CmmParse.y (ie ultimately from the RTS) + +-- the units are bytes + +tickyAllocPrim :: CmmExpr -- ^ size of the full header, in bytes + -> CmmExpr -- ^ size of the payload, in bytes + -> CmmExpr -> FCode () +tickyAllocPrim _hdr _goods _slop = ifTicky $ do + bumpTickyCounter (fsLit "ALLOC_PRIM_ctr") + bumpTickyCounterByE (fsLit "ALLOC_PRIM_adm") _hdr + bumpTickyCounterByE (fsLit "ALLOC_PRIM_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_PRIM_slp") _slop + +tickyAllocThunk :: CmmExpr -> CmmExpr -> FCode () +tickyAllocThunk _goods _slop = ifTicky $ do + -- TODO is it ever called with a Single-Entry thunk? 
+ bumpTickyCounter (fsLit "ALLOC_UP_THK_ctr") + bumpTickyCounterByE (fsLit "ALLOC_THK_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_THK_slp") _slop + +tickyAllocPAP :: CmmExpr -> CmmExpr -> FCode () +tickyAllocPAP _goods _slop = ifTicky $ do + bumpTickyCounter (fsLit "ALLOC_PAP_ctr") + bumpTickyCounterByE (fsLit "ALLOC_PAP_gds") _goods + bumpTickyCounterByE (fsLit "ALLOC_PAP_slp") _slop + +tickyHeapCheck :: FCode () +tickyHeapCheck = ifTicky $ bumpTickyCounter (fsLit "HEAP_CHK_ctr") + +tickyStackCheck :: FCode () +tickyStackCheck = ifTicky $ bumpTickyCounter (fsLit "STK_CHK_ctr") + +-- ----------------------------------------------------------------------------- +-- Ticky utils + +ifTicky :: FCode () -> FCode () +ifTicky code = + getDynFlags >>= \dflags -> when (gopt Opt_Ticky dflags) code + +tickyAllocdIsOn :: FCode Bool +tickyAllocdIsOn = gopt Opt_Ticky_Allocd `fmap` getDynFlags + +tickyLNEIsOn :: FCode Bool +tickyLNEIsOn = gopt Opt_Ticky_LNE `fmap` getDynFlags + +tickyDynThunkIsOn :: FCode Bool +tickyDynThunkIsOn = gopt Opt_Ticky_Dyn_Thunk `fmap` getDynFlags + +ifTickyAllocd :: FCode () -> FCode () +ifTickyAllocd code = tickyAllocdIsOn >>= \b -> when b code + +ifTickyLNE :: FCode () -> FCode () +ifTickyLNE code = tickyLNEIsOn >>= \b -> when b code + +ifTickyDynThunk :: FCode () -> FCode () +ifTickyDynThunk code = tickyDynThunkIsOn >>= \b -> when b code + +bumpTickyCounter :: FastString -> FCode () +bumpTickyCounter lbl = bumpTickyLbl (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyCounterBy :: FastString -> Int -> FCode () +bumpTickyCounterBy lbl = bumpTickyLblBy (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyCounterByE :: FastString -> CmmExpr -> FCode () +bumpTickyCounterByE lbl = bumpTickyLblByE (mkCmmDataLabel rtsUnitId lbl) + +bumpTickyEntryCount :: CLabel -> FCode () +bumpTickyEntryCount lbl = do + dflags <- getDynFlags + bumpTickyLit (cmmLabelOffB lbl (oFFSET_StgEntCounter_entry_count dflags)) + +bumpTickyAllocd :: CLabel -> Int -> FCode () +bumpTickyAllocd lbl 
bytes = do + dflags <- getDynFlags + bumpTickyLitBy (cmmLabelOffB lbl (oFFSET_StgEntCounter_allocd dflags)) bytes + +bumpTickyLbl :: CLabel -> FCode () +bumpTickyLbl lhs = bumpTickyLitBy (cmmLabelOffB lhs 0) 1 + +bumpTickyLblBy :: CLabel -> Int -> FCode () +bumpTickyLblBy lhs = bumpTickyLitBy (cmmLabelOffB lhs 0) + +bumpTickyLblByE :: CLabel -> CmmExpr -> FCode () +bumpTickyLblByE lhs = bumpTickyLitByE (cmmLabelOffB lhs 0) + +bumpTickyLit :: CmmLit -> FCode () +bumpTickyLit lhs = bumpTickyLitBy lhs 1 + +bumpTickyLitBy :: CmmLit -> Int -> FCode () +bumpTickyLitBy lhs n = do + dflags <- getDynFlags + emit (addToMem (bWord dflags) (CmmLit lhs) n) + +bumpTickyLitByE :: CmmLit -> CmmExpr -> FCode () +bumpTickyLitByE lhs e = do + dflags <- getDynFlags + emit (addToMemE (bWord dflags) (CmmLit lhs) e) + +bumpHistogram :: FastString -> Int -> FCode () +bumpHistogram lbl n = do + dflags <- getDynFlags + let offset = n `min` (tICKY_BIN_COUNT dflags - 1) + emit (addToMem (bWord dflags) + (cmmIndexExpr dflags + (wordWidth dflags) + (CmmLit (CmmLabel (mkCmmDataLabel rtsUnitId lbl))) + (CmmLit (CmmInt (fromIntegral offset) (wordWidth dflags)))) + 1) + +------------------------------------------------------------------ +-- Showing the "type category" for ticky-ticky profiling + +showTypeCategory :: Type -> Char + {- + + dictionary + + > function + + {C,I,F,D,W} char, int, float, double, word + {c,i,f,d,w} unboxed ditto + + T tuple + + P other primitive type + p unboxed ditto + + L list + E enumeration type + S other single-constructor type + M other multi-constructor data-con type + + . other type + + - reserved for others to mark as "uninteresting" + + Accurate as of Mar 2013, but I eliminated the Array category instead + of updating it, for simplicity. It's in P/p, I think --NSF + + -} +showTypeCategory ty + | isDictTy ty = '+' + | otherwise = case tcSplitTyConApp_maybe ty of + Nothing -> '.' 
+ Just (tycon, _) -> + (if isUnliftedTyCon tycon then Data.Char.toLower else id) $ + let anyOf us = getUnique tycon `elem` us in + case () of + _ | anyOf [funTyConKey] -> '>' + | anyOf [charPrimTyConKey, charTyConKey] -> 'C' + | anyOf [doublePrimTyConKey, doubleTyConKey] -> 'D' + | anyOf [floatPrimTyConKey, floatTyConKey] -> 'F' + | anyOf [intPrimTyConKey, int32PrimTyConKey, int64PrimTyConKey, + intTyConKey, int8TyConKey, int16TyConKey, int32TyConKey, int64TyConKey + ] -> 'I' + | anyOf [wordPrimTyConKey, word32PrimTyConKey, word64PrimTyConKey, wordTyConKey, + word8TyConKey, word16TyConKey, word32TyConKey, word64TyConKey + ] -> 'W' + | anyOf [listTyConKey] -> 'L' + | isTupleTyCon tycon -> 'T' + | isPrimTyCon tycon -> 'P' + | isEnumerationTyCon tycon -> 'E' + | isJust (tyConSingleDataCon_maybe tycon) -> 'S' + | otherwise -> 'M' -- oh, well... diff --git a/compiler/GHC/StgToCmm/Utils.hs b/compiler/GHC/StgToCmm/Utils.hs new file mode 100644 index 0000000000..30e37bb930 --- /dev/null +++ b/compiler/GHC/StgToCmm/Utils.hs @@ -0,0 +1,578 @@ +{-# LANGUAGE CPP #-} + +----------------------------------------------------------------------------- +-- +-- Code generator utilities; mostly monadic +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.StgToCmm.Utils ( + cgLit, mkSimpleLit, + emitDataLits, mkDataLits, + emitRODataLits, mkRODataLits, + emitRtsCall, emitRtsCallWithResult, emitRtsCallGen, + assignTemp, newTemp, + + newUnboxedTupleRegs, + + emitMultiAssign, emitCmmLitSwitch, emitSwitch, + + tagToClosure, mkTaggedObjectLoad, + + callerSaves, callerSaveVolatileRegs, get_GlobalReg_addr, + + cmmAndWord, cmmOrWord, cmmNegate, cmmEqWord, cmmNeWord, + cmmUGtWord, cmmSubWord, cmmMulWord, cmmAddWord, cmmUShrWord, + cmmOffsetExprW, cmmOffsetExprB, + cmmRegOffW, cmmRegOffB, + cmmLabelOffW, cmmLabelOffB, + cmmOffsetW, cmmOffsetB, + cmmOffsetLitW, cmmOffsetLitB, + cmmLoadIndexW, + 
cmmConstrTag1, + + cmmUntag, cmmIsTagged, + + addToMem, addToMemE, addToMemLblE, addToMemLbl, + mkWordCLit, + newStringCLit, newByteStringCLit, + blankWord, + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.StgToCmm.Monad +import GHC.StgToCmm.Closure +import Cmm +import BlockId +import MkGraph +import GHC.Platform.Regs +import CLabel +import CmmUtils +import CmmSwitch +import GHC.StgToCmm.CgUtils + +import ForeignCall +import IdInfo +import Type +import TyCon +import SMRep +import Module +import Literal +import Digraph +import Util +import Unique +import UniqSupply (MonadUnique(..)) +import DynFlags +import FastString +import Outputable +import RepType + +import Data.ByteString (ByteString) +import qualified Data.ByteString.Char8 as BS8 +import qualified Data.Map as M +import Data.Char +import Data.List +import Data.Ord + + +------------------------------------------------------------------------- +-- +-- Literals +-- +------------------------------------------------------------------------- + +cgLit :: Literal -> FCode CmmLit +cgLit (LitString s) = newByteStringCLit s + -- not unpackFS; we want the UTF-8 byte stream. +cgLit other_lit = do dflags <- getDynFlags + return (mkSimpleLit dflags other_lit) + +mkSimpleLit :: DynFlags -> Literal -> CmmLit +mkSimpleLit dflags (LitChar c) = CmmInt (fromIntegral (ord c)) + (wordWidth dflags) +mkSimpleLit dflags LitNullAddr = zeroCLit dflags +mkSimpleLit dflags (LitNumber LitNumInt i _) = CmmInt i (wordWidth dflags) +mkSimpleLit _ (LitNumber LitNumInt64 i _) = CmmInt i W64 +mkSimpleLit dflags (LitNumber LitNumWord i _) = CmmInt i (wordWidth dflags) +mkSimpleLit _ (LitNumber LitNumWord64 i _) = CmmInt i W64 +mkSimpleLit _ (LitFloat r) = CmmFloat r W32 +mkSimpleLit _ (LitDouble r) = CmmFloat r W64 +mkSimpleLit _ (LitLabel fs ms fod) + = let -- TODO: Literal labels might not actually be in the current package... 
+ labelSrc = ForeignLabelInThisPackage + in CmmLabel (mkForeignLabel fs ms labelSrc fod) +-- NB: LitRubbish should have been lowered in "CoreToStg" +mkSimpleLit _ other = pprPanic "mkSimpleLit" (ppr other) + +-------------------------------------------------------------------------- +-- +-- Incrementing a memory location +-- +-------------------------------------------------------------------------- + +addToMemLbl :: CmmType -> CLabel -> Int -> CmmAGraph +addToMemLbl rep lbl n = addToMem rep (CmmLit (CmmLabel lbl)) n + +addToMemLblE :: CmmType -> CLabel -> CmmExpr -> CmmAGraph +addToMemLblE rep lbl = addToMemE rep (CmmLit (CmmLabel lbl)) + +addToMem :: CmmType -- rep of the counter + -> CmmExpr -- Address + -> Int -- What to add (a word) + -> CmmAGraph +addToMem rep ptr n = addToMemE rep ptr (CmmLit (CmmInt (toInteger n) (typeWidth rep))) + +addToMemE :: CmmType -- rep of the counter + -> CmmExpr -- Address + -> CmmExpr -- What to add (a word-typed expression) + -> CmmAGraph +addToMemE rep ptr n + = mkStore ptr (CmmMachOp (MO_Add (typeWidth rep)) [CmmLoad ptr rep, n]) + + +------------------------------------------------------------------------- +-- +-- Loading a field from an object, +-- where the object pointer is itself tagged +-- +------------------------------------------------------------------------- + +mkTaggedObjectLoad + :: DynFlags -> LocalReg -> LocalReg -> ByteOff -> DynTag -> CmmAGraph +-- (loadTaggedObjectField reg base off tag) generates assignment +-- reg = bitsK[ base + off - tag ] +-- where K is fixed by 'reg' +mkTaggedObjectLoad dflags reg base offset tag + = mkAssign (CmmLocal reg) + (CmmLoad (cmmOffsetB dflags + (CmmReg (CmmLocal base)) + (offset - tag)) + (localRegType reg)) + +------------------------------------------------------------------------- +-- +-- Converting a closure tag to a closure for enumeration types +-- (this is the implementation of tagToEnum#). 
+-- +------------------------------------------------------------------------- + +tagToClosure :: DynFlags -> TyCon -> CmmExpr -> CmmExpr +tagToClosure dflags tycon tag + = CmmLoad (cmmOffsetExprW dflags closure_tbl tag) (bWord dflags) + where closure_tbl = CmmLit (CmmLabel lbl) + lbl = mkClosureTableLabel (tyConName tycon) NoCafRefs + +------------------------------------------------------------------------- +-- +-- Conditionals and rts calls +-- +------------------------------------------------------------------------- + +emitRtsCall :: UnitId -> FastString -> [(CmmExpr,ForeignHint)] -> Bool -> FCode () +emitRtsCall pkg fun args safe = emitRtsCallGen [] (mkCmmCodeLabel pkg fun) args safe + +emitRtsCallWithResult :: LocalReg -> ForeignHint -> UnitId -> FastString + -> [(CmmExpr,ForeignHint)] -> Bool -> FCode () +emitRtsCallWithResult res hint pkg fun args safe + = emitRtsCallGen [(res,hint)] (mkCmmCodeLabel pkg fun) args safe + +-- Make a call to an RTS C procedure +emitRtsCallGen + :: [(LocalReg,ForeignHint)] + -> CLabel + -> [(CmmExpr,ForeignHint)] + -> Bool -- True <=> CmmSafe call + -> FCode () +emitRtsCallGen res lbl args safe + = do { dflags <- getDynFlags + ; updfr_off <- getUpdFrameOff + ; let (caller_save, caller_load) = callerSaveVolatileRegs dflags + ; emit caller_save + ; call updfr_off + ; emit caller_load } + where + call updfr_off = + if safe then + emit =<< mkCmmCall fun_expr res' args' updfr_off + else do + let conv = ForeignConvention CCallConv arg_hints res_hints CmmMayReturn + emit $ mkUnsafeCall (ForeignTarget fun_expr conv) res' args' + (args', arg_hints) = unzip args + (res', res_hints) = unzip res + fun_expr = mkLblExpr lbl + + +----------------------------------------------------------------------------- +-- +-- Caller-Save Registers +-- +----------------------------------------------------------------------------- + +-- Here we generate the sequence of saves/restores required around a +-- foreign call instruction. 
+ +-- TODO: reconcile with includes/Regs.h +-- * Regs.h claims that BaseReg should be saved last and loaded first +-- * This might not have been tickled before since BaseReg is callee save +-- * Regs.h saves SparkHd, ParkT1, SparkBase and SparkLim +-- +-- This code isn't actually used right now, because callerSaves +-- only ever returns true in the current universe for registers NOT in +-- system_regs (just do a grep for CALLER_SAVES in +-- includes/stg/MachRegs.h). It's all one giant no-op, and for +-- good reason: having to save system registers on every foreign call +-- would be very expensive, so we avoid assigning them to those +-- registers when we add support for an architecture. +-- +-- Note that the old code generator actually does more work here: it +-- also saves other global registers. We can't (nor want) to do that +-- here, as we don't have liveness information. And really, we +-- shouldn't be doing the workaround at this point in the pipeline, see +-- Note [Register parameter passing] and the ToDo on CmmCall in +-- cmm/CmmNode.hs. Right now the workaround is to avoid inlining across +-- unsafe foreign calls in rewriteAssignments, but this is strictly +-- temporary. 
-- | Build the pair of instruction sequences (save, load) that spill and
-- reload the caller-saved global STG registers around a foreign call.
callerSaveVolatileRegs :: DynFlags -> (CmmAGraph, CmmAGraph)
callerSaveVolatileRegs dflags = (caller_save, caller_load)
  where
    platform = targetPlatform dflags

    -- Registers the STG machine depends on; only those that the C
    -- calling convention clobbers (per 'callerSaves') are spilled.
    system_regs = [ Sp,SpLim,Hp,HpLim,CCCS,CurrentTSO,CurrentNursery
                    {- ,SparkHd,SparkTl,SparkBase,SparkLim -}
                  , BaseReg ]

    regs_to_save = filter (callerSaves platform) system_regs

    caller_save = catAGraphs [ save_reg    r | r <- regs_to_save ]
    caller_load = catAGraphs [ restore_reg r | r <- regs_to_save ]

    -- Spill one global register to its fixed memory slot.
    save_reg r
      = mkStore (get_GlobalReg_addr dflags r) (CmmReg (CmmGlobal r))

    -- Reload one global register from its fixed memory slot.
    restore_reg r
      = mkAssign (CmmGlobal r)
                 (CmmLoad (get_GlobalReg_addr dflags r)
                          (globalRegType dflags r))


-------------------------------------------------------------------------
--
-- Strings generate a top-level data block
--
-------------------------------------------------------------------------

-- | Emit a data-segment data block.
emitDataLits :: CLabel -> [CmmLit] -> FCode ()
emitDataLits lbl lits = emitDecl (mkDataLits (Section Data lbl) lbl lits)

-- | Emit a read-only data block.
emitRODataLits :: CLabel -> [CmmLit] -> FCode ()
emitRODataLits lbl lits = emitDecl (mkRODataLits lbl lits)

-- | Make a global definition for the string, and return its label.
newStringCLit :: String -> FCode CmmLit
newStringCLit str = newByteStringCLit (BS8.pack str)

-- | Make a global definition for the byte string, and return its label.
newByteStringCLit :: ByteString -> FCode CmmLit
newByteStringCLit bytes = do
    uniq <- newUnique
    let (lit, decl) = mkByteStringCLit (mkStringLitLabel uniq) bytes
    emitDecl decl
    return lit

-------------------------------------------------------------------------
--
-- Assigning expressions to temporaries
--
-------------------------------------------------------------------------

-- | Make sure the argument is in a local register.
-- We don't bother being particularly aggressive with avoiding
-- unnecessary local registers, since we can rely on a later
-- optimization pass to inline as necessary (and skipping out
-- on things like global registers can be a little dangerous
-- due to them being trashed on foreign calls--though it means
-- the optimization pass doesn't have to do as much work)
assignTemp :: CmmExpr -> FCode LocalReg
assignTemp (CmmReg (CmmLocal reg)) = return reg
assignTemp e = do
    dflags <- getDynFlags
    uniq   <- newUnique
    let reg = LocalReg uniq (cmmExprType dflags e)
    emitAssign (CmmLocal reg) e
    return reg

-- | Allocate a fresh local register of the given Cmm type.
newTemp :: MonadUnique m => CmmType -> m LocalReg
newTemp rep = do
    uniq <- getUniqueM
    return (LocalReg uniq rep)

-- | Choose suitable local regs to use for the components
-- of an unboxed tuple that we are about to return to
-- the Sequel.  If the Sequel is a join point, using the
-- regs it wants will save later assignments.
newUnboxedTupleRegs :: Type -> FCode ([LocalReg], [ForeignHint])
newUnboxedTupleRegs res_ty
  = ASSERT( isUnboxedTupleType res_ty )
    do { dflags <- getDynFlags
       ; sequel <- getSequel
       ; regs   <- choose_regs dflags sequel
       ; ASSERT( regs `equalLength` reps )
         return (regs, map primRepForeignHint reps) }
  where
    reps = typePrimRep res_ty
    -- A join point dictates exactly which registers it expects.
    choose_regs _      (AssignTo regs _) = return regs
    choose_regs dflags _                 = mapM (newTemp . primRepCmmType dflags) reps



-------------------------------------------------------------------------
-- emitMultiAssign
-------------------------------------------------------------------------

emitMultiAssign :: [LocalReg] -> [CmmExpr] -> FCode ()
-- Emit code to perform the assignments in the
-- input simultaneously, using temporary variables when necessary.
type Key  = Int
type Vrtx = (Key, Stmt)          -- Give each vertex a unique number,
                                 -- for fast comparison
type Stmt = (LocalReg, CmmExpr)  -- r := e

-- We use the strongly-connected component algorithm, in which
--      * the vertices are the statements
--      * an edge goes from s1 to s2 iff
--              s1 assigns to something s2 uses
--        that is, if s1 should *follow* s2 in the final order

emitMultiAssign []    []    = return ()
emitMultiAssign [reg] [rhs] = emitAssign (CmmLocal reg) rhs
emitMultiAssign regs rhss = do
  dflags <- getDynFlags
  ASSERT2( equalLength regs rhss, ppr regs $$ ppr rhss )
    unscramble dflags ([1..] `zip` (regs `zip` rhss))

-- | Order (and, for cycles, temporarily rename) a set of parallel
-- assignments so that they can safely be emitted sequentially.
unscramble :: DynFlags -> [Vrtx] -> FCode ()
unscramble dflags vertices = mapM_ emit_component components
  where
    graph_nodes :: [ Node Key Vrtx ]
    graph_nodes = [ DigraphNode v k (succs_of s)
                  | v@(k, s) <- vertices ]

    -- Keys of every statement that the given one must follow.
    succs_of :: Stmt -> [Key]
    succs_of s1 = [ k2 | (k2, s2) <- vertices, s1 `mustFollow` s2 ]

    components :: [SCC Vrtx]
    components = stronglyConnCompFromEdgedVerticesUniq graph_nodes

    -- Deal with one strongly-connected component.
    -- Not cyclic, or singleton?  Just emit it.
    emit_component :: SCC Vrtx -> FCode ()
    emit_component (AcyclicSCC (_,stmt))  = emit_stmt stmt
    emit_component (CyclicSCC [])         = panic "do_component"
    emit_component (CyclicSCC [(_,stmt)]) = emit_stmt stmt
    -- A genuine cycle: break it by routing one assignment through a
    -- fresh temporary, then retry with the remaining statements.
    emit_component (CyclicSCC ((_,first_stmt) : rest)) = do
        dflags' <- getDynFlags
        u <- newUnique
        let (to_tmp, from_tmp) = break_cycle dflags' u first_stmt
        emit_stmt to_tmp
        unscramble dflags' rest
        emit_stmt from_tmp

    -- (r := e)  ==>  ((tmp := e), (r := tmp))
    break_cycle :: DynFlags -> Unique -> Stmt -> (Stmt, Stmt)
    break_cycle dflags' uniq (reg, rhs)
      = ((tmp, rhs), (reg, CmmReg (CmmLocal tmp)))
      where
        tmp = LocalReg uniq (cmmExprType dflags' rhs)

    emit_stmt :: Stmt -> FCode ()
    emit_stmt (reg, rhs) = emitAssign (CmmLocal reg) rhs

    mustFollow :: Stmt -> Stmt -> Bool
    (reg, _) `mustFollow` (_, rhs) = regUsedIn dflags (CmmLocal reg) rhs

-------------------------------------------------------------------------
-- mkSwitch
-------------------------------------------------------------------------


emitSwitch :: CmmExpr                      -- Tag to switch on
           -> [(ConTagZ, CmmAGraphScoped)] -- Tagged branches
           -> Maybe CmmAGraphScoped        -- Default branch (if any)
           -> ConTagZ -> ConTagZ           -- Min and Max possible values;
                                           -- behaviour outside this range is
                                           -- undefined
           -> FCode ()

-- First, two rather common cases in which there is no work to do
emitSwitch _ []         (Just code) _ _ = emit (fst code)
emitSwitch _ [(_,code)] Nothing     _ _ = emit (fst code)

-- Right, off we go
emitSwitch tag_expr branches mb_deflt lo_tag hi_tag = do
    join_lbl      <- newBlockId
    mb_deflt_lbl  <- label_default join_lbl mb_deflt
    branches_lbls <- label_branches join_lbl branches
    scrut         <- assignTemp' tag_expr

    -- mk_discrete_switch expects its branches sorted by tag
    let sorted_lbls = [ (fromIntegral i, l)
                      | (i,l) <- sortBy (comparing fst) branches_lbls ]
        range       = (fromIntegral lo_tag, fromIntegral hi_tag)

    emit $ mk_discrete_switch False scrut sorted_lbls mb_deflt_lbl range

    emitLabel join_lbl

mk_discrete_switch :: Bool -- ^ Use signed comparisons
                   -> CmmExpr
                   -> [(Integer, BlockId)]
                   -> Maybe BlockId
                   -> (Integer, Integer)
                   -> CmmAGraph

-- SINGLETON TAG RANGE: no case analysis to do
mk_discrete_switch _ _tag_expr [(tag, lbl)] _ (lo_tag, hi_tag)
  | lo_tag == hi_tag
  = ASSERT( tag == lo_tag )
    mkBranch lbl

-- SINGLETON BRANCH, NO DEFAULT: no case analysis to do
mk_discrete_switch _ _tag_expr [(_tag,lbl)] Nothing _
  = mkBranch lbl
        -- The simplifier might have eliminated a case
        --       so we may have e.g. case xs of
        --                               [] -> e
        -- In that situation we can be sure the (:) case
        -- can't happen, so no need to test

-- SOMETHING MORE COMPLICATED: defer to CmmImplementSwitchPlans
-- See Note [Cmm Switches, the general plan] in CmmSwitch
mk_discrete_switch signed tag_expr branches mb_deflt range
  = mkSwitch tag_expr $ mkSwitchTargets signed range mb_deflt (M.fromList branches)

-- | Split a (sorted) branch list around its middle tag.
divideBranches :: Ord a => [(a,b)] -> ([(a,b)], a, [(a,b)])
divideBranches branches = (lo_branches, mid, hi_branches)
  where
    -- 2 branches => n_branches `div` 2 = 1
    --            => branches !! 1 give the *second* tag
    -- There are always at least 2 branches here
    (mid,_) = branches !! (length branches `div` 2)
    (lo_branches, hi_branches) = span is_lo branches
    is_lo (t,_) = t < mid

--------------
emitCmmLitSwitch :: CmmExpr                      -- Tag to switch on
                 -> [(Literal, CmmAGraphScoped)] -- Tagged branches
                 -> CmmAGraphScoped              -- Default branch (always)
                 -> FCode ()                     -- Emit the code
emitCmmLitSwitch _scrut []       deflt = emit $ fst deflt
emitCmmLitSwitch scrut  branches deflt = do
    scrut' <- assignTemp' scrut
    join_lbl <- newBlockId
    deflt_lbl <- label_code join_lbl deflt
    branches_lbls <- label_branches join_lbl branches

    dflags <- getDynFlags
    let cmm_ty = cmmExprType dflags scrut
        rep    = typeWidth cmm_ty

    -- We find the necessary type information in the literals in the
    -- branches.  NB: total pattern match on the branch list (the empty
    -- case is dispatched by the first equation) instead of partial 'head'.
    let signed = case branches of
                   ((LitNumber nt _ _, _) : _) -> litNumIsSigned nt
                   _                           -> False

    let range | signed    = (tARGET_MIN_INT dflags, tARGET_MAX_INT dflags)
              | otherwise = (0, tARGET_MAX_WORD dflags)

    if isFloatType cmm_ty
    then emit =<< mk_float_switch rep scrut' deflt_lbl noBound branches_lbls
    else emit $ mk_discrete_switch
        signed
        scrut'
        [(litValue lit,l) | (lit,l) <- branches_lbls]
        (Just deflt_lbl)
        range
    emitLabel join_lbl

-- | lower bound (inclusive), upper bound (exclusive)
type LitBound = (Maybe Literal, Maybe Literal)

noBound :: LitBound
noBound = (Nothing, Nothing)

mk_float_switch :: Width -> CmmExpr -> BlockId
                -> LitBound
                -> [(Literal,BlockId)]
                -> FCode CmmAGraph
-- One branch left: branch to the default if the scrutinee is unequal to
-- the branch's literal, otherwise fall through to the branch itself.
mk_float_switch rep scrut deflt _bounds [(lit,blk)]
  = do dflags <- getDynFlags
       return $ mkCbranch (cond dflags) deflt blk Nothing
  where
    cond dflags = CmmMachOp ne [scrut, CmmLit cmm_lit]
      where
        cmm_lit = mkSimpleLit dflags lit
        ne      = MO_F_Ne rep

-- More than one branch: binary search on the literal value.
mk_float_switch rep scrut deflt_blk_id (lo_bound, hi_bound) branches
  = do dflags <- getDynFlags
       lo_blk <- mk_float_switch rep scrut deflt_blk_id bounds_lo lo_branches
       hi_blk <- mk_float_switch rep scrut deflt_blk_id bounds_hi hi_branches
       mkCmmIfThenElse (cond dflags) lo_blk hi_blk
  where
+ (lo_branches, mid_lit, hi_branches) = divideBranches branches + + bounds_lo = (lo_bound, Just mid_lit) + bounds_hi = (Just mid_lit, hi_bound) + + cond dflags = CmmMachOp lt [scrut, CmmLit cmm_lit] + where + cmm_lit = mkSimpleLit dflags mid_lit + lt = MO_F_Lt rep + + +-------------- +label_default :: BlockId -> Maybe CmmAGraphScoped -> FCode (Maybe BlockId) +label_default _ Nothing + = return Nothing +label_default join_lbl (Just code) + = do lbl <- label_code join_lbl code + return (Just lbl) + +-------------- +label_branches :: BlockId -> [(a,CmmAGraphScoped)] -> FCode [(a,BlockId)] +label_branches _join_lbl [] + = return [] +label_branches join_lbl ((tag,code):branches) + = do lbl <- label_code join_lbl code + branches' <- label_branches join_lbl branches + return ((tag,lbl):branches') + +-------------- +label_code :: BlockId -> CmmAGraphScoped -> FCode BlockId +-- label_code J code +-- generates +-- [L: code; goto J] +-- and returns L +label_code join_lbl (code,tsc) = do + lbl <- newBlockId + emitOutOfLine lbl (code MkGraph.<*> mkBranch join_lbl, tsc) + return lbl + +-------------- +assignTemp' :: CmmExpr -> FCode CmmExpr +assignTemp' e + | isTrivialCmmExpr e = return e + | otherwise = do + dflags <- getDynFlags + lreg <- newTemp (cmmExprType dflags e) + let reg = CmmLocal lreg + emitAssign reg e + return (CmmReg reg) |