Diffstat (limited to 'compiler/GHC/Cmm')
36 files changed, 16609 insertions, 0 deletions
diff --git a/compiler/GHC/Cmm/BlockId.hs b/compiler/GHC/Cmm/BlockId.hs new file mode 100644 index 0000000000..f7f369551b --- /dev/null +++ b/compiler/GHC/Cmm/BlockId.hs @@ -0,0 +1,46 @@ +{-# LANGUAGE TypeSynonymInstances #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} + +{- BlockId module should probably go away completely, being superseded by Label -} +module GHC.Cmm.BlockId + ( BlockId, mkBlockId -- ToDo: BlockId should be abstract, but it isn't yet + , newBlockId + , blockLbl, infoTblLbl + ) where + +import GhcPrelude + +import GHC.Cmm.CLabel +import IdInfo +import Name +import Unique +import UniqSupply + +import GHC.Cmm.Dataflow.Label (Label, mkHooplLabel) + +---------------------------------------------------------------- +--- Block Ids, their environments, and their sets + +{- Note [Unique BlockId] +~~~~~~~~~~~~~~~~~~~~~~~~ +Although a 'BlockId' is a local label, for reasons of implementation, +'BlockId's must be unique within an entire compilation unit. The reason +is that each local label is mapped to an assembly-language label, and in +most assembly languages a label is visible throughout the entire +compilation unit in which it appears. +-} + +type BlockId = Label + +mkBlockId :: Unique -> BlockId +mkBlockId unique = mkHooplLabel $ getKey unique + +newBlockId :: MonadUnique m => m BlockId +newBlockId = mkBlockId <$> getUniqueM + +blockLbl :: BlockId -> CLabel +blockLbl label = mkLocalBlockLabel (getUnique label) + +infoTblLbl :: BlockId -> CLabel +infoTblLbl label + = mkBlockInfoTableLabel (mkFCallName (getUnique label) "block") NoCafRefs diff --git a/compiler/GHC/Cmm/BlockId.hs-boot b/compiler/GHC/Cmm/BlockId.hs-boot new file mode 100644 index 0000000000..76fd6180a9 --- /dev/null +++ b/compiler/GHC/Cmm/BlockId.hs-boot @@ -0,0 +1,8 @@ +module GHC.Cmm.BlockId (BlockId, mkBlockId) where + +import GHC.Cmm.Dataflow.Label (Label) +import Unique (Unique) + +type BlockId = Label + +mkBlockId :: Unique -> BlockId diff --git a/compiler/GHC/Cmm/CLabel.hs b/compiler/GHC/Cmm/CLabel.hs new file mode 100644 index 0000000000..e84278bf65 --- /dev/null +++ b/compiler/GHC/Cmm/CLabel.hs @@ -0,0 +1,1571 @@ +----------------------------------------------------------------------------- +-- +-- Object-file symbols (called CLabel for histerical raisins).
+-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +{-# LANGUAGE CPP #-} + +module GHC.Cmm.CLabel ( + CLabel, -- abstract type + ForeignLabelSource(..), + pprDebugCLabel, + + mkClosureLabel, + mkSRTLabel, + mkInfoTableLabel, + mkEntryLabel, + mkRednCountsLabel, + mkConInfoTableLabel, + mkApEntryLabel, + mkApInfoTableLabel, + mkClosureTableLabel, + mkBytesLabel, + + mkLocalBlockLabel, + mkLocalClosureLabel, + mkLocalInfoTableLabel, + mkLocalClosureTableLabel, + + mkBlockInfoTableLabel, + + mkBitmapLabel, + mkStringLitLabel, + + mkAsmTempLabel, + mkAsmTempDerivedLabel, + mkAsmTempEndLabel, + mkAsmTempDieLabel, + + mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, + mkUpdInfoLabel, + mkBHUpdInfoLabel, + mkIndStaticInfoLabel, + mkMainCapabilityLabel, + mkMAP_FROZEN_CLEAN_infoLabel, + mkMAP_FROZEN_DIRTY_infoLabel, + mkMAP_DIRTY_infoLabel, + mkSMAP_FROZEN_CLEAN_infoLabel, + mkSMAP_FROZEN_DIRTY_infoLabel, + mkSMAP_DIRTY_infoLabel, + mkBadAlignmentLabel, + mkArrWords_infoLabel, + mkSRTInfoLabel, + + mkTopTickyCtrLabel, + mkCAFBlackHoleInfoTableLabel, + mkRtsPrimOpLabel, + mkRtsSlowFastTickyCtrLabel, + + mkSelectorInfoLabel, + mkSelectorEntryLabel, + + mkCmmInfoLabel, + mkCmmEntryLabel, + mkCmmRetInfoLabel, + mkCmmRetLabel, + mkCmmCodeLabel, + mkCmmDataLabel, + mkCmmClosureLabel, + + mkRtsApFastLabel, + + mkPrimCallLabel, + + mkForeignLabel, + addLabelSize, + + foreignLabelStdcallInfo, + isBytesLabel, + isForeignLabel, + isSomeRODataLabel, + isStaticClosureLabel, + mkCCLabel, mkCCSLabel, + + DynamicLinkerLabelInfo(..), + mkDynamicLinkerLabel, + dynamicLinkerLabelInfo, + + mkPicBaseLabel, + mkDeadStripPreventer, + + mkHpcTicksLabel, + + -- * Predicates + hasCAF, + needsCDecl, maybeLocalBlockLabel, externallyVisibleCLabel, + isMathFun, + isCFunctionLabel, isGcPtrLabel, labelDynamic, + isLocalCLabel, mayRedirectTo, + + -- * Conversions + toClosureLbl, toSlowEntryLbl, toEntryLbl, toInfoLbl, hasHaskellName, + + pprCLabel, + isInfoTableLabel, + isConInfoTableLabel + ) where + +#include "HsVersions.h" + +import GhcPrelude + +import IdInfo +import BasicTypes +import {-# SOURCE #-} GHC.Cmm.BlockId (BlockId, mkBlockId) +import Packages +import Module +import Name +import Unique +import PrimOp +import CostCentre +import Outputable +import FastString +import DynFlags +import GHC.Platform +import UniqSet +import Util +import PprCore ( {- instances -} ) + +-- ----------------------------------------------------------------------------- +-- The CLabel type + +{- | + 'CLabel' is an abstract type that supports the following operations: + + - Pretty printing + + - In a C file, does it need to be declared before use? (i.e. is it + guaranteed to be already in scope in the places we need to refer to it?) + + - If it needs to be declared, what type (code or data) should it be + declared to have? + + - Is it visible outside this object file or not? + + - Is it "dynamic" (see details below) + + - Eq and Ord, so that we can make sets of CLabels (currently only + used in outputting C as far as I can tell, to avoid generating + more than one declaration for any given label). + + - Converting an info table label into an entry label. 
+ + CLabel usage is a bit messy in GHC as they are used in a number of different + contexts: + + - By the C-- AST to identify labels + + - By the unregisterised C code generator ("PprC") for naming functions (hence + the name 'CLabel') + + - By the native and LLVM code generators to identify labels + + For extra fun, each of these uses a slightly different subset of constructors + (e.g. 'AsmTempLabel' and 'AsmTempDerivedLabel' are used only in the NCG and + LLVM backends). + + In general, we use 'IdLabel' to represent Haskell things early in the + pipeline. However, later optimization passes will often represent blocks they + create with 'LocalBlockLabel' where there is no obvious 'Name' to hang off the + label. +-} + +data CLabel + = -- | A label related to the definition of a particular Id or Con in a .hs file. + IdLabel + Name + CafInfo + IdLabelInfo -- encodes the suffix of the label + + -- | A label from a .cmm file that is not associated with a .hs level Id. + | CmmLabel + UnitId -- what package the label belongs to. + FastString -- identifier giving the prefix of the label + CmmLabelInfo -- encodes the suffix of the label + + -- | A label with a baked-in \/ algorithmically generated name that definitely + -- comes from the RTS. The code for it must compile into libHSrts.a \/ libHSrts.so + -- If it doesn't have an algorithmically generated name then use a CmmLabel + -- instead and give it an appropriate UnitId argument. + | RtsLabel + RtsLabelInfo + + -- | A label associated with a block. These aren't visible outside of the + -- compilation unit in which they are defined. These are generally used to + -- name blocks produced by Cmm-to-Cmm passes and the native code generator, + -- where we don't have a 'Name' to associate the label to and therefore can't + -- use 'IdLabel'. + | LocalBlockLabel + {-# UNPACK #-} !Unique + + -- | A 'C' (or otherwise foreign) label. + -- + | ForeignLabel + FastString -- name of the imported label. + + (Maybe Int) -- possible '@n' suffix for stdcall functions + -- When generating C, the '@n' suffix is omitted, but when + -- generating assembler we must add it to the label. + + ForeignLabelSource -- what package the foreign label is in. + + FunctionOrData + + -- | Local temporary label used for native (or LLVM) code generation; must not + -- appear outside of these contexts. Use primarily for debug information + | AsmTempLabel + {-# UNPACK #-} !Unique + + -- | A label \"derived\" from another 'CLabel' by the addition of a suffix. + -- Must not occur outside of the NCG or LLVM code generators. + | AsmTempDerivedLabel + CLabel + FastString -- suffix + + | StringLitLabel + {-# UNPACK #-} !Unique + + | CC_Label CostCentre + | CCS_Label CostCentreStack + + + -- | These labels are generated and used inside the NCG only. + -- They are special variants of a label used for dynamic linking + -- see module PositionIndependentCode for details. + | DynamicLinkerLabel DynamicLinkerLabelInfo CLabel + + -- | This label is generated and used inside the NCG only. + -- It is used as a base for PIC calculations on some platforms. + -- It takes the form of a local numeric assembler label '1'; and + -- is pretty-printed as 1b, referring to the previous definition + -- of 1: in the assembler source file. 
+ | PicBaseLabel + + -- | A label before an info table to prevent excessive dead-stripping on darwin + | DeadStripPreventer CLabel + + + -- | Per-module table of tick locations + | HpcTicksLabel Module + + -- | Static reference table + | SRTLabel + {-# UNPACK #-} !Unique + + -- | A bitmap (function or case return) + | LargeBitmapLabel + {-# UNPACK #-} !Unique + + deriving Eq + +-- This is laborious, but necessary. We can't derive Ord because +-- Unique doesn't have an Ord instance. Note nonDetCmpUnique in the +-- implementation. See Note [No Ord for Unique] +-- This is non-deterministic but we do not currently support deterministic +-- code-generation. See Note [Unique Determinism and code generation] +instance Ord CLabel where + compare (IdLabel a1 b1 c1) (IdLabel a2 b2 c2) = + compare a1 a2 `thenCmp` + compare b1 b2 `thenCmp` + compare c1 c2 + compare (CmmLabel a1 b1 c1) (CmmLabel a2 b2 c2) = + compare a1 a2 `thenCmp` + compare b1 b2 `thenCmp` + compare c1 c2 + compare (RtsLabel a1) (RtsLabel a2) = compare a1 a2 + compare (LocalBlockLabel u1) (LocalBlockLabel u2) = nonDetCmpUnique u1 u2 + compare (ForeignLabel a1 b1 c1 d1) (ForeignLabel a2 b2 c2 d2) = + compare a1 a2 `thenCmp` + compare b1 b2 `thenCmp` + compare c1 c2 `thenCmp` + compare d1 d2 + compare (AsmTempLabel u1) (AsmTempLabel u2) = nonDetCmpUnique u1 u2 + compare (AsmTempDerivedLabel a1 b1) (AsmTempDerivedLabel a2 b2) = + compare a1 a2 `thenCmp` + compare b1 b2 + compare (StringLitLabel u1) (StringLitLabel u2) = + nonDetCmpUnique u1 u2 + compare (CC_Label a1) (CC_Label a2) = + compare a1 a2 + compare (CCS_Label a1) (CCS_Label a2) = + compare a1 a2 + compare (DynamicLinkerLabel a1 b1) (DynamicLinkerLabel a2 b2) = + compare a1 a2 `thenCmp` + compare b1 b2 + compare PicBaseLabel PicBaseLabel = EQ + compare (DeadStripPreventer a1) (DeadStripPreventer a2) = + compare a1 a2 + compare (HpcTicksLabel a1) (HpcTicksLabel a2) = + compare a1 a2 + compare (SRTLabel u1) (SRTLabel u2) = + nonDetCmpUnique u1 u2 + compare (LargeBitmapLabel u1) (LargeBitmapLabel u2) = + nonDetCmpUnique u1 u2 + compare IdLabel{} _ = LT + compare _ IdLabel{} = GT + compare CmmLabel{} _ = LT + compare _ CmmLabel{} = GT + compare RtsLabel{} _ = LT + compare _ RtsLabel{} = GT + compare LocalBlockLabel{} _ = LT + compare _ LocalBlockLabel{} = GT + compare ForeignLabel{} _ = LT + compare _ ForeignLabel{} = GT + compare AsmTempLabel{} _ = LT + compare _ AsmTempLabel{} = GT + compare AsmTempDerivedLabel{} _ = LT + compare _ AsmTempDerivedLabel{} = GT + compare StringLitLabel{} _ = LT + compare _ StringLitLabel{} = GT + compare CC_Label{} _ = LT + compare _ CC_Label{} = GT + compare CCS_Label{} _ = LT + compare _ CCS_Label{} = GT + compare DynamicLinkerLabel{} _ = LT + compare _ DynamicLinkerLabel{} = GT + compare PicBaseLabel{} _ = LT + compare _ PicBaseLabel{} = GT + compare DeadStripPreventer{} _ = LT + compare _ DeadStripPreventer{} = GT + compare HpcTicksLabel{} _ = LT + compare _ HpcTicksLabel{} = GT + compare SRTLabel{} _ = LT + compare _ SRTLabel{} = GT + +-- | Record where a foreign label is stored. +data ForeignLabelSource + + -- | Label is in a named package + = ForeignLabelInPackage UnitId + + -- | Label is in some external, system package that doesn't also + -- contain compiled Haskell code, and is not associated with any .hi files. + -- We don't have to worry about Haskell code being inlined from + -- external packages. It is safe to treat the RTS package as "external". 
+ | ForeignLabelInExternalPackage + + -- | Label is in the package currently being compiled. + -- This is only used for creating hacky tmp labels during code generation. + -- Don't use it in any code that might be inlined across a package boundary + -- (ie, core code) else the information will be wrong relative to the + -- destination module. + | ForeignLabelInThisPackage + + deriving (Eq, Ord) + + +-- | For debugging problems with the CLabel representation. +-- We can't make a Show instance for CLabel because lots of its components don't have instances. +-- The regular Outputable instance only shows the label name, and not its other info. +-- +pprDebugCLabel :: CLabel -> SDoc +pprDebugCLabel lbl + = case lbl of + IdLabel _ _ info-> ppr lbl <> (parens $ text "IdLabel" + <> whenPprDebug (text ":" <> text (show info))) + CmmLabel pkg _name _info + -> ppr lbl <> (parens $ text "CmmLabel" <+> ppr pkg) + + RtsLabel{} -> ppr lbl <> (parens $ text "RtsLabel") + + ForeignLabel _name mSuffix src funOrData + -> ppr lbl <> (parens $ text "ForeignLabel" + <+> ppr mSuffix + <+> ppr src + <+> ppr funOrData) + + _ -> ppr lbl <> (parens $ text "other CLabel") + + +data IdLabelInfo + = Closure -- ^ Label for closure + | InfoTable -- ^ Info tables for closures; always read-only + | Entry -- ^ Entry point + | Slow -- ^ Slow entry point + + | LocalInfoTable -- ^ Like InfoTable but not externally visible + | LocalEntry -- ^ Like Entry but not externally visible + + | RednCounts -- ^ Label of place to keep Ticky-ticky info for this Id + + | ConEntry -- ^ Constructor entry point + | ConInfoTable -- ^ Corresponding info table + + | ClosureTable -- ^ Table of closures for Enum tycons + + | Bytes -- ^ Content of a string literal. See + -- Note [Bytes label]. + | BlockInfoTable -- ^ Like LocalInfoTable but for a proc-point block + -- instead of a closure entry-point. + -- See Note [Proc-point local block entry-point]. + + deriving (Eq, Ord, Show) + + +data RtsLabelInfo + = RtsSelectorInfoTable Bool{-updatable-} Int{-offset-} -- ^ Selector thunks + | RtsSelectorEntry Bool{-updatable-} Int{-offset-} + + | RtsApInfoTable Bool{-updatable-} Int{-arity-} -- ^ AP thunks + | RtsApEntry Bool{-updatable-} Int{-arity-} + + | RtsPrimOp PrimOp + | RtsApFast FastString -- ^ _fast versions of generic apply + | RtsSlowFastTickyCtr String + + deriving (Eq, Ord) + -- NOTE: Eq on PtrString compares the pointer only, so this isn't + -- a real equality. + + +-- | What type of Cmm label we're dealing with. +-- Determines the suffix appended to the name when a CLabel.CmmLabel +-- is pretty printed. 
+data CmmLabelInfo + = CmmInfo -- ^ misc rts info tables, suffix _info + | CmmEntry -- ^ misc rts entry points, suffix _entry + | CmmRetInfo -- ^ misc rts ret info tables, suffix _info + | CmmRet -- ^ misc rts return points, suffix _ret + | CmmData -- ^ misc rts data bits, eg CHARLIKE_closure + | CmmCode -- ^ misc rts code + | CmmClosure -- ^ closures eg CHARLIKE_closure + | CmmPrimCall -- ^ a prim call to some hand written Cmm code + deriving (Eq, Ord) + +data DynamicLinkerLabelInfo + = CodeStub -- MachO: Lfoo$stub, ELF: foo@plt + | SymbolPtr -- MachO: Lfoo$non_lazy_ptr, Windows: __imp_foo + | GotSymbolPtr -- ELF: foo@got + | GotSymbolOffset -- ELF: foo@gotoff + + deriving (Eq, Ord) + + +-- ----------------------------------------------------------------------------- +-- Constructing CLabels +-- ----------------------------------------------------------------------------- + +-- Constructing IdLabels +-- These are always local: + +mkSRTLabel :: Unique -> CLabel +mkSRTLabel u = SRTLabel u + +mkRednCountsLabel :: Name -> CLabel +mkRednCountsLabel name = + IdLabel name NoCafRefs RednCounts -- Note [ticky for LNE] + +-- These have local & (possibly) external variants: +mkLocalClosureLabel :: Name -> CafInfo -> CLabel +mkLocalInfoTableLabel :: Name -> CafInfo -> CLabel +mkLocalClosureTableLabel :: Name -> CafInfo -> CLabel +mkLocalClosureLabel name c = IdLabel name c Closure +mkLocalInfoTableLabel name c = IdLabel name c LocalInfoTable +mkLocalClosureTableLabel name c = IdLabel name c ClosureTable + +mkClosureLabel :: Name -> CafInfo -> CLabel +mkInfoTableLabel :: Name -> CafInfo -> CLabel +mkEntryLabel :: Name -> CafInfo -> CLabel +mkClosureTableLabel :: Name -> CafInfo -> CLabel +mkConInfoTableLabel :: Name -> CafInfo -> CLabel +mkBytesLabel :: Name -> CLabel +mkClosureLabel name c = IdLabel name c Closure +mkInfoTableLabel name c = IdLabel name c InfoTable +mkEntryLabel name c = IdLabel name c Entry +mkClosureTableLabel name c = IdLabel name c ClosureTable +mkConInfoTableLabel name c = IdLabel name c ConInfoTable +mkBytesLabel name = IdLabel name NoCafRefs Bytes + +mkBlockInfoTableLabel :: Name -> CafInfo -> CLabel +mkBlockInfoTableLabel name c = IdLabel name c BlockInfoTable + -- See Note [Proc-point local block entry-point]. 
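The IdLabel smart constructors above all pair a Name with an IdLabelInfo flavour, and that flavour alone decides the suffix the symbol gets when printed (see ppIdFlavor and the printing conventions later in this file). Below is a minimal, self-contained toy model of that suffixing scheme: it uses plain Strings instead of GHC's Name and SDoc, omits the Local*/BlockInfoTable flavours, and every name in it is illustrative rather than GHC's.

-- Toy mirror of IdLabelInfo; the suffix table matches ppIdFlavor below.
data Flavour = Closure | InfoTable | Entry | Slow | RednCounts
             | ConEntry | ConInfoTable | ClosureTable | Bytes
  deriving Show

suffix :: Flavour -> String
suffix Closure      = "closure"
suffix InfoTable    = "info"
suffix Entry        = "entry"
suffix Slow         = "slow"
suffix RednCounts   = "ct"
suffix ConEntry     = "con_entry"
suffix ConInfoTable = "con_info"
suffix ClosureTable = "closure_tbl"
suffix Bytes        = "bytes"

-- <Module>_<name>_<suffix>, per the printing conventions described below.
renderIdLabel :: String -> Flavour -> String
renderIdLabel name flav = name ++ "_" ++ suffix flav

main :: IO ()
main = mapM_ (putStrLn . renderIdLabel "Main_foo") [Closure, InfoTable, Entry]
-- prints: Main_foo_closure, Main_foo_info, Main_foo_entry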
+ +-- Constructing Cmm Labels +mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, + mkUpdInfoLabel, + mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel, + mkMAP_FROZEN_CLEAN_infoLabel, mkMAP_FROZEN_DIRTY_infoLabel, + mkMAP_DIRTY_infoLabel, + mkArrWords_infoLabel, + mkTopTickyCtrLabel, + mkCAFBlackHoleInfoTableLabel, + mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel, + mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel :: CLabel +mkDirty_MUT_VAR_Label = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction +mkNonmovingWriteBarrierEnabledLabel + = CmmLabel rtsUnitId (fsLit "nonmoving_write_barrier_enabled") CmmData +mkUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_upd_frame") CmmInfo +mkBHUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_bh_upd_frame" ) CmmInfo +mkIndStaticInfoLabel = CmmLabel rtsUnitId (fsLit "stg_IND_STATIC") CmmInfo +mkMainCapabilityLabel = CmmLabel rtsUnitId (fsLit "MainCapability") CmmData +mkMAP_FROZEN_CLEAN_infoLabel = CmmLabel rtsUnitId (fsLit "stg_MUT_ARR_PTRS_FROZEN_CLEAN") CmmInfo +mkMAP_FROZEN_DIRTY_infoLabel = CmmLabel rtsUnitId (fsLit "stg_MUT_ARR_PTRS_FROZEN_DIRTY") CmmInfo +mkMAP_DIRTY_infoLabel = CmmLabel rtsUnitId (fsLit "stg_MUT_ARR_PTRS_DIRTY") CmmInfo +mkTopTickyCtrLabel = CmmLabel rtsUnitId (fsLit "top_ct") CmmData +mkCAFBlackHoleInfoTableLabel = CmmLabel rtsUnitId (fsLit "stg_CAF_BLACKHOLE") CmmInfo +mkArrWords_infoLabel = CmmLabel rtsUnitId (fsLit "stg_ARR_WORDS") CmmInfo +mkSMAP_FROZEN_CLEAN_infoLabel = CmmLabel rtsUnitId (fsLit "stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN") CmmInfo +mkSMAP_FROZEN_DIRTY_infoLabel = CmmLabel rtsUnitId (fsLit "stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY") CmmInfo +mkSMAP_DIRTY_infoLabel = CmmLabel rtsUnitId (fsLit "stg_SMALL_MUT_ARR_PTRS_DIRTY") CmmInfo +mkBadAlignmentLabel = CmmLabel rtsUnitId (fsLit "stg_badAlignment") CmmEntry + +mkSRTInfoLabel :: Int -> CLabel +mkSRTInfoLabel n = CmmLabel rtsUnitId lbl CmmInfo + where + lbl = + case n of + 1 -> fsLit "stg_SRT_1" + 2 -> fsLit "stg_SRT_2" + 3 -> fsLit "stg_SRT_3" + 4 -> fsLit "stg_SRT_4" + 5 -> fsLit "stg_SRT_5" + 6 -> fsLit "stg_SRT_6" + 7 -> fsLit "stg_SRT_7" + 8 -> fsLit "stg_SRT_8" + 9 -> fsLit "stg_SRT_9" + 10 -> fsLit "stg_SRT_10" + 11 -> fsLit "stg_SRT_11" + 12 -> fsLit "stg_SRT_12" + 13 -> fsLit "stg_SRT_13" + 14 -> fsLit "stg_SRT_14" + 15 -> fsLit "stg_SRT_15" + 16 -> fsLit "stg_SRT_16" + _ -> panic "mkSRTInfoLabel" + +----- +mkCmmInfoLabel, mkCmmEntryLabel, mkCmmRetInfoLabel, mkCmmRetLabel, + mkCmmCodeLabel, mkCmmDataLabel, mkCmmClosureLabel + :: UnitId -> FastString -> CLabel + +mkCmmInfoLabel pkg str = CmmLabel pkg str CmmInfo +mkCmmEntryLabel pkg str = CmmLabel pkg str CmmEntry +mkCmmRetInfoLabel pkg str = CmmLabel pkg str CmmRetInfo +mkCmmRetLabel pkg str = CmmLabel pkg str CmmRet +mkCmmCodeLabel pkg str = CmmLabel pkg str CmmCode +mkCmmDataLabel pkg str = CmmLabel pkg str CmmData +mkCmmClosureLabel pkg str = CmmLabel pkg str CmmClosure + +mkLocalBlockLabel :: Unique -> CLabel +mkLocalBlockLabel u = LocalBlockLabel u + +-- Constructing RtsLabels +mkRtsPrimOpLabel :: PrimOp -> CLabel +mkRtsPrimOpLabel primop = RtsLabel (RtsPrimOp primop) + +mkSelectorInfoLabel :: Bool -> Int -> CLabel +mkSelectorEntryLabel :: Bool -> Int -> CLabel +mkSelectorInfoLabel upd off = RtsLabel (RtsSelectorInfoTable upd off) +mkSelectorEntryLabel upd off = RtsLabel (RtsSelectorEntry upd off) + +mkApInfoTableLabel :: Bool -> Int -> CLabel +mkApEntryLabel :: Bool -> Int -> CLabel +mkApInfoTableLabel upd off = RtsLabel (RtsApInfoTable upd 
off) +mkApEntryLabel upd off = RtsLabel (RtsApEntry upd off) + + +-- A call to some primitive hand written Cmm code +mkPrimCallLabel :: PrimCall -> CLabel +mkPrimCallLabel (PrimCall str pkg) + = CmmLabel pkg str CmmPrimCall + + +-- Constructing ForeignLabels + +-- | Make a foreign label +mkForeignLabel + :: FastString -- name + -> Maybe Int -- size prefix + -> ForeignLabelSource -- what package it's in + -> FunctionOrData + -> CLabel + +mkForeignLabel = ForeignLabel + + +-- | Update the label size field in a ForeignLabel +addLabelSize :: CLabel -> Int -> CLabel +addLabelSize (ForeignLabel str _ src fod) sz + = ForeignLabel str (Just sz) src fod +addLabelSize label _ + = label + +-- | Whether label is a top-level string literal +isBytesLabel :: CLabel -> Bool +isBytesLabel (IdLabel _ _ Bytes) = True +isBytesLabel _lbl = False + +-- | Whether label is a non-haskell label (defined in C code) +isForeignLabel :: CLabel -> Bool +isForeignLabel (ForeignLabel _ _ _ _) = True +isForeignLabel _lbl = False + +-- | Whether label is a static closure label (can come from haskell or cmm) +isStaticClosureLabel :: CLabel -> Bool +-- Closure defined in haskell (.hs) +isStaticClosureLabel (IdLabel _ _ Closure) = True +-- Closure defined in cmm +isStaticClosureLabel (CmmLabel _ _ CmmClosure) = True +isStaticClosureLabel _lbl = False + +-- | Whether label is a .rodata label +isSomeRODataLabel :: CLabel -> Bool +-- info table defined in haskell (.hs) +isSomeRODataLabel (IdLabel _ _ ClosureTable) = True +isSomeRODataLabel (IdLabel _ _ ConInfoTable) = True +isSomeRODataLabel (IdLabel _ _ InfoTable) = True +isSomeRODataLabel (IdLabel _ _ LocalInfoTable) = True +isSomeRODataLabel (IdLabel _ _ BlockInfoTable) = True +-- info table defined in cmm (.cmm) +isSomeRODataLabel (CmmLabel _ _ CmmInfo) = True +isSomeRODataLabel _lbl = False + +-- | Whether the label points to some kind of info table +isInfoTableLabel :: CLabel -> Bool +isInfoTableLabel (IdLabel _ _ InfoTable) = True +isInfoTableLabel (IdLabel _ _ LocalInfoTable) = True +isInfoTableLabel (IdLabel _ _ ConInfoTable) = True +isInfoTableLabel (IdLabel _ _ BlockInfoTable) = True +isInfoTableLabel _ = False + +-- | Whether the label points to a constructor info table +isConInfoTableLabel :: CLabel -> Bool +isConInfoTableLabel (IdLabel _ _ ConInfoTable) = True +isConInfoTableLabel _ = False + +-- | Get the label size field from a ForeignLabel +foreignLabelStdcallInfo :: CLabel -> Maybe Int +foreignLabelStdcallInfo (ForeignLabel _ info _ _) = info +foreignLabelStdcallInfo _lbl = Nothing + + +-- Constructing Large*Labels +mkBitmapLabel :: Unique -> CLabel +mkBitmapLabel uniq = LargeBitmapLabel uniq + +-- Constructing Cost Center Labels +mkCCLabel :: CostCentre -> CLabel +mkCCSLabel :: CostCentreStack -> CLabel +mkCCLabel cc = CC_Label cc +mkCCSLabel ccs = CCS_Label ccs + +mkRtsApFastLabel :: FastString -> CLabel +mkRtsApFastLabel str = RtsLabel (RtsApFast str) + +mkRtsSlowFastTickyCtrLabel :: String -> CLabel +mkRtsSlowFastTickyCtrLabel pat = RtsLabel (RtsSlowFastTickyCtr pat) + + +-- Constructing Code Coverage Labels +mkHpcTicksLabel :: Module -> CLabel +mkHpcTicksLabel = HpcTicksLabel + + +-- Constructing labels used for dynamic linking +mkDynamicLinkerLabel :: DynamicLinkerLabelInfo -> CLabel -> CLabel +mkDynamicLinkerLabel = DynamicLinkerLabel + +dynamicLinkerLabelInfo :: CLabel -> Maybe (DynamicLinkerLabelInfo, CLabel) +dynamicLinkerLabelInfo (DynamicLinkerLabel info lbl) = Just (info, lbl) +dynamicLinkerLabelInfo _ = Nothing + +mkPicBaseLabel :: CLabel
+mkPicBaseLabel = PicBaseLabel + + +-- Constructing miscellaneous other labels +mkDeadStripPreventer :: CLabel -> CLabel +mkDeadStripPreventer lbl = DeadStripPreventer lbl + +mkStringLitLabel :: Unique -> CLabel +mkStringLitLabel = StringLitLabel + +mkAsmTempLabel :: Uniquable a => a -> CLabel +mkAsmTempLabel a = AsmTempLabel (getUnique a) + +mkAsmTempDerivedLabel :: CLabel -> FastString -> CLabel +mkAsmTempDerivedLabel = AsmTempDerivedLabel + +mkAsmTempEndLabel :: CLabel -> CLabel +mkAsmTempEndLabel l = mkAsmTempDerivedLabel l (fsLit "_end") + +-- | Construct a label for a DWARF Debug Information Entity (DIE) +-- describing another symbol. +mkAsmTempDieLabel :: CLabel -> CLabel +mkAsmTempDieLabel l = mkAsmTempDerivedLabel l (fsLit "_die") + +-- ----------------------------------------------------------------------------- +-- Convert between different kinds of label + +toClosureLbl :: CLabel -> CLabel +toClosureLbl (IdLabel n c _) = IdLabel n c Closure +toClosureLbl (CmmLabel m str _) = CmmLabel m str CmmClosure +toClosureLbl l = pprPanic "toClosureLbl" (ppr l) + +toSlowEntryLbl :: CLabel -> CLabel +toSlowEntryLbl (IdLabel n _ BlockInfoTable) + = pprPanic "toSlowEntryLbl" (ppr n) +toSlowEntryLbl (IdLabel n c _) = IdLabel n c Slow +toSlowEntryLbl l = pprPanic "toSlowEntryLbl" (ppr l) + +toEntryLbl :: CLabel -> CLabel +toEntryLbl (IdLabel n c LocalInfoTable) = IdLabel n c LocalEntry +toEntryLbl (IdLabel n c ConInfoTable) = IdLabel n c ConEntry +toEntryLbl (IdLabel n _ BlockInfoTable) = mkLocalBlockLabel (nameUnique n) + -- See Note [Proc-point local block entry-point]. +toEntryLbl (IdLabel n c _) = IdLabel n c Entry +toEntryLbl (CmmLabel m str CmmInfo) = CmmLabel m str CmmEntry +toEntryLbl (CmmLabel m str CmmRetInfo) = CmmLabel m str CmmRet +toEntryLbl l = pprPanic "toEntryLbl" (ppr l) + +toInfoLbl :: CLabel -> CLabel +toInfoLbl (IdLabel n c LocalEntry) = IdLabel n c LocalInfoTable +toInfoLbl (IdLabel n c ConEntry) = IdLabel n c ConInfoTable +toInfoLbl (IdLabel n c _) = IdLabel n c InfoTable +toInfoLbl (CmmLabel m str CmmEntry) = CmmLabel m str CmmInfo +toInfoLbl (CmmLabel m str CmmRet) = CmmLabel m str CmmRetInfo +toInfoLbl l = pprPanic "CLabel.toInfoLbl" (ppr l) + +hasHaskellName :: CLabel -> Maybe Name +hasHaskellName (IdLabel n _ _) = Just n +hasHaskellName _ = Nothing + +-- ----------------------------------------------------------------------------- +-- Does a CLabel's referent itself refer to a CAF? +hasCAF :: CLabel -> Bool +hasCAF (IdLabel _ _ RednCounts) = False -- Note [ticky for LNE] +hasCAF (IdLabel _ MayHaveCafRefs _) = True +hasCAF _ = False + +-- Note [ticky for LNE] +-- ~~~~~~~~~~~~~~~~~~~~~ + +-- Until 14 Feb 2013, every ticky counter was associated with a +-- closure. Thus, ticky labels used IdLabel. It is odd that +-- GHC.Cmm.Info.Build.cafTransfers would consider such a ticky label +-- reason to add the name to the CAFEnv (and thus eventually the SRT), +-- but it was harmless because the ticky was only used if the closure +-- was also. +-- +-- Since we now have ticky counters for LNEs, it is no longer the case +-- that every ticky counter has an actual closure. So I changed the +-- generation of ticky counters' CLabels to not result in their +-- associated id ending up in the SRT. +-- +-- NB IdLabel is still appropriate for ticky ids (as opposed to +-- CmmLabel) because the LNE's counter is still related to an .hs Id, +-- that Id just isn't for a proper closure. 
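The toEntryLbl/toInfoLbl conversions above act as near-inverses on an IdLabel's flavour; BlockInfoTable is the deliberate exception, since its entry label is a plain LocalBlockLabel (see Note [Proc-point local block entry-points]). A small self-contained sketch of that round trip, with a simplified hypothetical flavour type standing in for IdLabelInfo:

-- Simplified flavours; BlockInfoTable is omitted because, as toEntryLbl
-- shows, its "entry" form is a LocalBlockLabel rather than another IdLabel.
data Flav = LocalInfo | LocalEntry | ConInfo | ConEntry | Info | Entry
  deriving (Eq, Show)

toEntryFlav :: Flav -> Flav
toEntryFlav LocalInfo = LocalEntry
toEntryFlav ConInfo   = ConEntry
toEntryFlav _         = Entry   -- catch-all, like toEntryLbl's last IdLabel case

toInfoFlav :: Flav -> Flav
toInfoFlav LocalEntry = LocalInfo
toInfoFlav ConEntry   = ConInfo
toInfoFlav _          = Info    -- catch-all, like toInfoLbl's last IdLabel case

-- Round-tripping an info flavour through its entry flavour is the identity:
main :: IO ()
main = print [ (f, toInfoFlav (toEntryFlav f)) | f <- [LocalInfo, ConInfo, Info] ]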
+ +-- ----------------------------------------------------------------------------- +-- Does a CLabel need declaring before use or not? +-- +-- See wiki:commentary/compiler/backends/ppr-c#prototypes + +needsCDecl :: CLabel -> Bool + -- False <=> it's pre-declared; don't bother + -- don't bother declaring Bitmap labels, we always make sure + -- they are defined before use. +needsCDecl (SRTLabel _) = True +needsCDecl (LargeBitmapLabel _) = False +needsCDecl (IdLabel _ _ _) = True +needsCDecl (LocalBlockLabel _) = True + +needsCDecl (StringLitLabel _) = False +needsCDecl (AsmTempLabel _) = False +needsCDecl (AsmTempDerivedLabel _ _) = False +needsCDecl (RtsLabel _) = False + +needsCDecl (CmmLabel pkgId _ _) + -- Prototypes for labels defined in the runtime system are imported + -- into HC files via includes/Stg.h. + | pkgId == rtsUnitId = False + + -- For other labels we inline one into the HC file directly. + | otherwise = True + +needsCDecl l@(ForeignLabel{}) = not (isMathFun l) +needsCDecl (CC_Label _) = True +needsCDecl (CCS_Label _) = True +needsCDecl (HpcTicksLabel _) = True +needsCDecl (DynamicLinkerLabel {}) = panic "needsCDecl DynamicLinkerLabel" +needsCDecl PicBaseLabel = panic "needsCDecl PicBaseLabel" +needsCDecl (DeadStripPreventer {}) = panic "needsCDecl DeadStripPreventer" + +-- | If a label is a local block label then return just its 'BlockId', otherwise +-- 'Nothing'. +maybeLocalBlockLabel :: CLabel -> Maybe BlockId +maybeLocalBlockLabel (LocalBlockLabel uq) = Just $ mkBlockId uq +maybeLocalBlockLabel _ = Nothing + + +-- | Check whether a label corresponds to a C function that has +-- a prototype in a system header somewhere, or is built-in +-- to the C compiler. For these labels we avoid generating our +-- own C prototypes. +isMathFun :: CLabel -> Bool +isMathFun (ForeignLabel fs _ _ _) = fs `elementOfUniqSet` math_funs +isMathFun _ = False + +math_funs :: UniqSet FastString +math_funs = mkUniqSet [ + -- _ISOC99_SOURCE + (fsLit "acos"), (fsLit "acosf"), (fsLit "acosh"), + (fsLit "acoshf"), (fsLit "acoshl"), (fsLit "acosl"), + (fsLit "asin"), (fsLit "asinf"), (fsLit "asinl"), + (fsLit "asinh"), (fsLit "asinhf"), (fsLit "asinhl"), + (fsLit "atan"), (fsLit "atanf"), (fsLit "atanl"), + (fsLit "atan2"), (fsLit "atan2f"), (fsLit "atan2l"), + (fsLit "atanh"), (fsLit "atanhf"), (fsLit "atanhl"), + (fsLit "cbrt"), (fsLit "cbrtf"), (fsLit "cbrtl"), + (fsLit "ceil"), (fsLit "ceilf"), (fsLit "ceill"), + (fsLit "copysign"), (fsLit "copysignf"), (fsLit "copysignl"), + (fsLit "cos"), (fsLit "cosf"), (fsLit "cosl"), + (fsLit "cosh"), (fsLit "coshf"), (fsLit "coshl"), + (fsLit "erf"), (fsLit "erff"), (fsLit "erfl"), + (fsLit "erfc"), (fsLit "erfcf"), (fsLit "erfcl"), + (fsLit "exp"), (fsLit "expf"), (fsLit "expl"), + (fsLit "exp2"), (fsLit "exp2f"), (fsLit "exp2l"), + (fsLit "expm1"), (fsLit "expm1f"), (fsLit "expm1l"), + (fsLit "fabs"), (fsLit "fabsf"), (fsLit "fabsl"), + (fsLit "fdim"), (fsLit "fdimf"), (fsLit "fdiml"), + (fsLit "floor"), (fsLit "floorf"), (fsLit "floorl"), + (fsLit "fma"), (fsLit "fmaf"), (fsLit "fmal"), + (fsLit "fmax"), (fsLit "fmaxf"), (fsLit "fmaxl"), + (fsLit "fmin"), (fsLit "fminf"), (fsLit "fminl"), + (fsLit "fmod"), (fsLit "fmodf"), (fsLit "fmodl"), + (fsLit "frexp"), (fsLit "frexpf"), (fsLit "frexpl"), + (fsLit "hypot"), (fsLit "hypotf"), (fsLit "hypotl"), + (fsLit "ilogb"), (fsLit "ilogbf"), (fsLit "ilogbl"), + (fsLit "ldexp"), (fsLit "ldexpf"), (fsLit "ldexpl"), + (fsLit "lgamma"), (fsLit "lgammaf"), (fsLit "lgammal"), + (fsLit "llrint"), (fsLit "llrintf"), 
(fsLit "llrintl"), + (fsLit "llround"), (fsLit "llroundf"), (fsLit "llroundl"), + (fsLit "log"), (fsLit "logf"), (fsLit "logl"), + (fsLit "log10l"), (fsLit "log10"), (fsLit "log10f"), + (fsLit "log1pl"), (fsLit "log1p"), (fsLit "log1pf"), + (fsLit "log2"), (fsLit "log2f"), (fsLit "log2l"), + (fsLit "logb"), (fsLit "logbf"), (fsLit "logbl"), + (fsLit "lrint"), (fsLit "lrintf"), (fsLit "lrintl"), + (fsLit "lround"), (fsLit "lroundf"), (fsLit "lroundl"), + (fsLit "modf"), (fsLit "modff"), (fsLit "modfl"), + (fsLit "nan"), (fsLit "nanf"), (fsLit "nanl"), + (fsLit "nearbyint"), (fsLit "nearbyintf"), (fsLit "nearbyintl"), + (fsLit "nextafter"), (fsLit "nextafterf"), (fsLit "nextafterl"), + (fsLit "nexttoward"), (fsLit "nexttowardf"), (fsLit "nexttowardl"), + (fsLit "pow"), (fsLit "powf"), (fsLit "powl"), + (fsLit "remainder"), (fsLit "remainderf"), (fsLit "remainderl"), + (fsLit "remquo"), (fsLit "remquof"), (fsLit "remquol"), + (fsLit "rint"), (fsLit "rintf"), (fsLit "rintl"), + (fsLit "round"), (fsLit "roundf"), (fsLit "roundl"), + (fsLit "scalbln"), (fsLit "scalblnf"), (fsLit "scalblnl"), + (fsLit "scalbn"), (fsLit "scalbnf"), (fsLit "scalbnl"), + (fsLit "sin"), (fsLit "sinf"), (fsLit "sinl"), + (fsLit "sinh"), (fsLit "sinhf"), (fsLit "sinhl"), + (fsLit "sqrt"), (fsLit "sqrtf"), (fsLit "sqrtl"), + (fsLit "tan"), (fsLit "tanf"), (fsLit "tanl"), + (fsLit "tanh"), (fsLit "tanhf"), (fsLit "tanhl"), + (fsLit "tgamma"), (fsLit "tgammaf"), (fsLit "tgammal"), + (fsLit "trunc"), (fsLit "truncf"), (fsLit "truncl"), + -- ISO C 99 also defines these function-like macros in math.h: + -- fpclassify, isfinite, isinf, isnormal, signbit, isgreater, + -- isgreaterequal, isless, islessequal, islessgreater, isunordered + + -- additional symbols from _BSD_SOURCE + (fsLit "drem"), (fsLit "dremf"), (fsLit "dreml"), + (fsLit "finite"), (fsLit "finitef"), (fsLit "finitel"), + (fsLit "gamma"), (fsLit "gammaf"), (fsLit "gammal"), + (fsLit "isinf"), (fsLit "isinff"), (fsLit "isinfl"), + (fsLit "isnan"), (fsLit "isnanf"), (fsLit "isnanl"), + (fsLit "j0"), (fsLit "j0f"), (fsLit "j0l"), + (fsLit "j1"), (fsLit "j1f"), (fsLit "j1l"), + (fsLit "jn"), (fsLit "jnf"), (fsLit "jnl"), + (fsLit "lgamma_r"), (fsLit "lgammaf_r"), (fsLit "lgammal_r"), + (fsLit "scalb"), (fsLit "scalbf"), (fsLit "scalbl"), + (fsLit "significand"), (fsLit "significandf"), (fsLit "significandl"), + (fsLit "y0"), (fsLit "y0f"), (fsLit "y0l"), + (fsLit "y1"), (fsLit "y1f"), (fsLit "y1l"), + (fsLit "yn"), (fsLit "ynf"), (fsLit "ynl"), + + -- These functions are described in IEEE Std 754-2008 - + -- Standard for Floating-Point Arithmetic and ISO/IEC TS 18661 + (fsLit "nextup"), (fsLit "nextupf"), (fsLit "nextupl"), + (fsLit "nextdown"), (fsLit "nextdownf"), (fsLit "nextdownl") + ] + +-- ----------------------------------------------------------------------------- +-- | Is a CLabel visible outside this object file or not? +-- From the point of view of the code generator, a name is +-- externally visible if it has to be declared as exported +-- in the .o file's symbol table; that is, made non-static. 
+externallyVisibleCLabel :: CLabel -> Bool -- not C "static" +externallyVisibleCLabel (StringLitLabel _) = False +externallyVisibleCLabel (AsmTempLabel _) = False +externallyVisibleCLabel (AsmTempDerivedLabel _ _)= False +externallyVisibleCLabel (RtsLabel _) = True +externallyVisibleCLabel (LocalBlockLabel _) = False +externallyVisibleCLabel (CmmLabel _ _ _) = True +externallyVisibleCLabel (ForeignLabel{}) = True +externallyVisibleCLabel (IdLabel name _ info) = isExternalName name && externallyVisibleIdLabel info +externallyVisibleCLabel (CC_Label _) = True +externallyVisibleCLabel (CCS_Label _) = True +externallyVisibleCLabel (DynamicLinkerLabel _ _) = False +externallyVisibleCLabel (HpcTicksLabel _) = True +externallyVisibleCLabel (LargeBitmapLabel _) = False +externallyVisibleCLabel (SRTLabel _) = False +externallyVisibleCLabel (PicBaseLabel {}) = panic "externallyVisibleCLabel PicBaseLabel" +externallyVisibleCLabel (DeadStripPreventer {}) = panic "externallyVisibleCLabel DeadStripPreventer" + +externallyVisibleIdLabel :: IdLabelInfo -> Bool +externallyVisibleIdLabel LocalInfoTable = False +externallyVisibleIdLabel LocalEntry = False +externallyVisibleIdLabel BlockInfoTable = False +externallyVisibleIdLabel _ = True + +-- ----------------------------------------------------------------------------- +-- Finding the "type" of a CLabel + +-- For generating correct types in label declarations: + +data CLabelType + = CodeLabel -- Address of some executable instructions + | DataLabel -- Address of data, not a GC ptr + | GcPtrLabel -- Address of a (presumably static) GC object + +isCFunctionLabel :: CLabel -> Bool +isCFunctionLabel lbl = case labelType lbl of + CodeLabel -> True + _other -> False + +isGcPtrLabel :: CLabel -> Bool +isGcPtrLabel lbl = case labelType lbl of + GcPtrLabel -> True + _other -> False + + +-- | Work out the general type of data at the address of this label +-- whether it be code, data, or static GC object. +labelType :: CLabel -> CLabelType +labelType (IdLabel _ _ info) = idInfoLabelType info +labelType (CmmLabel _ _ CmmData) = DataLabel +labelType (CmmLabel _ _ CmmClosure) = GcPtrLabel +labelType (CmmLabel _ _ CmmCode) = CodeLabel +labelType (CmmLabel _ _ CmmInfo) = DataLabel +labelType (CmmLabel _ _ CmmEntry) = CodeLabel +labelType (CmmLabel _ _ CmmPrimCall) = CodeLabel +labelType (CmmLabel _ _ CmmRetInfo) = DataLabel +labelType (CmmLabel _ _ CmmRet) = CodeLabel +labelType (RtsLabel (RtsSelectorInfoTable _ _)) = DataLabel +labelType (RtsLabel (RtsApInfoTable _ _)) = DataLabel +labelType (RtsLabel (RtsApFast _)) = CodeLabel +labelType (RtsLabel _) = DataLabel +labelType (LocalBlockLabel _) = CodeLabel +labelType (SRTLabel _) = DataLabel +labelType (ForeignLabel _ _ _ IsFunction) = CodeLabel +labelType (ForeignLabel _ _ _ IsData) = DataLabel +labelType (AsmTempLabel _) = panic "labelType(AsmTempLabel)" +labelType (AsmTempDerivedLabel _ _) = panic "labelType(AsmTempDerivedLabel)" +labelType (StringLitLabel _) = DataLabel +labelType (CC_Label _) = DataLabel +labelType (CCS_Label _) = DataLabel +labelType (DynamicLinkerLabel _ _) = DataLabel -- Is this right? 
+labelType PicBaseLabel = DataLabel +labelType (DeadStripPreventer _) = DataLabel +labelType (HpcTicksLabel _) = DataLabel +labelType (LargeBitmapLabel _) = DataLabel + +idInfoLabelType :: IdLabelInfo -> CLabelType +idInfoLabelType info = + case info of + InfoTable -> DataLabel + LocalInfoTable -> DataLabel + BlockInfoTable -> DataLabel + Closure -> GcPtrLabel + ConInfoTable -> DataLabel + ClosureTable -> DataLabel + RednCounts -> DataLabel + Bytes -> DataLabel + _ -> CodeLabel + + +-- ----------------------------------------------------------------------------- + +-- | Is a 'CLabel' defined in the current module being compiled? +-- +-- Sometimes we can optimise references within a compilation unit in ways that +-- we couldn't for inter-module references. This provides a conservative +-- estimate of whether a 'CLabel' lives in the current module. +isLocalCLabel :: Module -> CLabel -> Bool +isLocalCLabel this_mod lbl = + case lbl of + IdLabel name _ _ + | isInternalName name -> True + | otherwise -> nameModule name == this_mod + LocalBlockLabel _ -> True + _ -> False + +-- ----------------------------------------------------------------------------- + +-- | Does a 'CLabel' need dynamic linkage? +-- +-- When referring to data in code, we need to know whether +-- that data resides in a DLL or not. [Win32 only.] +-- @labelDynamic@ returns @True@ if the label is located +-- in a DLL, be it a data reference or not. +labelDynamic :: DynFlags -> Module -> CLabel -> Bool +labelDynamic dflags this_mod lbl = + case lbl of + -- is the RTS in a DLL or not? + RtsLabel _ -> + externalDynamicRefs && (this_pkg /= rtsUnitId) + + IdLabel n _ _ -> + isDllName dflags this_mod n + + -- When compiling in the "dyn" way, each package is to be linked into + -- its own shared library. + CmmLabel pkg _ _ + | os == OSMinGW32 -> + externalDynamicRefs && (this_pkg /= pkg) + | otherwise -> + gopt Opt_ExternalDynamicRefs dflags + + LocalBlockLabel _ -> False + + ForeignLabel _ _ source _ -> + if os == OSMinGW32 + then case source of + -- Foreign label is in some un-named foreign package (or DLL). + ForeignLabelInExternalPackage -> True + + -- Foreign label is linked into the same package as the + -- source file currently being compiled. + ForeignLabelInThisPackage -> False + + -- Foreign label is in some named package. + -- When compiling in the "dyn" way, each package is to be + -- linked into its own DLL. + ForeignLabelInPackage pkgId -> + externalDynamicRefs && (this_pkg /= pkgId) + + else -- On Mac OS X and on ELF platforms, false positives are OK, + -- so we claim that all foreign imports come from dynamic + -- libraries + True + + CC_Label cc -> + externalDynamicRefs && not (ccFromThisModule cc this_mod) + + -- CCS_Label always contains a CostCentre defined in the current module + CCS_Label _ -> False + + HpcTicksLabel m -> + externalDynamicRefs && this_mod /= m + + -- Note that DynamicLinkerLabels do NOT require dynamic linking themselves. + _ -> False + where + externalDynamicRefs = gopt Opt_ExternalDynamicRefs dflags + os = platformOS (targetPlatform dflags) + this_pkg = moduleUnitId this_mod + + +----------------------------------------------------------------------------- +-- Printing out CLabels. + +{- +Convention: + + <name>_<type> + +where <name> is <Module>_<name> for external names and <unique> for +internal names. 
<type> is one of the following: + + info Info table + srt Static reference table + entry Entry code (function, closure) + slow Slow entry code (if any) + ret Direct return address + vtbl Vector table + <n>_alt Case alternative (tag n) + dflt Default case alternative + btm Large bitmap vector + closure Static closure + con_entry Dynamic Constructor entry code + con_info Dynamic Constructor info table + static_entry Static Constructor entry code + static_info Static Constructor info table + sel_info Selector info table + sel_entry Selector entry code + cc Cost centre + ccs Cost centre stack + +Many of these distinctions are only for documentation reasons. For +example, _ret is only distinguished from _entry to make it easy to +tell whether a code fragment is a return point or a closure/function +entry. + +Note [Closure and info labels] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For a function 'foo, we have: + foo_info : Points to the info table describing foo's closure + (and entry code for foo with tables next to code) + foo_closure : Static (no-free-var) closure only: + points to the statically-allocated closure + +For a data constructor (such as Just or Nothing), we have: + Just_con_info: Info table for the data constructor itself + the first word of a heap-allocated Just + Just_info: Info table for the *worker function*, an + ordinary Haskell function of arity 1 that + allocates a (Just x) box: + Just = \x -> Just x + Just_closure: The closure for this worker + + Nothing_closure: a statically allocated closure for Nothing + Nothing_static_info: info table for Nothing_closure + +All these must be exported symbol, EXCEPT Just_info. We don't need to +export this because in other modules we either have + * A reference to 'Just'; use Just_closure + * A saturated call 'Just x'; allocate using Just_con_info +Not exporting these Just_info labels reduces the number of symbols +somewhat. + +Note [Bytes label] +~~~~~~~~~~~~~~~~~~ +For a top-level string literal 'foo', we have just one symbol 'foo_bytes', which +points to a static data block containing the content of the literal. + +Note [Proc-point local block entry-points] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A label for a proc-point local block entry-point has no "_entry" suffix. With +`infoTblLbl` we derive an info table label from a proc-point block ID. If +we convert such an info table label into an entry label we must produce +the label without an "_entry" suffix. So an info table label records +the fact that it was derived from a block ID in `IdLabelInfo` as +`BlockInfoTable`. + +The info table label and the local block label are both local labels +and are not externally visible. 
+-} + +instance Outputable CLabel where + ppr c = sdocWithDynFlags $ \dynFlags -> pprCLabel dynFlags c + +pprCLabel :: DynFlags -> CLabel -> SDoc + +pprCLabel _ (LocalBlockLabel u) + = tempLabelPrefixOrUnderscore <> pprUniqueAlways u + +pprCLabel dynFlags (AsmTempLabel u) + | not (platformUnregisterised $ targetPlatform dynFlags) + = tempLabelPrefixOrUnderscore <> pprUniqueAlways u + +pprCLabel dynFlags (AsmTempDerivedLabel l suf) + | platformMisc_ghcWithNativeCodeGen $ platformMisc dynFlags + = ptext (asmTempLabelPrefix $ targetPlatform dynFlags) + <> case l of AsmTempLabel u -> pprUniqueAlways u + LocalBlockLabel u -> pprUniqueAlways u + _other -> pprCLabel dynFlags l + <> ftext suf + +pprCLabel dynFlags (DynamicLinkerLabel info lbl) + | platformMisc_ghcWithNativeCodeGen $ platformMisc dynFlags + = pprDynamicLinkerAsmLabel (targetPlatform dynFlags) info lbl + +pprCLabel dynFlags PicBaseLabel + | platformMisc_ghcWithNativeCodeGen $ platformMisc dynFlags + = text "1b" + +pprCLabel dynFlags (DeadStripPreventer lbl) + | platformMisc_ghcWithNativeCodeGen $ platformMisc dynFlags + = + {- + `lbl` can be temp one but we need to ensure that dsp label will stay + in the final binary so we prepend non-temp prefix ("dsp_") and + optional `_` (underscore) because this is how you mark non-temp symbols + on some platforms (Darwin) + -} + maybe_underscore dynFlags $ text "dsp_" + <> pprCLabel dynFlags lbl <> text "_dsp" + +pprCLabel dynFlags (StringLitLabel u) + | platformMisc_ghcWithNativeCodeGen $ platformMisc dynFlags + = pprUniqueAlways u <> ptext (sLit "_str") + +pprCLabel dynFlags lbl + = getPprStyle $ \ sty -> + if platformMisc_ghcWithNativeCodeGen (platformMisc dynFlags) && asmStyle sty + then maybe_underscore dynFlags $ pprAsmCLbl (targetPlatform dynFlags) lbl + else pprCLbl lbl + +maybe_underscore :: DynFlags -> SDoc -> SDoc +maybe_underscore dynFlags doc = + if platformMisc_leadingUnderscore $ platformMisc dynFlags + then pp_cSEP <> doc + else doc + +pprAsmCLbl :: Platform -> CLabel -> SDoc +pprAsmCLbl platform (ForeignLabel fs (Just sz) _ _) + | platformOS platform == OSMinGW32 + -- In asm mode, we need to put the suffix on a stdcall ForeignLabel. + -- (The C compiler does this itself). + = ftext fs <> char '@' <> int sz +pprAsmCLbl _ lbl + = pprCLbl lbl + +pprCLbl :: CLabel -> SDoc +pprCLbl (StringLitLabel u) + = pprUniqueAlways u <> text "_str" + +pprCLbl (SRTLabel u) + = tempLabelPrefixOrUnderscore <> pprUniqueAlways u <> pp_cSEP <> text "srt" + +pprCLbl (LargeBitmapLabel u) = + tempLabelPrefixOrUnderscore + <> char 'b' <> pprUniqueAlways u <> pp_cSEP <> text "btm" +-- Some bitsmaps for tuple constructors have a numeric tag (e.g. '7') +-- until that gets resolved we'll just force them to start +-- with a letter so the label will be legal assembly code. 
+ + +pprCLbl (CmmLabel _ str CmmCode) = ftext str +pprCLbl (CmmLabel _ str CmmData) = ftext str +pprCLbl (CmmLabel _ str CmmPrimCall) = ftext str + +pprCLbl (LocalBlockLabel u) = + tempLabelPrefixOrUnderscore <> text "blk_" <> pprUniqueAlways u + +pprCLbl (RtsLabel (RtsApFast str)) = ftext str <> text "_fast" + +pprCLbl (RtsLabel (RtsSelectorInfoTable upd_reqd offset)) + = sdocWithDynFlags $ \dflags -> + ASSERT(offset >= 0 && offset <= mAX_SPEC_SELECTEE_SIZE dflags) + hcat [text "stg_sel_", text (show offset), + ptext (if upd_reqd + then (sLit "_upd_info") + else (sLit "_noupd_info")) + ] + +pprCLbl (RtsLabel (RtsSelectorEntry upd_reqd offset)) + = sdocWithDynFlags $ \dflags -> + ASSERT(offset >= 0 && offset <= mAX_SPEC_SELECTEE_SIZE dflags) + hcat [text "stg_sel_", text (show offset), + ptext (if upd_reqd + then (sLit "_upd_entry") + else (sLit "_noupd_entry")) + ] + +pprCLbl (RtsLabel (RtsApInfoTable upd_reqd arity)) + = sdocWithDynFlags $ \dflags -> + ASSERT(arity > 0 && arity <= mAX_SPEC_AP_SIZE dflags) + hcat [text "stg_ap_", text (show arity), + ptext (if upd_reqd + then (sLit "_upd_info") + else (sLit "_noupd_info")) + ] + +pprCLbl (RtsLabel (RtsApEntry upd_reqd arity)) + = sdocWithDynFlags $ \dflags -> + ASSERT(arity > 0 && arity <= mAX_SPEC_AP_SIZE dflags) + hcat [text "stg_ap_", text (show arity), + ptext (if upd_reqd + then (sLit "_upd_entry") + else (sLit "_noupd_entry")) + ] + +pprCLbl (CmmLabel _ fs CmmInfo) + = ftext fs <> text "_info" + +pprCLbl (CmmLabel _ fs CmmEntry) + = ftext fs <> text "_entry" + +pprCLbl (CmmLabel _ fs CmmRetInfo) + = ftext fs <> text "_info" + +pprCLbl (CmmLabel _ fs CmmRet) + = ftext fs <> text "_ret" + +pprCLbl (CmmLabel _ fs CmmClosure) + = ftext fs <> text "_closure" + +pprCLbl (RtsLabel (RtsPrimOp primop)) + = text "stg_" <> ppr primop + +pprCLbl (RtsLabel (RtsSlowFastTickyCtr pat)) + = text "SLOW_CALL_fast_" <> text pat <> ptext (sLit "_ctr") + +pprCLbl (ForeignLabel str _ _ _) + = ftext str + +pprCLbl (IdLabel name _cafs flavor) = + internalNamePrefix name <> ppr name <> ppIdFlavor flavor + +pprCLbl (CC_Label cc) = ppr cc +pprCLbl (CCS_Label ccs) = ppr ccs + +pprCLbl (HpcTicksLabel mod) + = text "_hpc_tickboxes_" <> ppr mod <> ptext (sLit "_hpc") + +pprCLbl (AsmTempLabel {}) = panic "pprCLbl AsmTempLabel" +pprCLbl (AsmTempDerivedLabel {})= panic "pprCLbl AsmTempDerivedLabel" +pprCLbl (DynamicLinkerLabel {}) = panic "pprCLbl DynamicLinkerLabel" +pprCLbl (PicBaseLabel {}) = panic "pprCLbl PicBaseLabel" +pprCLbl (DeadStripPreventer {}) = panic "pprCLbl DeadStripPreventer" + +ppIdFlavor :: IdLabelInfo -> SDoc +ppIdFlavor x = pp_cSEP <> text + (case x of + Closure -> "closure" + InfoTable -> "info" + LocalInfoTable -> "info" + Entry -> "entry" + LocalEntry -> "entry" + Slow -> "slow" + RednCounts -> "ct" + ConEntry -> "con_entry" + ConInfoTable -> "con_info" + ClosureTable -> "closure_tbl" + Bytes -> "bytes" + BlockInfoTable -> "info" + ) + + +pp_cSEP :: SDoc +pp_cSEP = char '_' + + +instance Outputable ForeignLabelSource where + ppr fs + = case fs of + ForeignLabelInPackage pkgId -> parens $ text "package: " <> ppr pkgId + ForeignLabelInThisPackage -> parens $ text "this package" + ForeignLabelInExternalPackage -> parens $ text "external package" + +internalNamePrefix :: Name -> SDoc +internalNamePrefix name = getPprStyle $ \ sty -> + if asmStyle sty && isRandomGenerated then + sdocWithPlatform $ \platform -> + ptext (asmTempLabelPrefix platform) + else + empty + where + isRandomGenerated = not $ isExternalName name + +tempLabelPrefixOrUnderscore 
:: SDoc +tempLabelPrefixOrUnderscore = sdocWithPlatform $ \platform -> + getPprStyle $ \ sty -> + if asmStyle sty then + ptext (asmTempLabelPrefix platform) + else + char '_' + +-- ----------------------------------------------------------------------------- +-- Machine-dependent knowledge about labels. + +asmTempLabelPrefix :: Platform -> PtrString -- for formatting labels +asmTempLabelPrefix platform = case platformOS platform of + OSDarwin -> sLit "L" + OSAIX -> sLit "__L" -- follow IBM XL C's convention + _ -> sLit ".L" + +pprDynamicLinkerAsmLabel :: Platform -> DynamicLinkerLabelInfo -> CLabel -> SDoc +pprDynamicLinkerAsmLabel platform dllInfo lbl = + case platformOS platform of + OSDarwin + | platformArch platform == ArchX86_64 -> + case dllInfo of + CodeStub -> char 'L' <> ppr lbl <> text "$stub" + SymbolPtr -> char 'L' <> ppr lbl <> text "$non_lazy_ptr" + GotSymbolPtr -> ppr lbl <> text "@GOTPCREL" + GotSymbolOffset -> ppr lbl + | otherwise -> + case dllInfo of + CodeStub -> char 'L' <> ppr lbl <> text "$stub" + SymbolPtr -> char 'L' <> ppr lbl <> text "$non_lazy_ptr" + _ -> panic "pprDynamicLinkerAsmLabel" + + OSAIX -> + case dllInfo of + SymbolPtr -> text "LC.." <> ppr lbl -- GCC's naming convention + _ -> panic "pprDynamicLinkerAsmLabel" + + _ | osElfTarget (platformOS platform) -> elfLabel + + OSMinGW32 -> + case dllInfo of + SymbolPtr -> text "__imp_" <> ppr lbl + _ -> panic "pprDynamicLinkerAsmLabel" + + _ -> panic "pprDynamicLinkerAsmLabel" + where + elfLabel + | platformArch platform == ArchPPC + = case dllInfo of + CodeStub -> -- See Note [.LCTOC1 in PPC PIC code] + ppr lbl <> text "+32768@plt" + SymbolPtr -> text ".LC_" <> ppr lbl + _ -> panic "pprDynamicLinkerAsmLabel" + + | platformArch platform == ArchX86_64 + = case dllInfo of + CodeStub -> ppr lbl <> text "@plt" + GotSymbolPtr -> ppr lbl <> text "@gotpcrel" + GotSymbolOffset -> ppr lbl + SymbolPtr -> text ".LC_" <> ppr lbl + + | platformArch platform == ArchPPC_64 ELF_V1 + || platformArch platform == ArchPPC_64 ELF_V2 + = case dllInfo of + GotSymbolPtr -> text ".LC_" <> ppr lbl + <> text "@toc" + GotSymbolOffset -> ppr lbl + SymbolPtr -> text ".LC_" <> ppr lbl + _ -> panic "pprDynamicLinkerAsmLabel" + + | otherwise + = case dllInfo of + CodeStub -> ppr lbl <> text "@plt" + SymbolPtr -> text ".LC_" <> ppr lbl + GotSymbolPtr -> ppr lbl <> text "@got" + GotSymbolOffset -> ppr lbl <> text "@gotoff" + +-- Figure out whether `symbol` may serve as an alias +-- to `target` within one compilation unit. +-- +-- This is true if any of these holds: +-- * `target` is a module-internal haskell name. +-- * `target` is an exported name, but comes from the same +-- module as `symbol` +-- +-- These are sufficient conditions for establishing e.g. a +-- GNU assembly alias ('.equiv' directive). Sadly, there is +-- no such thing as an alias to an imported symbol (conf. +-- http://blog.omega-prime.co.uk/2011/07/06/the-sad-state-of-symbol-aliases/) +-- See note [emit-time elimination of static indirections]. +-- +-- Precondition is that both labels represent the +-- same semantic value. 
+ +mayRedirectTo :: CLabel -> CLabel -> Bool +mayRedirectTo symbol target + | Just nam <- haskellName + , staticClosureLabel + , isExternalName nam + , Just mod <- nameModule_maybe nam + , Just anam <- hasHaskellName symbol + , Just amod <- nameModule_maybe anam + = amod == mod + + | Just nam <- haskellName + , staticClosureLabel + , isInternalName nam + = True + + | otherwise = False + where staticClosureLabel = isStaticClosureLabel target + haskellName = hasHaskellName target + + +{- +Note [emit-time elimination of static indirections] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +As described in #15155, certain static values are representationally +equivalent, e.g. 'cast'ed values (when created by 'newtype' wrappers). + + newtype A = A Int + {-# NOINLINE a #-} + a = A 42 + +a1_rYB :: Int +[GblId, Caf=NoCafRefs, Unf=OtherCon []] +a1_rYB = GHC.Types.I# 42# + +a [InlPrag=NOINLINE] :: A +[GblId, Unf=OtherCon []] +a = a1_rYB `cast` (Sym (T15155.N:A[0]) :: Int ~R# A) + +Formerly we created static indirections for these (IND_STATIC), which +consist of a statically allocated forwarding closure that contains +the (possibly tagged) indirectee. (See CMM/assembly below.) +This approach is suboptimal for two reasons: + (a) they occupy extra space, + (b) they need to be entered in order to obtain the indirectee, + thus they cannot be tagged. + +Fortunately there is a common case where static indirections can be +eliminated while emitting assembly (native or LLVM), viz. when the +indirectee is in the same module (object file) as the symbol that +points to it. In this case an assembly-level identification can +be created ('.equiv' directive), and as such the same object will +be assigned two names in the symbol table. Any of the identified +symbols can be referenced by a tagged pointer. + +Currently the 'mayRedirectTo' predicate will +give a clue whether a label can be equated with another, already +emitted, label (which can in turn be an alias). The general mechanics +is that we identify data (IND_STATIC closures) that are amenable +to aliasing while pretty-printing of assembly output, and emit the +'.equiv' directive instead of static data in such a case. + +Here is a sketch how the output is massaged: + + Consider +newtype A = A Int +{-# NOINLINE a #-} +a = A 42 -- I# 42# is the indirectee + -- 'a' is exported + + results in STG + +a1_rXq :: GHC.Types.Int +[GblId, Caf=NoCafRefs, Unf=OtherCon []] = + CCS_DONT_CARE GHC.Types.I#! [42#]; + +T15155.a [InlPrag=NOINLINE] :: T15155.A +[GblId, Unf=OtherCon []] = + CAF_ccs \ u [] a1_rXq; + + and CMM + +[section ""data" . a1_rXq_closure" { + a1_rXq_closure: + const GHC.Types.I#_con_info; + const 42; + }] + +[section ""data" . T15155.a_closure" { + T15155.a_closure: + const stg_IND_STATIC_info; + const a1_rXq_closure+1; + const 0; + const 0; + }] + +The emitted assembly is + +#### INDIRECTEE +a1_rXq_closure: -- module local haskell value + .quad GHC.Types.I#_con_info -- an Int + .quad 42 + +#### BEFORE +.globl T15155.a_closure -- exported newtype wrapped value +T15155.a_closure: + .quad stg_IND_STATIC_info -- the closure info + .quad a1_rXq_closure+1 -- indirectee ('+1' being the tag) + .quad 0 + .quad 0 + +#### AFTER +.globl T15155.a_closure -- exported newtype wrapped value +.equiv a1_rXq_closure,T15155.a_closure -- both are shared + +The transformation is performed because + T15155.a_closure `mayRedirectTo` a1_rXq_closure+1 +returns True. 
+-} diff --git a/compiler/GHC/Cmm/CallConv.hs b/compiler/GHC/Cmm/CallConv.hs new file mode 100644 index 0000000000..9200daec57 --- /dev/null +++ b/compiler/GHC/Cmm/CallConv.hs @@ -0,0 +1,212 @@ +module GHC.Cmm.CallConv ( + ParamLocation(..), + assignArgumentsPos, + assignStack, + realArgRegsCover +) where + +import GhcPrelude + +import GHC.Cmm.Expr +import GHC.Runtime.Layout +import GHC.Cmm (Convention(..)) +import GHC.Cmm.Ppr () -- For Outputable instances + +import DynFlags +import GHC.Platform +import Outputable + +-- Calculate the 'GlobalReg' or stack locations for function call +-- parameters as used by the Cmm calling convention. + +data ParamLocation + = RegisterParam GlobalReg + | StackParam ByteOff + +instance Outputable ParamLocation where + ppr (RegisterParam g) = ppr g + ppr (StackParam p) = ppr p + +-- | +-- Given a list of arguments, and a function that tells their types, +-- return a list showing where each argument is passed +-- +assignArgumentsPos :: DynFlags + -> ByteOff -- stack offset to start with + -> Convention + -> (a -> CmmType) -- how to get a type from an arg + -> [a] -- args + -> ( + ByteOff -- bytes of stack args + , [(a, ParamLocation)] -- args and locations + ) + +assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments) + where + regs = case (reps, conv) of + (_, NativeNodeCall) -> getRegsWithNode dflags + (_, NativeDirectCall) -> getRegsWithoutNode dflags + ([_], NativeReturn) -> allRegs dflags + (_, NativeReturn) -> getRegsWithNode dflags + -- GC calling convention *must* put values in registers + (_, GC) -> allRegs dflags + (_, Slow) -> nodeOnly + -- The calling conventions first assign arguments to registers, + -- then switch to the stack when we first run out of registers + -- (even if there are still available registers for args of a + -- different type). When returning an unboxed tuple, we also + -- separate the stack arguments by pointerhood. 
+ (reg_assts, stk_args) = assign_regs [] reps regs + (stk_off, stk_assts) = assignStack dflags off arg_ty stk_args + assignments = reg_assts ++ stk_assts + + assign_regs assts [] _ = (assts, []) + assign_regs assts (r:rs) regs | isVecType ty = vec + | isFloatType ty = float + | otherwise = int + where vec = case (w, regs) of + (W128, (vs, fs, ds, ls, s:ss)) + | passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss)) + (W256, (vs, fs, ds, ls, s:ss)) + | passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss)) + (W512, (vs, fs, ds, ls, s:ss)) + | passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss)) + _ -> (assts, (r:rs)) + float = case (w, regs) of + (W32, (vs, fs, ds, ls, s:ss)) + | passFloatInXmm -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss)) + (W32, (vs, f:fs, ds, ls, ss)) + | not passFloatInXmm -> k (RegisterParam f, (vs, fs, ds, ls, ss)) + (W64, (vs, fs, ds, ls, s:ss)) + | passFloatInXmm -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss)) + (W64, (vs, fs, d:ds, ls, ss)) + | not passFloatInXmm -> k (RegisterParam d, (vs, fs, ds, ls, ss)) + _ -> (assts, (r:rs)) + int = case (w, regs) of + (W128, _) -> panic "W128 unsupported register type" + (_, (v:vs, fs, ds, ls, ss)) | widthInBits w <= widthInBits (wordWidth dflags) + -> k (RegisterParam (v gcp), (vs, fs, ds, ls, ss)) + (_, (vs, fs, ds, l:ls, ss)) | widthInBits w > widthInBits (wordWidth dflags) + -> k (RegisterParam l, (vs, fs, ds, ls, ss)) + _ -> (assts, (r:rs)) + k (asst, regs') = assign_regs ((r, asst) : assts) rs regs' + ty = arg_ty r + w = typeWidth ty + gcp | isGcPtrType ty = VGcPtr + | otherwise = VNonGcPtr + passFloatInXmm = passFloatArgsInXmm dflags + +passFloatArgsInXmm :: DynFlags -> Bool +passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of + ArchX86_64 -> True + ArchX86 -> False + _ -> False + +-- We used to spill vector registers to the stack since the LLVM backend didn't +-- support vector registers in its calling convention. However, this has now +-- been fixed. This function remains only as a convenient way to re-enable +-- spilling when debugging code generation. +passVectorInReg :: Width -> DynFlags -> Bool +passVectorInReg _ _ = True + +assignStack :: DynFlags -> ByteOff -> (a -> CmmType) -> [a] + -> ( + ByteOff -- bytes of stack args + , [(a, ParamLocation)] -- args and locations + ) +assignStack dflags offset arg_ty args = assign_stk offset [] (reverse args) + where + assign_stk offset assts [] = (offset, assts) + assign_stk offset assts (r:rs) + = assign_stk off' ((r, StackParam off') : assts) rs + where w = typeWidth (arg_ty r) + off' = offset + size + -- Stack arguments always take a whole number of words, we never + -- pack them unlike constructor fields. + size = roundUpToWords dflags (widthInBytes w) + +----------------------------------------------------------------------------- +-- Local information about the registers available + +type AvailRegs = ( [VGcPtr -> GlobalReg] -- available vanilla regs. + , [GlobalReg] -- floats + , [GlobalReg] -- doubles + , [GlobalReg] -- longs (int64 and word64) + , [Int] -- XMM (floats and doubles) + ) + +-- Vanilla registers can contain pointers, Ints, Chars. +-- Floats and doubles have separate register supplies. +-- +-- We take these register supplies from the *real* registers, i.e. those +-- that are guaranteed to map to machine registers. 
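
For intuition, a worked example of the assignStack function above (an
editor's sketch; 'a' and 'b' stand for two W64 arguments on a 64-bit
target, so each occupies one 8-byte word). The argument list is reversed
first, and the offset is bumped before each argument is placed, so the
first argument ends up at the deepest offset:

    -- assignStack dflags 0 typeOf [a, b]
    --   == (16, [(a, StackParam 16), (b, StackParam 8)])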
+ +getRegsWithoutNode, getRegsWithNode :: DynFlags -> AvailRegs +getRegsWithoutNode dflags = + ( filter (\r -> r VGcPtr /= node) (realVanillaRegs dflags) + , realFloatRegs dflags + , realDoubleRegs dflags + , realLongRegs dflags + , realXmmRegNos dflags) + +-- getRegsWithNode uses R1/node even if it isn't a register +getRegsWithNode dflags = + ( if null (realVanillaRegs dflags) + then [VanillaReg 1] + else realVanillaRegs dflags + , realFloatRegs dflags + , realDoubleRegs dflags + , realLongRegs dflags + , realXmmRegNos dflags) + +allFloatRegs, allDoubleRegs, allLongRegs :: DynFlags -> [GlobalReg] +allVanillaRegs :: DynFlags -> [VGcPtr -> GlobalReg] +allXmmRegs :: DynFlags -> [Int] + +allVanillaRegs dflags = map VanillaReg $ regList (mAX_Vanilla_REG dflags) +allFloatRegs dflags = map FloatReg $ regList (mAX_Float_REG dflags) +allDoubleRegs dflags = map DoubleReg $ regList (mAX_Double_REG dflags) +allLongRegs dflags = map LongReg $ regList (mAX_Long_REG dflags) +allXmmRegs dflags = regList (mAX_XMM_REG dflags) + +realFloatRegs, realDoubleRegs, realLongRegs :: DynFlags -> [GlobalReg] +realVanillaRegs :: DynFlags -> [VGcPtr -> GlobalReg] +realXmmRegNos :: DynFlags -> [Int] + +realVanillaRegs dflags = map VanillaReg $ regList (mAX_Real_Vanilla_REG dflags) +realFloatRegs dflags = map FloatReg $ regList (mAX_Real_Float_REG dflags) +realDoubleRegs dflags = map DoubleReg $ regList (mAX_Real_Double_REG dflags) +realLongRegs dflags = map LongReg $ regList (mAX_Real_Long_REG dflags) + +realXmmRegNos dflags + | isSse2Enabled dflags = regList (mAX_Real_XMM_REG dflags) + | otherwise = [] + +regList :: Int -> [Int] +regList n = [1 .. n] + +allRegs :: DynFlags -> AvailRegs +allRegs dflags = (allVanillaRegs dflags, + allFloatRegs dflags, + allDoubleRegs dflags, + allLongRegs dflags, + allXmmRegs dflags) + +nodeOnly :: AvailRegs +nodeOnly = ([VanillaReg 1], [], [], [], []) + +-- This returns the set of global registers that *cover* the machine registers +-- used for argument passing. On platforms where registers can overlap---right +-- now just x86-64, where Float and Double registers overlap---passing this set +-- of registers is guaranteed to preserve the contents of all live registers. We +-- only use this functionality in hand-written C-- code in the RTS. 
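
Concretely (an editor's illustration of the two equations below): when
floats are passed in xmm registers (passFloatArgsInXmm, i.e. x86-64),
the Float and Double sets are omitted, because the F- and D-registers
are backed by the same xmm machine registers; listing the XmmReg set
once covers them all.

    -- passFloatArgsInXmm:  V1..Vn, L1..Ln, XMM1..XMMn
    -- otherwise:           V1..Vn, F1..Fn, D1..Dn, L1..Ln, XMM1..XMMn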
+realArgRegsCover :: DynFlags -> [GlobalReg] +realArgRegsCover dflags + | passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++ + realLongRegs dflags ++ + map XmmReg (realXmmRegNos dflags) + | otherwise = map ($VGcPtr) (realVanillaRegs dflags) ++ + realFloatRegs dflags ++ + realDoubleRegs dflags ++ + realLongRegs dflags ++ + map XmmReg (realXmmRegNos dflags) diff --git a/compiler/GHC/Cmm/CommonBlockElim.hs b/compiler/GHC/Cmm/CommonBlockElim.hs new file mode 100644 index 0000000000..86ea0e94e2 --- /dev/null +++ b/compiler/GHC/Cmm/CommonBlockElim.hs @@ -0,0 +1,320 @@ +{-# LANGUAGE GADTs, BangPatterns, ScopedTypeVariables #-} + +module GHC.Cmm.CommonBlockElim + ( elimCommonBlocks + ) +where + + +import GhcPrelude hiding (iterate, succ, unzip, zip) + +import GHC.Cmm.BlockId +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Switch (eqSwitchTargetWith) +import GHC.Cmm.ContFlowOpt + +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Collections +import Data.Bits +import Data.Maybe (mapMaybe) +import qualified Data.List as List +import Data.Word +import qualified Data.Map as M +import Outputable +import qualified TrieMap as TM +import UniqFM +import Unique +import Control.Arrow (first, second) + +-- ----------------------------------------------------------------------------- +-- Eliminate common blocks + +-- If two blocks are identical except for the label on the first node, +-- then we can eliminate one of the blocks. To ensure that the semantics +-- of the program are preserved, we have to rewrite each predecessor of the +-- eliminated block to proceed with the block we keep. + +-- The algorithm iterates over the blocks in the graph, +-- checking whether it has seen another block that is equal modulo labels. +-- If so, then it adds an entry in a map indicating that the new block +-- is made redundant by the old block. +-- Otherwise, it is added to the useful blocks. + +-- To avoid comparing every block with every other block repeatedly, we group +-- them by +-- * a hash of the block, ignoring labels (explained below) +-- * the list of outgoing labels +-- The hash is invariant under relabeling, so we only ever compare within +-- the same group of blocks. +-- +-- The list of outgoing labels is updated as we merge blocks (that is why they +-- are not included in the hash, which we want to calculate only once). +-- +-- All in all, two blocks should never be compared if they have different +-- hashes, and at most once otherwise. Previously, we were slower, and people +-- rightfully complained: #10397 + +-- TODO: Use optimization fuel +elimCommonBlocks :: CmmGraph -> CmmGraph +elimCommonBlocks g = replaceLabels env $ copyTicks env g + where + env = iterate mapEmpty blocks_with_key + -- The order of blocks doesn't matter here. While we could use + -- revPostorder which drops unreachable blocks this is done in + -- ContFlowOpt already which runs before this pass. So we use + -- toBlockList since it is faster. + groups = groupByInt hash_block (toBlockList g) :: [[CmmBlock]] + blocks_with_key = [ [ (successors b, [b]) | b <- bs] | bs <- groups] + +-- Invariant: The blocks in the list are pairwise distinct +-- (so avoid comparing them again) +type DistinctBlocks = [CmmBlock] +type Key = [Label] +type Subst = LabelMap BlockId + +-- The outer list groups by hash. We retain this grouping throughout. 
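
As a small worked example (an editor's sketch with made-up labels):
suppose L2 and L3 have equal bodies and both end in 'goto L5'. They land
in the same hash bucket and share the key [L5], so the merge records
L3 -> L2 in the substitution, and the final replaceLabels pass retargets
every jump:

    -- before                      -- after elimCommonBlocks
    L1: goto L3                    L1: goto L2
    L2: x = x + 1; goto L5         L2: x = x + 1; goto L5
    L3: x = x + 1; goto L5         -- L3 is dropped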
+iterate :: Subst -> [[(Key, DistinctBlocks)]] -> Subst +iterate subst blocks + | mapNull new_substs = subst + | otherwise = iterate subst' updated_blocks + where + grouped_blocks :: [[(Key, [DistinctBlocks])]] + grouped_blocks = map groupByLabel blocks + + merged_blocks :: [[(Key, DistinctBlocks)]] + (new_substs, merged_blocks) = List.mapAccumL (List.mapAccumL go) mapEmpty grouped_blocks + where + go !new_subst1 (k,dbs) = (new_subst1 `mapUnion` new_subst2, (k,db)) + where + (new_subst2, db) = mergeBlockList subst dbs + + subst' = subst `mapUnion` new_substs + updated_blocks = map (map (first (map (lookupBid subst')))) merged_blocks + +-- Combine two lists of blocks. +-- While they are internally distinct they can still share common blocks. +mergeBlocks :: Subst -> DistinctBlocks -> DistinctBlocks -> (Subst, DistinctBlocks) +mergeBlocks subst existing new = go new + where + go [] = (mapEmpty, existing) + go (b:bs) = case List.find (eqBlockBodyWith (eqBid subst) b) existing of + -- This block is a duplicate. Drop it, and add it to the substitution + Just b' -> first (mapInsert (entryLabel b) (entryLabel b')) $ go bs + -- This block is not a duplicate, keep it. + Nothing -> second (b:) $ go bs + +mergeBlockList :: Subst -> [DistinctBlocks] -> (Subst, DistinctBlocks) +mergeBlockList _ [] = pprPanic "mergeBlockList" empty +mergeBlockList subst (b:bs) = go mapEmpty b bs + where + go !new_subst1 b [] = (new_subst1, b) + go !new_subst1 b1 (b2:bs) = go new_subst b bs + where + (new_subst2, b) = mergeBlocks subst b1 b2 + new_subst = new_subst1 `mapUnion` new_subst2 + + +-- ----------------------------------------------------------------------------- +-- Hashing and equality on blocks + +-- Below here is mostly boilerplate: hashing blocks ignoring labels, +-- and comparing blocks modulo a label mapping. + +-- To speed up comparisons, we hash each basic block modulo jump labels. +-- The hashing is a bit arbitrary (the numbers are completely arbitrary), +-- but it should be fast and good enough. + +-- We want to get as many small buckets as possible, as comparing blocks is +-- expensive. So include as much as possible in the hash. Ideally everything +-- that is compared with (==) in eqBlockBodyWith. + +type HashCode = Int + +hash_block :: CmmBlock -> HashCode +hash_block block = + fromIntegral (foldBlockNodesB3 (hash_fst, hash_mid, hash_lst) block (0 :: Word32) .&. (0x7fffffff :: Word32)) + -- UniqFM doesn't like negative Ints + where hash_fst _ h = h + hash_mid m h = hash_node m + h `shiftL` 1 + hash_lst m h = hash_node m + h `shiftL` 1 + + hash_node :: CmmNode O x -> Word32 + hash_node n | dont_care n = 0 -- don't care + hash_node (CmmAssign r e) = hash_reg r + hash_e e + hash_node (CmmStore e e') = hash_e e + hash_e e' + hash_node (CmmUnsafeForeignCall t _ as) = hash_tgt t + hash_list hash_e as + hash_node (CmmBranch _) = 23 -- NB. ignore the label + hash_node (CmmCondBranch p _ _ _) = hash_e p + hash_node (CmmCall e _ _ _ _ _) = hash_e e + hash_node (CmmForeignCall t _ _ _ _ _ _) = hash_tgt t + hash_node (CmmSwitch e _) = hash_e e + hash_node _ = error "hash_node: unknown Cmm node!" 
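
(Illustration: 'goto L1' and 'goto L2' hash identically, since
hash_node (CmmBranch _) ignores the target; that is precisely what makes
the hash invariant under relabeling. The targets are compared later, via
the Key of outgoing labels and eqLastWith.)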
+ + hash_reg :: CmmReg -> Word32 + hash_reg (CmmLocal localReg) = hash_unique localReg -- important for performance, see #10397 + hash_reg (CmmGlobal _) = 19 + + hash_e :: CmmExpr -> Word32 + hash_e (CmmLit l) = hash_lit l + hash_e (CmmLoad e _) = 67 + hash_e e + hash_e (CmmReg r) = hash_reg r + hash_e (CmmMachOp _ es) = hash_list hash_e es -- pessimal - no operator check + hash_e (CmmRegOff r i) = hash_reg r + cvt i + hash_e (CmmStackSlot _ _) = 13 + + hash_lit :: CmmLit -> Word32 + hash_lit (CmmInt i _) = fromInteger i + hash_lit (CmmFloat r _) = truncate r + hash_lit (CmmVec ls) = hash_list hash_lit ls + hash_lit (CmmLabel _) = 119 -- ugh + hash_lit (CmmLabelOff _ i) = cvt $ 199 + i + hash_lit (CmmLabelDiffOff _ _ i _) = cvt $ 299 + i + hash_lit (CmmBlock _) = 191 -- ugh + hash_lit (CmmHighStackMark) = cvt 313 + + hash_tgt (ForeignTarget e _) = hash_e e + hash_tgt (PrimTarget _) = 31 -- lots of these + + hash_list f = foldl' (\z x -> f x + z) (0::Word32) + + cvt = fromInteger . toInteger + + hash_unique :: Uniquable a => a -> Word32 + hash_unique = cvt . getKey . getUnique + +-- | Ignore these node types for equality +dont_care :: CmmNode O x -> Bool +dont_care CmmComment {} = True +dont_care CmmTick {} = True +dont_care CmmUnwind {} = True +dont_care _other = False + +-- Utilities: equality and substitution on the graph. + +-- Given a map ``subst'' from BlockID -> BlockID, we define equality. +eqBid :: LabelMap BlockId -> BlockId -> BlockId -> Bool +eqBid subst bid bid' = lookupBid subst bid == lookupBid subst bid' +lookupBid :: LabelMap BlockId -> BlockId -> BlockId +lookupBid subst bid = case mapLookup bid subst of + Just bid -> lookupBid subst bid + Nothing -> bid + +-- Middle nodes and expressions can contain BlockIds, in particular in +-- CmmStackSlot and CmmBlock, so we have to use a special equality for +-- these. +-- +eqMiddleWith :: (BlockId -> BlockId -> Bool) + -> CmmNode O O -> CmmNode O O -> Bool +eqMiddleWith eqBid (CmmAssign r1 e1) (CmmAssign r2 e2) + = r1 == r2 && eqExprWith eqBid e1 e2 +eqMiddleWith eqBid (CmmStore l1 r1) (CmmStore l2 r2) + = eqExprWith eqBid l1 l2 && eqExprWith eqBid r1 r2 +eqMiddleWith eqBid (CmmUnsafeForeignCall t1 r1 a1) + (CmmUnsafeForeignCall t2 r2 a2) + = t1 == t2 && r1 == r2 && eqListWith (eqExprWith eqBid) a1 a2 +eqMiddleWith _ _ _ = False + +eqExprWith :: (BlockId -> BlockId -> Bool) + -> CmmExpr -> CmmExpr -> Bool +eqExprWith eqBid = eq + where + CmmLit l1 `eq` CmmLit l2 = eqLit l1 l2 + CmmLoad e1 _ `eq` CmmLoad e2 _ = e1 `eq` e2 + CmmReg r1 `eq` CmmReg r2 = r1==r2 + CmmRegOff r1 i1 `eq` CmmRegOff r2 i2 = r1==r2 && i1==i2 + CmmMachOp op1 es1 `eq` CmmMachOp op2 es2 = op1==op2 && es1 `eqs` es2 + CmmStackSlot a1 i1 `eq` CmmStackSlot a2 i2 = eqArea a1 a2 && i1==i2 + _e1 `eq` _e2 = False + + xs `eqs` ys = eqListWith eq xs ys + + eqLit (CmmBlock id1) (CmmBlock id2) = eqBid id1 id2 + eqLit l1 l2 = l1 == l2 + + eqArea Old Old = True + eqArea (Young id1) (Young id2) = eqBid id1 id2 + eqArea _ _ = False + +-- Equality on the body of a block, modulo a function mapping block +-- IDs to block IDs. +eqBlockBodyWith :: (BlockId -> BlockId -> Bool) -> CmmBlock -> CmmBlock -> Bool +eqBlockBodyWith eqBid block block' + {- + | equal = pprTrace "equal" (vcat [ppr block, ppr block']) True + | otherwise = pprTrace "not equal" (vcat [ppr block, ppr block']) False + -} + = equal + where (_,m,l) = blockSplit block + nodes = filter (not . dont_care) (blockToList m) + (_,m',l') = blockSplit block' + nodes' = filter (not . 
dont_care) (blockToList m') + + equal = eqListWith (eqMiddleWith eqBid) nodes nodes' && + eqLastWith eqBid l l' + + +eqLastWith :: (BlockId -> BlockId -> Bool) -> CmmNode O C -> CmmNode O C -> Bool +eqLastWith eqBid (CmmBranch bid1) (CmmBranch bid2) = eqBid bid1 bid2 +eqLastWith eqBid (CmmCondBranch c1 t1 f1 l1) (CmmCondBranch c2 t2 f2 l2) = + c1 == c2 && l1 == l2 && eqBid t1 t2 && eqBid f1 f2 +eqLastWith eqBid (CmmCall t1 c1 g1 a1 r1 u1) (CmmCall t2 c2 g2 a2 r2 u2) = + t1 == t2 && eqMaybeWith eqBid c1 c2 && a1 == a2 && r1 == r2 && u1 == u2 && g1 == g2 +eqLastWith eqBid (CmmSwitch e1 ids1) (CmmSwitch e2 ids2) = + e1 == e2 && eqSwitchTargetWith eqBid ids1 ids2 +eqLastWith _ _ _ = False + +eqMaybeWith :: (a -> b -> Bool) -> Maybe a -> Maybe b -> Bool +eqMaybeWith eltEq (Just e) (Just e') = eltEq e e' +eqMaybeWith _ Nothing Nothing = True +eqMaybeWith _ _ _ = False + +eqListWith :: (a -> b -> Bool) -> [a] -> [b] -> Bool +eqListWith f (a : as) (b : bs) = f a b && eqListWith f as bs +eqListWith _ [] [] = True +eqListWith _ _ _ = False + +-- | Given a block map, ensure that all "target" blocks are covered by +-- the same ticks as the respective "source" blocks. This not only +-- means copying ticks, but also adjusting tick scopes where +-- necessary. +copyTicks :: LabelMap BlockId -> CmmGraph -> CmmGraph +copyTicks env g + | mapNull env = g + | otherwise = ofBlockMap (g_entry g) $ mapMap copyTo blockMap + where -- Reverse block merge map + blockMap = toBlockMap g + revEnv = mapFoldlWithKey insertRev M.empty env + insertRev m k x = M.insertWith (const (k:)) x [k] m + -- Copy ticks and scopes into the given block + copyTo block = case M.lookup (entryLabel block) revEnv of + Nothing -> block + Just ls -> foldr copy block $ mapMaybe (flip mapLookup blockMap) ls + copy from to = + let ticks = blockTicks from + CmmEntry _ scp0 = firstNode from + (CmmEntry lbl scp1, code) = blockSplitHead to + in CmmEntry lbl (combineTickScopes scp0 scp1) `blockJoinHead` + foldr blockCons code (map CmmTick ticks) + +-- Group by [Label] +-- See Note [Compressed TrieMap] in coreSyn/TrieMap about the usage of GenMap. +groupByLabel :: [(Key, DistinctBlocks)] -> [(Key, [DistinctBlocks])] +groupByLabel = + go (TM.emptyTM :: TM.ListMap (TM.GenMap LabelMap) (Key, [DistinctBlocks])) + where + go !m [] = TM.foldTM (:) m [] + go !m ((k,v) : entries) = go (TM.alterTM k adjust m) entries + where --k' = map (getKey . getUnique) k + adjust Nothing = Just (k,[v]) + adjust (Just (_,vs)) = Just (k,v:vs) + +groupByInt :: (a -> Int) -> [a] -> [[a]] +groupByInt f xs = nonDetEltsUFM $ List.foldl' go emptyUFM xs + -- See Note [Unique Determinism and code generation] + where + go m x = alterUFM addEntry m (f x) + where + addEntry xs = Just $! 
maybe [x] (x:) xs
diff --git a/compiler/GHC/Cmm/ContFlowOpt.hs b/compiler/GHC/Cmm/ContFlowOpt.hs
new file mode 100644
index 0000000000..7765972d02
--- /dev/null
+++ b/compiler/GHC/Cmm/ContFlowOpt.hs
@@ -0,0 +1,451 @@
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE BangPatterns #-}
+{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
+module GHC.Cmm.ContFlowOpt
+    ( cmmCfgOpts
+    , cmmCfgOptsProc
+    , removeUnreachableBlocksProc
+    , replaceLabels
+    )
+where
+
+import GhcPrelude hiding (succ, unzip, zip)
+
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch (mapSwitchTargets, switchTargetsToList)
+import Maybes
+import Panic
+import Util
+
+import Control.Monad
+
+
+-- Note [What is shortcutting]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Consider this Cmm code:
+--
+-- L1: ...
+--     goto L2;
+-- L2: goto L3;
+-- L3: ...
+--
+-- Here L2 is an empty block and contains only an unconditional branch
+-- to L3. In this situation any block that jumps to L2 can jump
+-- directly to L3:
+--
+-- L1: ...
+--     goto L3;
+-- L2: goto L3;
+-- L3: ...
+--
+-- In this situation we say that we shortcut L2 to L3. One of the
+-- consequences of shortcutting is that some blocks of code may become
+-- unreachable (in the example above this is true for L2).
+
+
+-- Note [Control-flow optimisations]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- This optimisation does three things:
+--
+--   - If a block finishes in an unconditional branch to another block
+--     and that is the only jump to that block, we concatenate the
+--     destination block at the end of the current one.
+--
+--   - If a block finishes in a call whose continuation block is a
+--     goto, then we can shortcut the destination, making the
+--     continuation block the destination of the goto - but see Note
+--     [Shortcut call returns].
+--
+--   - For any block that is not a call we try to shortcut the
+--     destination(s). Additionally, if a block ends with a
+--     conditional branch we try to invert the condition.
+--
+-- Blocks are processed using postorder DFS traversal. A side effect
+-- of determining traversal order with a graph search is elimination
+-- of any blocks that are unreachable.
+--
+-- Transformations are improved by working from the end of the graph
+-- towards the beginning, because we may be able to perform many
+-- shortcuts in one go.
+
+
+-- Note [Shortcut call returns]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- We are going to maintain the "current" graph (LabelMap CmmBlock) as
+-- we go, and also a mapping from BlockId to BlockId, representing
+-- continuation labels that we have renamed. This latter mapping is
+-- important because we might shortcut a CmmCall continuation. For
+-- example:
+--
+--    Sp[0] = L
+--    call g returns to L
+--    L: goto M
+--    M: ...
+--
+-- So when we shortcut the L block, we need to replace not only
+-- the continuation of the call, but also references to L in the
+-- code (e.g. the assignment Sp[0] = L):
+--
+--    Sp[0] = M
+--    call g returns to M
+--    M: ...
+--
+-- So we keep track of which labels we have renamed and apply the mapping
+-- at the end with replaceLabels.
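
A minimal sketch of how these pieces compose (an editor's illustration;
the labels and map contents are hypothetical):

    -- blockConcat decided to shortcut l2 to l3:
    shortcut_map = mapFromList [(l2, l3)] :: LabelMap BlockId

    -- one replaceLabels pass rewrites the call continuation, the
    -- 'Sp[0] = l2' assignment (a CmmBlock literal) and any branches:
    g' = replaceLabels shortcut_map g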
+
+
+-- Note [Shortcut call returns and proc-points]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Consider this code that you might get from a recursive
+-- let-no-escape:
+--
+--       goto L1
+--      L1:
+--       if (Hp > HpLim) then L2 else L3
+--      L2:
+--       call stg_gc_noregs returns to L4
+--      L4:
+--       goto L1
+--      L3:
+--       ...
+--       goto L1
+--
+-- Then the control-flow optimiser shortcuts L4. But that turns L1
+-- into the call-return proc point, and every iteration of the loop
+-- has to shuffle variables to and from the stack. So we must *not*
+-- shortcut L4.
+--
+-- Moreover, not shortcutting call returns is probably fine. If L4 can
+-- concat with its branch target then it will still do so. And we
+-- save some compile time because we don't have to traverse all the
+-- code in replaceLabels.
+--
+-- However, we probably do want to do this if we are splitting proc
+-- points, because L1 will be a proc-point anyway, so merging it with
+-- L4 reduces the number of proc points. Unfortunately recursive
+-- let-no-escapes won't generate very good code with proc-point
+-- splitting on - we should probably compile them to explicitly use
+-- the native calling convention instead.
+
+cmmCfgOpts :: Bool -> CmmGraph -> CmmGraph
+cmmCfgOpts split g = fst (blockConcat split g)
+
+cmmCfgOptsProc :: Bool -> CmmDecl -> CmmDecl
+cmmCfgOptsProc split (CmmProc info lbl live g) = CmmProc info' lbl live g'
+  where (g', env) = blockConcat split g
+        info' = info{ info_tbls = new_info_tbls }
+        new_info_tbls = mapFromList (map upd_info (mapToList (info_tbls info)))
+
+        -- If we changed any labels, then we have to update the info tables
+        -- too, except for the top-level info table because that might be
+        -- referred to by other procs.
+        upd_info (k,info)
+          | Just k' <- mapLookup k env
+          = (k', if k' == g_entry g'
+                 then info
+                 else info{ cit_lbl = infoTblLbl k' })
+          | otherwise
+          = (k,info)
+cmmCfgOptsProc _ top = top
+
+
+blockConcat :: Bool -> CmmGraph -> (CmmGraph, LabelMap BlockId)
+blockConcat splitting_procs g@CmmGraph { g_entry = entry_id }
+  = (replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks, shortcut_map')
+  where
+    -- We might be able to shortcut the entry BlockId itself.
+    -- Remember to update the shortcut_map, since we also have to
+    -- update the info_tbls mapping now.
+    (new_entry, shortcut_map')
+      | Just entry_blk <- mapLookup entry_id new_blocks
+      , Just dest <- canShortcut entry_blk
+      = (dest, mapInsert entry_id dest shortcut_map)
+      | otherwise
+      = (entry_id, shortcut_map)
+
+    -- The blocks are sorted in reverse postorder, but we want to go from
+    -- the exit towards the beginning, so we use foldr below.
+    blocks = revPostorder g
+    blockmap = foldl' (flip addBlock) emptyBody blocks
+
+    -- Accumulator contains three components:
+    --  * map of blocks in a graph
+    --  * map of shortcut labels. See Note [Shortcut call returns]
+    --  * map containing number of predecessors for each block. We discard
+    --    it after we process all blocks.
+    (new_blocks, shortcut_map, _) =
+      foldr maybe_concat (blockmap, mapEmpty, initialBackEdges) blocks
+
+    -- Map of predecessors for the initial graph. We increase the number
+    -- of predecessors for the entry block by one to denote that it is a
+    -- target of a jump, even if no block in the current graph jumps
+    -- to it.
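
For instance (illustrative): if exactly one block in the graph jumped
back to the entry block, predMap alone would report a single predecessor
and maybe_concat could splice the entry block into that jumper; bumping
the count records the implicit jump into the procedure from outside and
prevents that.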
+    initialBackEdges = incPreds entry_id (predMap blocks)
+
+    maybe_concat :: CmmBlock
+                 -> (LabelMap CmmBlock, LabelMap BlockId, LabelMap Int)
+                 -> (LabelMap CmmBlock, LabelMap BlockId, LabelMap Int)
+    maybe_concat block (!blocks, !shortcut_map, !backEdges)
+        -- If:
+        --   (1) the current block ends with an unconditional branch to b' and
+        --   (2) b' has exactly one predecessor (namely, the current block)
+        --
+        -- Then:
+        --   (1) append block b' at the end of the current block
+        --   (2) remove b' from the map of blocks
+        --   (3) remove information about b' from the predecessors map
+        --
+        -- Since we know that the block has only one predecessor we call
+        -- mapDelete directly instead of calling decPreds.
+        --
+        -- Note that we always maintain an up-to-date list of predecessors, so
+        -- we can ignore the contents of shortcut_map
+      | CmmBranch b' <- last
+      , hasOnePredecessor b'
+      , Just blk' <- mapLookup b' blocks
+      = let bid' = entryLabel blk'
+        in ( mapDelete bid' $ mapInsert bid (splice head blk') blocks
+           , shortcut_map
+           , mapDelete b' backEdges )
+
+        -- If:
+        --   (1) we are splitting proc points (see Note
+        --       [Shortcut call returns and proc-points]) and
+        --   (2) the current block is a CmmCall or CmmForeignCall with
+        --       continuation b' and
+        --   (3) we can shortcut that continuation to dest
+        -- Then:
+        --   (1) we change the continuation to point to dest
+        --   (2) create a mapping from b' to dest
+        --   (3) increase the number of predecessors of dest by 1
+        --   (4) decrease the number of predecessors of b' by 1
+        --
+        -- Later we will use replaceLabels to substitute all occurrences of b'
+        -- with dest.
+      | splitting_procs
+      , Just b' <- callContinuation_maybe last
+      , Just blk' <- mapLookup b' blocks
+      , Just dest <- canShortcut blk'
+      = ( mapInsert bid (blockJoinTail head (update_cont dest)) blocks
+        , mapInsert b' dest shortcut_map
+        , decPreds b' $ incPreds dest backEdges )
+
+        -- If:
+        --   (1) the block does not end with a call
+        -- Then:
+        --   (1) if it ends with a conditional branch, attempt to invert the
+        --       condition
+        --   (2) attempt to shortcut all destination blocks
+        --   (3) if the new successors of the block differ from the old ones,
+        --       update the map of predecessors accordingly
+        --
+        -- A special case of this is a situation when a block ends with an
+        -- unconditional jump to a block that can be shortcut.
+      | Nothing <- callContinuation_maybe last
+      = let oldSuccs = successors last
+            newSuccs = successors rewrite_last
+        in ( mapInsert bid (blockJoinTail head rewrite_last) blocks
+           , shortcut_map
+           , if oldSuccs == newSuccs
+             then backEdges
+             else foldr incPreds (foldr decPreds backEdges oldSuccs) newSuccs )
+
+        -- Otherwise don't do anything
+      | otherwise
+      = ( blocks, shortcut_map, backEdges )
+      where
+        (head, last) = blockSplitTail block
+        bid = entryLabel block
+
+        -- Changes the continuation of a call to the specified label
+        update_cont dest =
+            case last of
+              CmmCall{}        -> last { cml_cont = Just dest }
+              CmmForeignCall{} -> last { succ = dest }
+              _                -> panic "Can't shortcut continuation."
+
+        -- Attempts to shortcut the successors of the last node
+        shortcut_last = mapSuccessors shortcut last
+          where
+            shortcut l =
+                case mapLookup l blocks of
+                  Just b | Just dest <- canShortcut b -> dest
+                  _otherwise -> l
+
+        rewrite_last
+            -- Sometimes we can get rid of the conditional completely.
+          | CmmCondBranch _cond t f _l <- shortcut_last
+          , t == f
+          = CmmBranch t
+
+            -- See Note [Invert Cmm conditionals]
+          | CmmCondBranch cond t f l <- shortcut_last
+          , hasOnePredecessor t -- inverting will make t a fallthrough
+          , likelyTrue l || (numPreds f > 1)
+          , Just cond' <- maybeInvertCmmExpr cond
+          = CmmCondBranch cond' f t (invertLikeliness l)
+
+            -- If all jump destinations of a switch go to the same
+            -- target, eliminate the switch.
+          | CmmSwitch _expr targets <- shortcut_last
+          , (t:ts) <- switchTargetsToList targets
+          , all (== t) ts
+          = CmmBranch t
+
+          | otherwise
+          = shortcut_last
+
+        likelyTrue (Just True) = True
+        likelyTrue _           = False
+
+        invertLikeliness :: Maybe Bool -> Maybe Bool
+        invertLikeliness = fmap not
+
+        -- Number of predecessors for a block
+        numPreds bid = mapLookup bid backEdges `orElse` 0
+
+        hasOnePredecessor b = numPreds b == 1
+
+{-
+  Note [Invert Cmm conditionals]
+  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  The native code generator always produces jumps to the true branch.
+  Falling through to the false branch is however faster. So we try to
+  arrange for that to happen.
+  This means we invert the condition if:
+  * The likely path will become a fallthrough.
+  * We can't guarantee a fallthrough for the false branch, but we can
+    for the true branch.
+
+  In some cases it's faster to avoid inverting when the false branch is likely.
+  However, determining when that is the case is neither easy nor cheap, so for
+  now we always invert, as this produces smaller binaries and code that is
+  equally fast on average. (On an i7-6700K)
+
+  TODO:
+  There is also the edge case when both branches have multiple predecessors.
+  In this case we could assume that we will end up with a jump for BOTH
+  branches. In this case it might be best to put the likely path in the true
+  branch, especially if there are large numbers of predecessors, as this saves
+  us the jump that's not taken. However, I haven't tested this, and as of early
+  2018 we almost never generate cmm where this would apply.
+-}
+
+-- Functions for incrementing and decrementing the number of predecessors. If
+-- decrementing would set the predecessor count to 0, we remove the entry
+-- from the map.
+-- Invariant: if a block has no predecessors it should be dropped from the
+-- graph because it is unreachable. maybe_concat is constructed to maintain
+-- that invariant, but calling replaceLabels may introduce unreachable blocks.
+-- We rely on subsequent passes in the Cmm pipeline to remove unreachable
+-- blocks.
+incPreds, decPreds :: BlockId -> LabelMap Int -> LabelMap Int
+incPreds bid edges = mapInsertWith (+) bid 1 edges
+decPreds bid edges = case mapLookup bid edges of
+                       Just preds | preds > 1 -> mapInsert bid (preds - 1) edges
+                       Just _                 -> mapDelete bid edges
+                       _                      -> edges
+
+
+-- Checks if a block consists only of "goto dest". If it does, then we
+-- return the destination as "Just dest". See Note [What is shortcutting]
+canShortcut :: CmmBlock -> Maybe BlockId
+canShortcut block
+    | (_, middle, CmmBranch dest) <- blockSplit block
+    , all dont_care $ blockToList middle
+    = Just dest
+    | otherwise
+    = Nothing
+    where dont_care CmmComment{} = True
+          dont_care CmmTick{}    = True
+          dont_care _other       = False
+
+-- Concatenates two blocks. The first one is assumed to be open on exit, the
+-- second is assumed to be closed on entry (i.e. it has a label attached to
+-- it, which the splice function removes by calling snd on the result of
+-- blockSplitHead).
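
A sketch of splice's effect (an editor's illustration in Cmm-style
pseudocode), as used by maybe_concat after it has split off A's final
'goto B':

    A: <mid1>        B: <mid2>; goto C
        ==>          A: <mid1>; <mid2>; goto C

A keeps its label, and the result carries the combined tick scopes of
both blocks.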
+splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
+splice head rest = entry `blockJoinHead` code0 `blockAppend` code1
+  where (CmmEntry lbl sc0, code0) = blockSplitHead head
+        (CmmEntry _ sc1, code1)  = blockSplitHead rest
+        entry = CmmEntry lbl (combineTickScopes sc0 sc1)
+
+-- If the node is a call with a continuation, return Just the label of
+-- that continuation. Otherwise return Nothing.
+callContinuation_maybe :: CmmNode O C -> Maybe BlockId
+callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b
+callContinuation_maybe (CmmForeignCall { succ = b })   = Just b
+callContinuation_maybe _                               = Nothing
+
+
+-- Map over the CmmGraph, replacing each label with its mapping in the
+-- supplied LabelMap.
+replaceLabels :: LabelMap BlockId -> CmmGraph -> CmmGraph
+replaceLabels env g
+  | mapNull env = g
+  | otherwise   = replace_eid $ mapGraphNodes1 txnode g
+   where
+     replace_eid g = g {g_entry = lookup (g_entry g)}
+     lookup id = mapLookup id env `orElse` id
+
+     txnode :: CmmNode e x -> CmmNode e x
+     txnode (CmmBranch bid) = CmmBranch (lookup bid)
+     txnode (CmmCondBranch p t f l) =
+       mkCmmCondBranch (exp p) (lookup t) (lookup f) l
+     txnode (CmmSwitch e ids) =
+       CmmSwitch (exp e) (mapSwitchTargets lookup ids)
+     txnode (CmmCall t k rg a res r) =
+       CmmCall (exp t) (liftM lookup k) rg a res r
+     txnode fc@CmmForeignCall{} =
+       fc{ args = map exp (args fc), succ = lookup (succ fc) }
+     txnode other = mapExpDeep exp other
+
+     exp :: CmmExpr -> CmmExpr
+     exp (CmmLit (CmmBlock bid))     = CmmLit (CmmBlock (lookup bid))
+     exp (CmmStackSlot (Young id) i) = CmmStackSlot (Young (lookup id)) i
+     exp e                           = e
+
+mkCmmCondBranch :: CmmExpr -> Label -> Label -> Maybe Bool -> CmmNode O C
+mkCmmCondBranch p t f l =
+  if t == f then CmmBranch t else CmmCondBranch p t f l
+
+-- Build a map from a block to its number of predecessors.
+predMap :: [CmmBlock] -> LabelMap Int
+predMap blocks = foldr add_preds mapEmpty blocks
+  where
+    add_preds block env = foldr add env (successors block)
+      where add lbl env = mapInsertWith (+) lbl 1 env
+
+-- Removing unreachable blocks
+removeUnreachableBlocksProc :: CmmDecl -> CmmDecl
+removeUnreachableBlocksProc proc@(CmmProc info lbl live g)
+   | used_blocks `lengthLessThan` mapSize (toBlockMap g)
+   = CmmProc info' lbl live g'
+   | otherwise
+   = proc
+   where
+     g'    = ofBlockList (g_entry g) used_blocks
+     info' = info { info_tbls = keep_used (info_tbls info) }
+             -- Remove any info_tbls for unreachable blocks
+
+     keep_used :: LabelMap CmmInfoTable -> LabelMap CmmInfoTable
+     keep_used bs = mapFoldlWithKey keep mapEmpty bs
+
+     keep :: LabelMap CmmInfoTable -> Label -> CmmInfoTable -> LabelMap CmmInfoTable
+     keep env l i | l `setMember` used_lbls = mapInsert l i env
+                  | otherwise               = env
+
+     used_blocks :: [CmmBlock]
+     used_blocks = revPostorder g
+
+     used_lbls :: LabelSet
+     used_lbls = setFromList $ map entryLabel used_blocks
diff --git a/compiler/GHC/Cmm/Dataflow.hs b/compiler/GHC/Cmm/Dataflow.hs
new file mode 100644
index 0000000000..fcabb1df0f
--- /dev/null
+++ b/compiler/GHC/Cmm/Dataflow.hs
@@ -0,0 +1,441 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE GADTs #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+
+--
+-- Copyright (c) 2010, João Dias, Simon Marlow, Simon Peyton Jones,
+-- and Norman Ramsey
+--
+-- Modifications copyright (c) The University of Glasgow 2012
+--
+-- This module is a specialised and optimised version of
+-- Compiler.Hoopl.Dataflow in the hoopl package. In particular it is
+-- specialised to the UniqSM monad.
+--
+
+module GHC.Cmm.Dataflow
+  ( C, O, Block
+  , lastNode, entryLabel
+  , foldNodesBwdOO
+  , foldRewriteNodesBwdOO
+  , DataflowLattice(..), OldFact(..), NewFact(..), JoinedFact(..)
+  , TransferFun, RewriteFun
+  , Fact, FactBase
+  , getFact, mkFactBase
+  , analyzeCmmFwd, analyzeCmmBwd
+  , rewriteCmmBwd
+  , changedIf
+  , joinOutFacts
+  , joinFacts
+  )
+where
+
+import GhcPrelude
+
+import GHC.Cmm
+import UniqSupply
+
+import Data.Array
+import Data.Maybe
+import Data.IntSet (IntSet)
+import qualified Data.IntSet as IntSet
+
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+
+type family Fact (x :: Extensibility) f :: *
+type instance Fact C f = FactBase f
+type instance Fact O f = f
+
+newtype OldFact a = OldFact a
+
+newtype NewFact a = NewFact a
+
+-- | The result of joining OldFact and NewFact.
+data JoinedFact a
+  = Changed !a     -- ^ Result is different from OldFact.
+  | NotChanged !a  -- ^ Result is the same as OldFact.
+
+getJoined :: JoinedFact a -> a
+getJoined (Changed a)    = a
+getJoined (NotChanged a) = a
+
+changedIf :: Bool -> a -> JoinedFact a
+changedIf True  = Changed
+changedIf False = NotChanged
+
+type JoinFun a = OldFact a -> NewFact a -> JoinedFact a
+
+data DataflowLattice a = DataflowLattice
+  { fact_bot  :: a
+  , fact_join :: JoinFun a
+  }
+
+data Direction = Fwd | Bwd
+
+type TransferFun f = CmmBlock -> FactBase f -> FactBase f
+
+-- | Function for rewriting and analysis combined. To be used with
+-- @rewriteCmm@.
+--
+-- Currently set to work with the @UniqSM@ monad, but we could probably
+-- abstract that away (if we do that, we might want to specialize the
+-- fixpoint algorithms to the particular monads through SPECIALIZE).
+type RewriteFun f = CmmBlock -> FactBase f -> UniqSM (CmmBlock, FactBase f)
+
+analyzeCmmBwd, analyzeCmmFwd
+    :: DataflowLattice f
+    -> TransferFun f
+    -> CmmGraph
+    -> FactBase f
+    -> FactBase f
+analyzeCmmBwd = analyzeCmm Bwd
+analyzeCmmFwd = analyzeCmm Fwd
+
+analyzeCmm
+    :: Direction
+    -> DataflowLattice f
+    -> TransferFun f
+    -> CmmGraph
+    -> FactBase f
+    -> FactBase f
+analyzeCmm dir lattice transfer cmmGraph initFact =
+    {-# SCC analyzeCmm #-}
+    let entry = g_entry cmmGraph
+        hooplGraph = g_graph cmmGraph
+        blockMap =
+            case hooplGraph of
+                GMany NothingO bm NothingO -> bm
+    in fixpointAnalysis dir lattice transfer entry blockMap initFact
+
+-- Fixpoint algorithm.
+fixpointAnalysis
+    :: forall f.
+       Direction
+    -> DataflowLattice f
+    -> TransferFun f
+    -> Label
+    -> LabelMap CmmBlock
+    -> FactBase f
+    -> FactBase f
+fixpointAnalysis direction lattice do_block entry blockmap = loop start
+  where
+    -- Sorting the blocks helps to minimize the number of times we need to
+    -- process blocks. For instance, for forward analysis we want to look at
+    -- blocks in reverse postorder. Also, see comments for sortBlocks.
+    blocks     = sortBlocks direction entry blockmap
+    num_blocks = length blocks
+    block_arr  = {-# SCC "block_arr" #-} listArray (0, num_blocks - 1) blocks
+    start      = {-# SCC "start" #-} IntSet.fromDistinctAscList
+                 [0 .. num_blocks - 1]
+    dep_blocks = {-# SCC "dep_blocks" #-} mkDepBlocks direction blocks
+    join       = fact_join lattice
+
+    loop
+        :: IntHeap     -- ^ Worklist, i.e., blocks to process
+        -> FactBase f  -- ^ Current result (increases monotonically)
+        -> FactBase f
+    loop todo !fbase1 | Just (index, todo1) <- IntSet.minView todo =
+        let block     = block_arr ! index
+            out_facts = {-# SCC "do_block" #-} do_block block fbase1
+            -- For each of the outgoing edges, we join it with the current
+            -- information in fbase1 and (if something changed) we update it
+            -- and add the affected blocks to the worklist.
+            (todo2, fbase2) = {-# SCC "mapFoldWithKey" #-}
+                mapFoldlWithKey
+                    (updateFact join dep_blocks) (todo1, fbase1) out_facts
+        in loop todo2 fbase2
+    loop _ !fbase1 = fbase1
+
+rewriteCmmBwd
+    :: DataflowLattice f
+    -> RewriteFun f
+    -> CmmGraph
+    -> FactBase f
+    -> UniqSM (CmmGraph, FactBase f)
+rewriteCmmBwd = rewriteCmm Bwd
+
+rewriteCmm
+    :: Direction
+    -> DataflowLattice f
+    -> RewriteFun f
+    -> CmmGraph
+    -> FactBase f
+    -> UniqSM (CmmGraph, FactBase f)
+rewriteCmm dir lattice rwFun cmmGraph initFact = {-# SCC rewriteCmm #-} do
+    let entry = g_entry cmmGraph
+        hooplGraph = g_graph cmmGraph
+        blockMap1 =
+            case hooplGraph of
+                GMany NothingO bm NothingO -> bm
+    (blockMap2, facts) <-
+        fixpointRewrite dir lattice rwFun entry blockMap1 initFact
+    return (cmmGraph {g_graph = GMany NothingO blockMap2 NothingO}, facts)
+
+fixpointRewrite
+    :: forall f.
+       Direction
+    -> DataflowLattice f
+    -> RewriteFun f
+    -> Label
+    -> LabelMap CmmBlock
+    -> FactBase f
+    -> UniqSM (LabelMap CmmBlock, FactBase f)
+fixpointRewrite dir lattice do_block entry blockmap = loop start blockmap
+  where
+    -- Sorting the blocks helps to minimize the number of times we need to
+    -- process blocks. For instance, for forward analysis we want to look at
+    -- blocks in reverse postorder. Also, see comments for sortBlocks.
+    blocks     = sortBlocks dir entry blockmap
+    num_blocks = length blocks
+    block_arr  = {-# SCC "block_arr_rewrite" #-}
+                 listArray (0, num_blocks - 1) blocks
+    start      = {-# SCC "start_rewrite" #-}
+                 IntSet.fromDistinctAscList [0 .. num_blocks - 1]
+    dep_blocks = {-# SCC "dep_blocks_rewrite" #-} mkDepBlocks dir blocks
+    join       = fact_join lattice
+
+    loop
+        :: IntHeap            -- ^ Worklist, i.e., blocks to process
+        -> LabelMap CmmBlock  -- ^ Rewritten blocks.
+        -> FactBase f         -- ^ Current facts.
+        -> UniqSM (LabelMap CmmBlock, FactBase f)
+    loop todo !blocks1 !fbase1
+      | Just (index, todo1) <- IntSet.minView todo = do
+        -- Note that we use the *original* block here. This is important.
+        -- We're optimistically rewriting blocks even before reaching the fixed
+        -- point, which means that the rewrite might be incorrect. So if the
+        -- facts change, we need to rewrite the original block again (taking
+        -- into account the new facts).
+        let block = block_arr ! index
+        (new_block, out_facts) <- {-# SCC "do_block_rewrite" #-}
+            do_block block fbase1
+        let blocks2 = mapInsert (entryLabel new_block) new_block blocks1
+            (todo2, fbase2) = {-# SCC "mapFoldWithKey_rewrite" #-}
+                mapFoldlWithKey
+                    (updateFact join dep_blocks) (todo1, fbase1) out_facts
+        loop todo2 blocks2 fbase2
+    loop _ !blocks1 !fbase1 = return (blocks1, fbase1)
+
+
+{-
+Note [Unreachable blocks]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+A block that is not in the domain of tfb_fbase is "currently unreachable".
+A currently-unreachable block is not even analyzed. Reason: consider
+constant prop and this graph, with entry point L1:
+  L1: x:=3; goto L4
+  L2: x:=4; goto L4
+  L4: if x>3 goto L2 else goto L5
+Here L2 is actually unreachable, but if we process it with bottom input fact,
+we'll propagate (x=4) to L4, and nuke the otherwise-good rewriting of L4.
+
+* If a currently-unreachable block is not analyzed, then its rewritten
+  graph will not be accumulated in tfb_rg. And that is good:
+  unreachable blocks simply do not appear in the output.
+
+* Note that clients must be careful to provide a fact (even if bottom)
+  for each entry point. Otherwise useful blocks may be garbage collected.
+
+* Note that updateFact must set the change-flag if a label goes from
+  not-in-fbase to in-fbase, even if its fact is bottom. In effect the
+  real fact lattice is
+       UNR
+       bottom
+       the points above bottom
+
+* Even if the fact is going from UNR to bottom, we still call the
+  client's fact_join function because it might give the client
+  some useful debugging information.
+
+* All of this only applies for *forward* fixpoints. For the backward
+  case we must treat every block as reachable; it might finish with a
+  'return', and therefore have no successors, for example.
+-}
+
+
+-----------------------------------------------------------------------------
+--  Pieces that are shared by fixpoint and fixpoint_anal
+-----------------------------------------------------------------------------
+
+-- | Sort the blocks into the right order for analysis. This means reverse
+-- postorder for a forward analysis. For the backward one, we simply reverse
+-- that (see Note [Backward vs forward analysis]).
+sortBlocks
+    :: NonLocal n
+    => Direction -> Label -> LabelMap (Block n C C) -> [Block n C C]
+sortBlocks direction entry blockmap =
+    case direction of
+        Fwd -> fwd
+        Bwd -> reverse fwd
+  where
+    fwd = revPostorderFrom blockmap entry
+
+-- Note [Backward vs forward analysis]
+--
+-- The forward and backward cases are not dual. In the forward case, the entry
+-- points are known, and one simply traverses the body blocks from those
+-- points. In the backward case, something is known about the exit points, but
+-- a backward analysis must also include reachable blocks that don't reach the
+-- exit, as in a procedure that loops forever and has side effects.
+-- For instance, let E be the entry and X the exit blocks (arrows indicate
+-- control flow)
+--   E -> X
+--   E -> B
+--   B -> C
+--   C -> B
+-- We do need to include B and C even though they're unreachable in the
+-- *reverse* graph (that we could use for backward analysis):
+--   E <- X
+--   E <- B
+--   B <- C
+--   C <- B
+-- So when sorting the blocks for the backward analysis, we simply take the
+-- reverse of what is used for the forward one.
+
+
+-- | Construct a mapping from a @Label@ to the block indexes that should be
+-- re-analyzed if the facts at that @Label@ change.
+--
+-- Note that we're considering here the entry point of the block, so if the
+-- facts change at the entry:
+-- * for a backward analysis we need to re-analyze all the predecessors, but
+-- * for a forward analysis, we only need to re-analyze the current block
+--   (and that will in turn propagate facts into its successors).
+mkDepBlocks :: Direction -> [CmmBlock] -> LabelMap IntSet
+mkDepBlocks Fwd blocks = go blocks 0 mapEmpty
+  where
+    go []     !_ !dep_map = dep_map
+    go (b:bs) !n !dep_map =
+        go bs (n + 1) $ mapInsert (entryLabel b) (IntSet.singleton n) dep_map
+mkDepBlocks Bwd blocks = go blocks 0 mapEmpty
+  where
+    go []     !_ !dep_map = dep_map
+    go (b:bs) !n !dep_map =
+        let insert m l = mapInsertWith IntSet.union l (IntSet.singleton n) m
+        in go bs (n + 1) $ foldl' insert dep_map (successors b)
+
+-- | After some new facts have been generated by analysing a block, we
+-- fold this function over them to generate (a) a list of block
+-- indices to (re-)analyse, and (b) the new FactBase.
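
A small worked case for updateFact below (a hypothetical lattice of sets
with union as the join):

    -- old fact at L = {x},   out fact = {x,y}
    --   => Changed {x,y}: insert it into the FactBase and push every
    --      block index that dep_blocks associates with L onto the worklist
    -- old fact at L = {x,y}, out fact = {x}
    --   => NotChanged: the worklist and the FactBase are left untouched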
+updateFact + :: JoinFun f + -> LabelMap IntSet + -> (IntHeap, FactBase f) + -> Label + -> f -- out fact + -> (IntHeap, FactBase f) +updateFact fact_join dep_blocks (todo, fbase) lbl new_fact + = case lookupFact lbl fbase of + Nothing -> + -- Note [No old fact] + let !z = mapInsert lbl new_fact fbase in (changed, z) + Just old_fact -> + case fact_join (OldFact old_fact) (NewFact new_fact) of + (NotChanged _) -> (todo, fbase) + (Changed f) -> let !z = mapInsert lbl f fbase in (changed, z) + where + changed = todo `IntSet.union` + mapFindWithDefault IntSet.empty lbl dep_blocks + +{- +Note [No old fact] + +We know that the new_fact is >= _|_, so we don't need to join. However, +if the new fact is also _|_, and we have already analysed its block, +we don't need to record a change. So there's a tradeoff here. It turns +out that always recording a change is faster. +-} + +---------------------------------------------------------------- +-- Utilities +---------------------------------------------------------------- + +-- Fact lookup: the fact `orelse` bottom +getFact :: DataflowLattice f -> Label -> FactBase f -> f +getFact lat l fb = case lookupFact l fb of Just f -> f + Nothing -> fact_bot lat + +-- | Returns the result of joining the facts from all the successors of the +-- provided node or block. +joinOutFacts :: (NonLocal n) => DataflowLattice f -> n e C -> FactBase f -> f +joinOutFacts lattice nonLocal fact_base = foldl' join (fact_bot lattice) facts + where + join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new) + facts = + [ fromJust fact + | s <- successors nonLocal + , let fact = lookupFact s fact_base + , isJust fact + ] + +joinFacts :: DataflowLattice f -> [f] -> f +joinFacts lattice facts = foldl' join (fact_bot lattice) facts + where + join new old = getJoined $ fact_join lattice (OldFact old) (NewFact new) + +-- | Returns the joined facts for each label. +mkFactBase :: DataflowLattice f -> [(Label, f)] -> FactBase f +mkFactBase lattice = foldl' add mapEmpty + where + join = fact_join lattice + + add result (l, f1) = + let !newFact = + case mapLookup l result of + Nothing -> f1 + Just f2 -> getJoined $ join (OldFact f1) (NewFact f2) + in mapInsert l newFact result + +-- | Folds backward over all nodes of an open-open block. +-- Strict in the accumulator. +foldNodesBwdOO :: (CmmNode O O -> f -> f) -> Block CmmNode O O -> f -> f +foldNodesBwdOO funOO = go + where + go (BCat b1 b2) f = go b1 $! go b2 f + go (BSnoc h n) f = go h $! funOO n f + go (BCons n t) f = funOO n $! go t f + go (BMiddle n) f = funOO n f + go BNil f = f +{-# INLINABLE foldNodesBwdOO #-} + +-- | Folds backward over all the nodes of an open-open block and allows +-- rewriting them. The accumulator is both the block of nodes and @f@ (usually +-- dataflow facts). +-- Strict in both accumulated parts. +foldRewriteNodesBwdOO + :: forall f. 
+ (CmmNode O O -> f -> UniqSM (Block CmmNode O O, f)) + -> Block CmmNode O O + -> f + -> UniqSM (Block CmmNode O O, f) +foldRewriteNodesBwdOO rewriteOO initBlock initFacts = go initBlock initFacts + where + go (BCons node1 block1) !fact1 = (rewriteOO node1 `comp` go block1) fact1 + go (BSnoc block1 node1) !fact1 = (go block1 `comp` rewriteOO node1) fact1 + go (BCat blockA1 blockB1) !fact1 = (go blockA1 `comp` go blockB1) fact1 + go (BMiddle node) !fact1 = rewriteOO node fact1 + go BNil !fact = return (BNil, fact) + + comp rew1 rew2 = \f1 -> do + (b, f2) <- rew2 f1 + (a, !f3) <- rew1 f2 + let !c = joinBlocksOO a b + return (c, f3) + {-# INLINE comp #-} +{-# INLINABLE foldRewriteNodesBwdOO #-} + +joinBlocksOO :: Block n O O -> Block n O O -> Block n O O +joinBlocksOO BNil b = b +joinBlocksOO b BNil = b +joinBlocksOO (BMiddle n) b = blockCons n b +joinBlocksOO b (BMiddle n) = blockSnoc b n +joinBlocksOO b1 b2 = BCat b1 b2 + +type IntHeap = IntSet diff --git a/compiler/GHC/Cmm/Dataflow/Block.hs b/compiler/GHC/Cmm/Dataflow/Block.hs new file mode 100644 index 0000000000..d2e52a8904 --- /dev/null +++ b/compiler/GHC/Cmm/Dataflow/Block.hs @@ -0,0 +1,329 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE PolyKinds #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeFamilies #-} +module GHC.Cmm.Dataflow.Block + ( Extensibility (..) + , O + , C + , MaybeO(..) + , IndexedCO + , Block(..) + , blockAppend + , blockCons + , blockFromList + , blockJoin + , blockJoinHead + , blockJoinTail + , blockSnoc + , blockSplit + , blockSplitHead + , blockSplitTail + , blockToList + , emptyBlock + , firstNode + , foldBlockNodesB + , foldBlockNodesB3 + , foldBlockNodesF + , isEmptyBlock + , lastNode + , mapBlock + , mapBlock' + , mapBlock3' + , replaceFirstNode + , replaceLastNode + ) where + +import GhcPrelude + +-- ----------------------------------------------------------------------------- +-- Shapes: Open and Closed + +-- | Used at the type level to indicate "open" vs "closed" structure. +data Extensibility + -- | An "open" structure with a unique, unnamed control-flow edge flowing in + -- or out. "Fallthrough" and concatenation are permitted at an open point. + = Open + -- | A "closed" structure which supports control transfer only through the use + -- of named labels---no "fallthrough" is permitted. The number of control-flow + -- edges is unconstrained. + | Closed + +type O = 'Open +type C = 'Closed + +-- | Either type indexed by closed/open using type families +type family IndexedCO (ex :: Extensibility) (a :: k) (b :: k) :: k +type instance IndexedCO C a _b = a +type instance IndexedCO O _a b = b + +-- | Maybe type indexed by open/closed +data MaybeO ex t where + JustO :: t -> MaybeO O t + NothingO :: MaybeO C t + +-- | Maybe type indexed by closed/open +data MaybeC ex t where + JustC :: t -> MaybeC C t + NothingC :: MaybeC O t + +deriving instance Functor (MaybeO ex) +deriving instance Functor (MaybeC ex) + +-- ----------------------------------------------------------------------------- +-- The Block type + +-- | A sequence of nodes. May be any of four shapes (O/O, O/C, C/O, C/C). +-- Open at the entry means single entry, mutatis mutandis for exit. +-- A closed/closed block is a /basic/ block and can't be extended further. +-- Clients should avoid manipulating blocks and should stick to either nodes +-- or graphs. 
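
As a concrete reading of the four closed/open shapes (an editor's
illustration in pseudocode; constructor arguments abbreviated): the basic
block 'L: x = 1; goto M' is closed/closed, and removing its boundary
nodes changes its shape:

    BlockCC entry middles exit    -- C/C: a complete basic block
    BlockOC       middles exit    -- O/C: entry label removed
    BlockCO entry middles         -- C/O: final control transfer removed
    middles                       -- O/O: just a sequence of middle nodes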
+data Block n e x where + BlockCO :: n C O -> Block n O O -> Block n C O + BlockCC :: n C O -> Block n O O -> n O C -> Block n C C + BlockOC :: Block n O O -> n O C -> Block n O C + + BNil :: Block n O O + BMiddle :: n O O -> Block n O O + BCat :: Block n O O -> Block n O O -> Block n O O + BSnoc :: Block n O O -> n O O -> Block n O O + BCons :: n O O -> Block n O O -> Block n O O + + +-- ----------------------------------------------------------------------------- +-- Simple operations on Blocks + +-- Predicates + +isEmptyBlock :: Block n e x -> Bool +isEmptyBlock BNil = True +isEmptyBlock (BCat l r) = isEmptyBlock l && isEmptyBlock r +isEmptyBlock _ = False + + +-- Building + +emptyBlock :: Block n O O +emptyBlock = BNil + +blockCons :: n O O -> Block n O x -> Block n O x +blockCons n b = case b of + BlockOC b l -> (BlockOC $! (n `blockCons` b)) l + BNil{} -> BMiddle n + BMiddle{} -> n `BCons` b + BCat{} -> n `BCons` b + BSnoc{} -> n `BCons` b + BCons{} -> n `BCons` b + +blockSnoc :: Block n e O -> n O O -> Block n e O +blockSnoc b n = case b of + BlockCO f b -> BlockCO f $! (b `blockSnoc` n) + BNil{} -> BMiddle n + BMiddle{} -> b `BSnoc` n + BCat{} -> b `BSnoc` n + BSnoc{} -> b `BSnoc` n + BCons{} -> b `BSnoc` n + +blockJoinHead :: n C O -> Block n O x -> Block n C x +blockJoinHead f (BlockOC b l) = BlockCC f b l +blockJoinHead f b = BlockCO f BNil `cat` b + +blockJoinTail :: Block n e O -> n O C -> Block n e C +blockJoinTail (BlockCO f b) t = BlockCC f b t +blockJoinTail b t = b `cat` BlockOC BNil t + +blockJoin :: n C O -> Block n O O -> n O C -> Block n C C +blockJoin f b t = BlockCC f b t + +blockAppend :: Block n e O -> Block n O x -> Block n e x +blockAppend = cat + + +-- Taking apart + +firstNode :: Block n C x -> n C O +firstNode (BlockCO n _) = n +firstNode (BlockCC n _ _) = n + +lastNode :: Block n x C -> n O C +lastNode (BlockOC _ n) = n +lastNode (BlockCC _ _ n) = n + +blockSplitHead :: Block n C x -> (n C O, Block n O x) +blockSplitHead (BlockCO n b) = (n, b) +blockSplitHead (BlockCC n b t) = (n, BlockOC b t) + +blockSplitTail :: Block n e C -> (Block n e O, n O C) +blockSplitTail (BlockOC b n) = (b, n) +blockSplitTail (BlockCC f b t) = (BlockCO f b, t) + +-- | Split a closed block into its entry node, open middle block, and +-- exit node. +blockSplit :: Block n C C -> (n C O, Block n O O, n O C) +blockSplit (BlockCC f b t) = (f, b, t) + +blockToList :: Block n O O -> [n O O] +blockToList b = go b [] + where go :: Block n O O -> [n O O] -> [n O O] + go BNil r = r + go (BMiddle n) r = n : r + go (BCat b1 b2) r = go b1 $! go b2 r + go (BSnoc b1 n) r = go b1 (n:r) + go (BCons n b1) r = n : go b1 r + +blockFromList :: [n O O] -> Block n O O +blockFromList = foldr BCons BNil + +-- Modifying + +replaceFirstNode :: Block n C x -> n C O -> Block n C x +replaceFirstNode (BlockCO _ b) f = BlockCO f b +replaceFirstNode (BlockCC _ b n) f = BlockCC f b n + +replaceLastNode :: Block n x C -> n O C -> Block n x C +replaceLastNode (BlockOC b _) n = BlockOC b n +replaceLastNode (BlockCC l b _) n = BlockCC l b n + +-- ----------------------------------------------------------------------------- +-- General concatenation + +cat :: Block n e O -> Block n O x -> Block n e x +cat x y = case x of + BNil -> y + + BlockCO l b1 -> case y of + BlockOC b2 n -> (BlockCC l $! (b1 `cat` b2)) n + BNil -> x + BMiddle _ -> BlockCO l $! (b1 `cat` y) + BCat{} -> BlockCO l $! (b1 `cat` y) + BSnoc{} -> BlockCO l $! (b1 `cat` y) + BCons{} -> BlockCO l $! 
(b1 `cat` y) + + BMiddle n -> case y of + BlockOC b2 n2 -> (BlockOC $! (x `cat` b2)) n2 + BNil -> x + BMiddle{} -> BCons n y + BCat{} -> BCons n y + BSnoc{} -> BCons n y + BCons{} -> BCons n y + + BCat{} -> case y of + BlockOC b3 n2 -> (BlockOC $! (x `cat` b3)) n2 + BNil -> x + BMiddle n -> BSnoc x n + BCat{} -> BCat x y + BSnoc{} -> BCat x y + BCons{} -> BCat x y + + BSnoc{} -> case y of + BlockOC b2 n2 -> (BlockOC $! (x `cat` b2)) n2 + BNil -> x + BMiddle n -> BSnoc x n + BCat{} -> BCat x y + BSnoc{} -> BCat x y + BCons{} -> BCat x y + + + BCons{} -> case y of + BlockOC b2 n2 -> (BlockOC $! (x `cat` b2)) n2 + BNil -> x + BMiddle n -> BSnoc x n + BCat{} -> BCat x y + BSnoc{} -> BCat x y + BCons{} -> BCat x y + + +-- ----------------------------------------------------------------------------- +-- Mapping + +-- | map a function over the nodes of a 'Block' +mapBlock :: (forall e x. n e x -> n' e x) -> Block n e x -> Block n' e x +mapBlock f (BlockCO n b ) = BlockCO (f n) (mapBlock f b) +mapBlock f (BlockOC b n) = BlockOC (mapBlock f b) (f n) +mapBlock f (BlockCC n b m) = BlockCC (f n) (mapBlock f b) (f m) +mapBlock _ BNil = BNil +mapBlock f (BMiddle n) = BMiddle (f n) +mapBlock f (BCat b1 b2) = BCat (mapBlock f b1) (mapBlock f b2) +mapBlock f (BSnoc b n) = BSnoc (mapBlock f b) (f n) +mapBlock f (BCons n b) = BCons (f n) (mapBlock f b) + +-- | A strict 'mapBlock' +mapBlock' :: (forall e x. n e x -> n' e x) -> (Block n e x -> Block n' e x) +mapBlock' f = mapBlock3' (f, f, f) + +-- | map over a block, with different functions to apply to first nodes, +-- middle nodes and last nodes respectively. The map is strict. +-- +mapBlock3' :: forall n n' e x . + ( n C O -> n' C O + , n O O -> n' O O, + n O C -> n' O C) + -> Block n e x -> Block n' e x +mapBlock3' (f, m, l) b = go b + where go :: forall e x . Block n e x -> Block n' e x + go (BlockOC b y) = (BlockOC $! go b) $! l y + go (BlockCO x b) = (BlockCO $! f x) $! (go b) + go (BlockCC x b y) = ((BlockCC $! f x) $! go b) $! (l y) + go BNil = BNil + go (BMiddle n) = BMiddle $! m n + go (BCat x y) = (BCat $! go x) $! (go y) + go (BSnoc x n) = (BSnoc $! go x) $! (m n) + go (BCons n x) = (BCons $! m n) $! (go x) + +-- ----------------------------------------------------------------------------- +-- Folding + + +-- | Fold a function over every node in a block, forward or backward. +-- The fold function must be polymorphic in the shape of the nodes. +foldBlockNodesF3 :: forall n a b c . + ( n C O -> a -> b + , n O O -> b -> b + , n O C -> b -> c) + -> (forall e x . Block n e x -> IndexedCO e a b -> IndexedCO x c b) +foldBlockNodesF :: forall n a . + (forall e x . n e x -> a -> a) + -> (forall e x . Block n e x -> IndexedCO e a a -> IndexedCO x a a) +foldBlockNodesB3 :: forall n a b c . + ( n C O -> b -> c + , n O O -> b -> b + , n O C -> a -> b) + -> (forall e x . Block n e x -> IndexedCO x a b -> IndexedCO e c b) +foldBlockNodesB :: forall n a . + (forall e x . n e x -> a -> a) + -> (forall e x . Block n e x -> IndexedCO x a a -> IndexedCO e a a) + +foldBlockNodesF3 (ff, fm, fl) = block + where block :: forall e x . Block n e x -> IndexedCO e a b -> IndexedCO x c b + block (BlockCO f b ) = ff f `cat` block b + block (BlockCC f b l) = ff f `cat` block b `cat` fl l + block (BlockOC b l) = block b `cat` fl l + block BNil = id + block (BMiddle node) = fm node + block (b1 `BCat` b2) = block b1 `cat` block b2 + block (b1 `BSnoc` n) = block b1 `cat` fm n + block (n `BCons` b2) = fm n `cat` block b2 + cat :: forall a b c. 
(a -> b) -> (b -> c) -> a -> c + cat f f' = f' . f + +foldBlockNodesF f = foldBlockNodesF3 (f, f, f) + +foldBlockNodesB3 (ff, fm, fl) = block + where block :: forall e x . Block n e x -> IndexedCO x a b -> IndexedCO e c b + block (BlockCO f b ) = ff f `cat` block b + block (BlockCC f b l) = ff f `cat` block b `cat` fl l + block (BlockOC b l) = block b `cat` fl l + block BNil = id + block (BMiddle node) = fm node + block (b1 `BCat` b2) = block b1 `cat` block b2 + block (b1 `BSnoc` n) = block b1 `cat` fm n + block (n `BCons` b2) = fm n `cat` block b2 + cat :: forall a b c. (b -> c) -> (a -> b) -> a -> c + cat f f' = f . f' + +foldBlockNodesB f = foldBlockNodesB3 (f, f, f) + diff --git a/compiler/GHC/Cmm/Dataflow/Collections.hs b/compiler/GHC/Cmm/Dataflow/Collections.hs new file mode 100644 index 0000000000..f131f17cc1 --- /dev/null +++ b/compiler/GHC/Cmm/Dataflow/Collections.hs @@ -0,0 +1,177 @@ +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DeriveTraversable #-} +{-# LANGUAGE DeriveFoldable #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +module GHC.Cmm.Dataflow.Collections + ( IsSet(..) + , setInsertList, setDeleteList, setUnions + , IsMap(..) + , mapInsertList, mapDeleteList, mapUnions + , UniqueMap, UniqueSet + ) where + +import GhcPrelude + +import qualified Data.IntMap.Strict as M +import qualified Data.IntSet as S + +import Data.List (foldl1') + +class IsSet set where + type ElemOf set + + setNull :: set -> Bool + setSize :: set -> Int + setMember :: ElemOf set -> set -> Bool + + setEmpty :: set + setSingleton :: ElemOf set -> set + setInsert :: ElemOf set -> set -> set + setDelete :: ElemOf set -> set -> set + + setUnion :: set -> set -> set + setDifference :: set -> set -> set + setIntersection :: set -> set -> set + setIsSubsetOf :: set -> set -> Bool + setFilter :: (ElemOf set -> Bool) -> set -> set + + setFoldl :: (b -> ElemOf set -> b) -> b -> set -> b + setFoldr :: (ElemOf set -> b -> b) -> b -> set -> b + + setElems :: set -> [ElemOf set] + setFromList :: [ElemOf set] -> set + +-- Helper functions for IsSet class +setInsertList :: IsSet set => [ElemOf set] -> set -> set +setInsertList keys set = foldl' (flip setInsert) set keys + +setDeleteList :: IsSet set => [ElemOf set] -> set -> set +setDeleteList keys set = foldl' (flip setDelete) set keys + +setUnions :: IsSet set => [set] -> set +setUnions [] = setEmpty +setUnions sets = foldl1' setUnion sets + + +class IsMap map where + type KeyOf map + + mapNull :: map a -> Bool + mapSize :: map a -> Int + mapMember :: KeyOf map -> map a -> Bool + mapLookup :: KeyOf map -> map a -> Maybe a + mapFindWithDefault :: a -> KeyOf map -> map a -> a + + mapEmpty :: map a + mapSingleton :: KeyOf map -> a -> map a + mapInsert :: KeyOf map -> a -> map a -> map a + mapInsertWith :: (a -> a -> a) -> KeyOf map -> a -> map a -> map a + mapDelete :: KeyOf map -> map a -> map a + mapAlter :: (Maybe a -> Maybe a) -> KeyOf map -> map a -> map a + mapAdjust :: (a -> a) -> KeyOf map -> map a -> map a + + mapUnion :: map a -> map a -> map a + mapUnionWithKey :: (KeyOf map -> a -> a -> a) -> map a -> map a -> map a + mapDifference :: map a -> map a -> map a + mapIntersection :: map a -> map a -> map a + mapIsSubmapOf :: Eq a => map a -> map a -> Bool + + mapMap :: (a -> b) -> map a -> map b + mapMapWithKey :: (KeyOf map -> a -> b) -> map a -> map b + mapFoldl :: (b -> a -> b) -> b -> map a -> b + mapFoldr :: (a -> b -> b) -> b -> map a -> b + mapFoldlWithKey :: (b -> KeyOf map -> a -> b) -> b -> map a -> b + 
mapFoldMapWithKey :: Monoid m => (KeyOf map -> a -> m) -> map a -> m + mapFilter :: (a -> Bool) -> map a -> map a + mapFilterWithKey :: (KeyOf map -> a -> Bool) -> map a -> map a + + + mapElems :: map a -> [a] + mapKeys :: map a -> [KeyOf map] + mapToList :: map a -> [(KeyOf map, a)] + mapFromList :: [(KeyOf map, a)] -> map a + mapFromListWith :: (a -> a -> a) -> [(KeyOf map,a)] -> map a + +-- Helper functions for IsMap class +mapInsertList :: IsMap map => [(KeyOf map, a)] -> map a -> map a +mapInsertList assocs map = foldl' (flip (uncurry mapInsert)) map assocs + +mapDeleteList :: IsMap map => [KeyOf map] -> map a -> map a +mapDeleteList keys map = foldl' (flip mapDelete) map keys + +mapUnions :: IsMap map => [map a] -> map a +mapUnions [] = mapEmpty +mapUnions maps = foldl1' mapUnion maps + +----------------------------------------------------------------------------- +-- Basic instances +----------------------------------------------------------------------------- + +newtype UniqueSet = US S.IntSet deriving (Eq, Ord, Show, Semigroup, Monoid) + +instance IsSet UniqueSet where + type ElemOf UniqueSet = Int + + setNull (US s) = S.null s + setSize (US s) = S.size s + setMember k (US s) = S.member k s + + setEmpty = US S.empty + setSingleton k = US (S.singleton k) + setInsert k (US s) = US (S.insert k s) + setDelete k (US s) = US (S.delete k s) + + setUnion (US x) (US y) = US (S.union x y) + setDifference (US x) (US y) = US (S.difference x y) + setIntersection (US x) (US y) = US (S.intersection x y) + setIsSubsetOf (US x) (US y) = S.isSubsetOf x y + setFilter f (US s) = US (S.filter f s) + + setFoldl k z (US s) = S.foldl' k z s + setFoldr k z (US s) = S.foldr k z s + + setElems (US s) = S.elems s + setFromList ks = US (S.fromList ks) + +newtype UniqueMap v = UM (M.IntMap v) + deriving (Eq, Ord, Show, Functor, Foldable, Traversable) + +instance IsMap UniqueMap where + type KeyOf UniqueMap = Int + + mapNull (UM m) = M.null m + mapSize (UM m) = M.size m + mapMember k (UM m) = M.member k m + mapLookup k (UM m) = M.lookup k m + mapFindWithDefault def k (UM m) = M.findWithDefault def k m + + mapEmpty = UM M.empty + mapSingleton k v = UM (M.singleton k v) + mapInsert k v (UM m) = UM (M.insert k v m) + mapInsertWith f k v (UM m) = UM (M.insertWith f k v m) + mapDelete k (UM m) = UM (M.delete k m) + mapAlter f k (UM m) = UM (M.alter f k m) + mapAdjust f k (UM m) = UM (M.adjust f k m) + + mapUnion (UM x) (UM y) = UM (M.union x y) + mapUnionWithKey f (UM x) (UM y) = UM (M.unionWithKey f x y) + mapDifference (UM x) (UM y) = UM (M.difference x y) + mapIntersection (UM x) (UM y) = UM (M.intersection x y) + mapIsSubmapOf (UM x) (UM y) = M.isSubmapOf x y + + mapMap f (UM m) = UM (M.map f m) + mapMapWithKey f (UM m) = UM (M.mapWithKey f m) + mapFoldl k z (UM m) = M.foldl' k z m + mapFoldr k z (UM m) = M.foldr k z m + mapFoldlWithKey k z (UM m) = M.foldlWithKey' k z m + mapFoldMapWithKey f (UM m) = M.foldMapWithKey f m + mapFilter f (UM m) = UM (M.filter f m) + mapFilterWithKey f (UM m) = UM (M.filterWithKey f m) + + mapElems (UM m) = M.elems m + mapKeys (UM m) = M.keys m + mapToList (UM m) = M.toList m + mapFromList assocs = UM (M.fromList assocs) + mapFromListWith f assocs = UM (M.fromListWith f assocs) diff --git a/compiler/GHC/Cmm/Dataflow/Graph.hs b/compiler/GHC/Cmm/Dataflow/Graph.hs new file mode 100644 index 0000000000..3f361de0fb --- /dev/null +++ b/compiler/GHC/Cmm/Dataflow/Graph.hs @@ -0,0 +1,186 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE FlexibleInstances #-} +{-# 
LANGUAGE GADTs #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeFamilies #-} +module GHC.Cmm.Dataflow.Graph + ( Body + , Graph + , Graph'(..) + , NonLocal(..) + , addBlock + , bodyList + , emptyBody + , labelsDefined + , mapGraph + , mapGraphBlocks + , revPostorderFrom + ) where + + +import GhcPrelude +import Util + +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections + +-- | A (possibly empty) collection of closed/closed blocks +type Body n = LabelMap (Block n C C) + +-- | @Body@ abstracted over @block@ +type Body' block (n :: Extensibility -> Extensibility -> *) = LabelMap (block n C C) + +------------------------------- +-- | Gives access to the anchor points for +-- nonlocal edges as well as the edges themselves +class NonLocal thing where + entryLabel :: thing C x -> Label -- ^ The label of a first node or block + successors :: thing e C -> [Label] -- ^ Gives control-flow successors + +instance NonLocal n => NonLocal (Block n) where + entryLabel (BlockCO f _) = entryLabel f + entryLabel (BlockCC f _ _) = entryLabel f + + successors (BlockOC _ n) = successors n + successors (BlockCC _ _ n) = successors n + + +emptyBody :: Body' block n +emptyBody = mapEmpty + +bodyList :: Body' block n -> [(Label,block n C C)] +bodyList body = mapToList body + +addBlock + :: (NonLocal block, HasDebugCallStack) + => block C C -> LabelMap (block C C) -> LabelMap (block C C) +addBlock block body = mapAlter add lbl body + where + lbl = entryLabel block + add Nothing = Just block + add _ = error $ "duplicate label " ++ show lbl ++ " in graph" + + +-- --------------------------------------------------------------------------- +-- Graph + +-- | A control-flow graph, which may take any of four shapes (O/O, +-- O/C, C/O, C/C). A graph open at the entry has a single, +-- distinguished, anonymous entry point; if a graph is closed at the +-- entry, its entry point(s) are supplied by a context. +type Graph = Graph' Block + +-- | @Graph'@ is abstracted over the block type, so that we can build +-- graphs of annotated blocks for example (Compiler.Hoopl.Dataflow +-- needs this). +data Graph' block (n :: Extensibility -> Extensibility -> *) e x where + GNil :: Graph' block n O O + GUnit :: block n O O -> Graph' block n O O + GMany :: MaybeO e (block n O C) + -> Body' block n + -> MaybeO x (block n C O) + -> Graph' block n e x + + +-- ----------------------------------------------------------------------------- +-- Mapping over graphs + +-- | Maps over all nodes in a graph. +mapGraph :: (forall e x. n e x -> n' e x) -> Graph n e x -> Graph n' e x +mapGraph f = mapGraphBlocks (mapBlock f) + +-- | Function 'mapGraphBlocks' enables a change of representation of blocks, +-- nodes, or both. It lifts a polymorphic block transform into a polymorphic +-- graph transform. When the block representation stabilizes, a similar +-- function should be provided for blocks. +mapGraphBlocks :: forall block n block' n' e x . + (forall e x . block n e x -> block' n' e x) + -> (Graph' block n e x -> Graph' block' n' e x) + +mapGraphBlocks f = map + where map :: Graph' block n e x -> Graph' block' n' e x + map GNil = GNil + map (GUnit b) = GUnit (f b) + map (GMany e b x) = GMany (fmap f e) (mapMap f b) (fmap f x) + +-- ----------------------------------------------------------------------------- +-- Extracting Labels from graphs + +labelsDefined :: forall block n e x . 
NonLocal (block n) => Graph' block n e x + -> LabelSet +labelsDefined GNil = setEmpty +labelsDefined (GUnit{}) = setEmpty +labelsDefined (GMany _ body x) = mapFoldlWithKey addEntry (exitLabel x) body + where addEntry :: forall a. LabelSet -> ElemOf LabelSet -> a -> LabelSet + addEntry labels label _ = setInsert label labels + exitLabel :: MaybeO x (block n C O) -> LabelSet + exitLabel NothingO = setEmpty + exitLabel (JustO b) = setSingleton (entryLabel b) + + +---------------------------------------------------------------- + +-- | Returns a list of blocks reachable from the provided Labels in the reverse +-- postorder. +-- +-- This is the most important traversal over this data structure. It drops +-- unreachable code and puts blocks in an order that is good for solving forward +-- dataflow problems quickly. The reverse order is good for solving backward +-- dataflow problems quickly. The forward order is also reasonably good for +-- emitting instructions, except that it will not usually exploit Forrest +-- Baskett's trick of eliminating the unconditional branch from a loop. For +-- that you would need a more serious analysis, probably based on dominators, to +-- identify loop headers. +-- +-- For forward analyses we want reverse postorder visitation, consider: +-- @ +-- A -> [B,C] +-- B -> D +-- C -> D +-- @ +-- Postorder: [D, C, B, A] (or [D, B, C, A]) +-- Reverse postorder: [A, B, C, D] (or [A, C, B, D]) +-- This matters for, e.g., forward analysis, because we want to analyze *both* +-- B and C before we analyze D. +revPostorderFrom + :: forall block. (NonLocal block) + => LabelMap (block C C) -> Label -> [block C C] +revPostorderFrom graph start = go start_worklist setEmpty [] + where + start_worklist = lookup_for_descend start Nil + + -- To compute the postorder we need to "visit" a block (mark as done) + -- *after* visiting all its successors. So we need to know whether we + -- already processed all successors of each block (and @NonLocal@ allows + -- arbitrary many successors). So we use an explicit stack with an extra bit + -- of information: + -- * @ConsTodo@ means to explore the block if it wasn't visited before + -- * @ConsMark@ means that all successors were already done and we can add + -- the block to the result. + -- + -- NOTE: We add blocks to the result list in postorder, but we *prepend* + -- them (i.e., we use @(:)@), which means that the final list is in reverse + -- postorder. + go :: DfsStack (block C C) -> LabelSet -> [block C C] -> [block C C] + go Nil !_ !result = result + go (ConsMark block rest) !wip_or_done !result = + go rest wip_or_done (block : result) + go (ConsTodo block rest) !wip_or_done !result + | entryLabel block `setMember` wip_or_done = go rest wip_or_done result + | otherwise = + let new_worklist = + foldr lookup_for_descend + (ConsMark block rest) + (successors block) + in go new_worklist (setInsert (entryLabel block) wip_or_done) result + + lookup_for_descend :: Label -> DfsStack (block C C) -> DfsStack (block C C) + lookup_for_descend label wl + | Just b <- mapLookup label graph = ConsTodo b wl + | otherwise = + error $ "Label that doesn't have a block?! 
" ++ show label + +data DfsStack a = ConsTodo a (DfsStack a) | ConsMark a (DfsStack a) | Nil diff --git a/compiler/GHC/Cmm/Dataflow/Label.hs b/compiler/GHC/Cmm/Dataflow/Label.hs new file mode 100644 index 0000000000..c571cedb48 --- /dev/null +++ b/compiler/GHC/Cmm/Dataflow/Label.hs @@ -0,0 +1,142 @@ +{-# LANGUAGE DeriveFoldable #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE DeriveTraversable #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} + +module GHC.Cmm.Dataflow.Label + ( Label + , LabelMap + , LabelSet + , FactBase + , lookupFact + , mkHooplLabel + ) where + +import GhcPrelude + +import Outputable + +-- TODO: This should really just use GHC's Unique and Uniq{Set,FM} +import GHC.Cmm.Dataflow.Collections + +import Unique (Uniquable(..)) +import TrieMap + + +----------------------------------------------------------------------------- +-- Label +----------------------------------------------------------------------------- + +newtype Label = Label { lblToUnique :: Int } + deriving (Eq, Ord) + +mkHooplLabel :: Int -> Label +mkHooplLabel = Label + +instance Show Label where + show (Label n) = "L" ++ show n + +instance Uniquable Label where + getUnique label = getUnique (lblToUnique label) + +instance Outputable Label where + ppr label = ppr (getUnique label) + +----------------------------------------------------------------------------- +-- LabelSet + +newtype LabelSet = LS UniqueSet deriving (Eq, Ord, Show, Monoid, Semigroup) + +instance IsSet LabelSet where + type ElemOf LabelSet = Label + + setNull (LS s) = setNull s + setSize (LS s) = setSize s + setMember (Label k) (LS s) = setMember k s + + setEmpty = LS setEmpty + setSingleton (Label k) = LS (setSingleton k) + setInsert (Label k) (LS s) = LS (setInsert k s) + setDelete (Label k) (LS s) = LS (setDelete k s) + + setUnion (LS x) (LS y) = LS (setUnion x y) + setDifference (LS x) (LS y) = LS (setDifference x y) + setIntersection (LS x) (LS y) = LS (setIntersection x y) + setIsSubsetOf (LS x) (LS y) = setIsSubsetOf x y + setFilter f (LS s) = LS (setFilter (f . mkHooplLabel) s) + setFoldl k z (LS s) = setFoldl (\a v -> k a (mkHooplLabel v)) z s + setFoldr k z (LS s) = setFoldr (\v a -> k (mkHooplLabel v) a) z s + + setElems (LS s) = map mkHooplLabel (setElems s) + setFromList ks = LS (setFromList (map lblToUnique ks)) + +----------------------------------------------------------------------------- +-- LabelMap + +newtype LabelMap v = LM (UniqueMap v) + deriving (Eq, Ord, Show, Functor, Foldable, Traversable) + +instance IsMap LabelMap where + type KeyOf LabelMap = Label + + mapNull (LM m) = mapNull m + mapSize (LM m) = mapSize m + mapMember (Label k) (LM m) = mapMember k m + mapLookup (Label k) (LM m) = mapLookup k m + mapFindWithDefault def (Label k) (LM m) = mapFindWithDefault def k m + + mapEmpty = LM mapEmpty + mapSingleton (Label k) v = LM (mapSingleton k v) + mapInsert (Label k) v (LM m) = LM (mapInsert k v m) + mapInsertWith f (Label k) v (LM m) = LM (mapInsertWith f k v m) + mapDelete (Label k) (LM m) = LM (mapDelete k m) + mapAlter f (Label k) (LM m) = LM (mapAlter f k m) + mapAdjust f (Label k) (LM m) = LM (mapAdjust f k m) + + mapUnion (LM x) (LM y) = LM (mapUnion x y) + mapUnionWithKey f (LM x) (LM y) = LM (mapUnionWithKey (f . mkHooplLabel) x y) + mapDifference (LM x) (LM y) = LM (mapDifference x y) + mapIntersection (LM x) (LM y) = LM (mapIntersection x y) + mapIsSubmapOf (LM x) (LM y) = mapIsSubmapOf x y + + mapMap f (LM m) = LM (mapMap f m) + mapMapWithKey f (LM m) = LM (mapMapWithKey (f . 
mkHooplLabel) m) + mapFoldl k z (LM m) = mapFoldl k z m + mapFoldr k z (LM m) = mapFoldr k z m + mapFoldlWithKey k z (LM m) = + mapFoldlWithKey (\a v -> k a (mkHooplLabel v)) z m + mapFoldMapWithKey f (LM m) = mapFoldMapWithKey (\k v -> f (mkHooplLabel k) v) m + mapFilter f (LM m) = LM (mapFilter f m) + mapFilterWithKey f (LM m) = LM (mapFilterWithKey (f . mkHooplLabel) m) + + mapElems (LM m) = mapElems m + mapKeys (LM m) = map mkHooplLabel (mapKeys m) + mapToList (LM m) = [(mkHooplLabel k, v) | (k, v) <- mapToList m] + mapFromList assocs = LM (mapFromList [(lblToUnique k, v) | (k, v) <- assocs]) + mapFromListWith f assocs = LM (mapFromListWith f [(lblToUnique k, v) | (k, v) <- assocs]) + +----------------------------------------------------------------------------- +-- Instances + +instance Outputable LabelSet where + ppr = ppr . setElems + +instance Outputable a => Outputable (LabelMap a) where + ppr = ppr . mapToList + +instance TrieMap LabelMap where + type Key LabelMap = Label + emptyTM = mapEmpty + lookupTM k m = mapLookup k m + alterTM k f m = mapAlter f k m + foldTM k m z = mapFoldr k z m + mapTM f m = mapMap f m + +----------------------------------------------------------------------------- +-- FactBase + +type FactBase f = LabelMap f + +lookupFact :: Label -> FactBase f -> Maybe f +lookupFact = mapLookup diff --git a/compiler/GHC/Cmm/DebugBlock.hs b/compiler/GHC/Cmm/DebugBlock.hs new file mode 100644 index 0000000000..70fc08ee94 --- /dev/null +++ b/compiler/GHC/Cmm/DebugBlock.hs @@ -0,0 +1,546 @@ +{-# LANGUAGE GADTs #-} +{-# LANGUAGE MultiWayIf #-} + +----------------------------------------------------------------------------- +-- +-- Debugging data +-- +-- Association of debug data on the Cmm level, with methods to encode it in +-- event log format for later inclusion in profiling event logs. +-- +----------------------------------------------------------------------------- + +module GHC.Cmm.DebugBlock ( + + DebugBlock(..), + cmmDebugGen, + cmmDebugLabels, + cmmDebugLink, + debugToMap, + + -- * Unwinding information + UnwindTable, UnwindPoint(..), + UnwindExpr(..), toUnwindExpr + ) where + +import GhcPrelude + +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm +import GHC.Cmm.Utils +import CoreSyn +import FastString ( nilFS, mkFastString ) +import Module +import Outputable +import GHC.Cmm.Ppr.Expr ( pprExpr ) +import SrcLoc +import Util ( seqList ) + +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label + +import Data.Maybe +import Data.List ( minimumBy, nubBy ) +import Data.Ord ( comparing ) +import qualified Data.Map as Map +import Data.Either ( partitionEithers ) + +-- | Debug information about a block of code. Ticks scope over nested +-- blocks. +data DebugBlock = + DebugBlock + { dblProcedure :: !Label -- ^ Entry label of containing proc + , dblLabel :: !Label -- ^ Hoopl label + , dblCLabel :: !CLabel -- ^ Output label + , dblHasInfoTbl :: !Bool -- ^ Has an info table? + , dblParent :: !(Maybe DebugBlock) + -- ^ The parent of this proc. See Note [Splitting DebugBlocks] + , dblTicks :: ![CmmTickish] -- ^ Ticks defined in this block + , dblSourceTick :: !(Maybe CmmTickish) -- ^ Best source tick covering block + , dblPosition :: !(Maybe Int) -- ^ Output position relative to + -- other blocks. 
@Nothing@ means
+                                          -- the block was optimized out
+   , dblUnwind     :: [UnwindPoint]
+   , dblBlocks     :: ![DebugBlock]     -- ^ Nested blocks
+   }
+
+instance Outputable DebugBlock where
+  ppr blk = (if | dblProcedure blk == dblLabel blk
+                  -> text "proc"
+                | dblHasInfoTbl blk
+                  -> text "pp-blk"
+                | otherwise
+                  -> text "blk") <+>
+            ppr (dblLabel blk) <+> parens (ppr (dblCLabel blk)) <+>
+            (maybe empty ppr (dblSourceTick blk)) <+>
+            (maybe (text "removed") ((text "pos " <>) . ppr)
+                   (dblPosition blk)) <+>
+            (ppr (dblUnwind blk)) $+$
+            (if null (dblBlocks blk) then empty else nest 4 (ppr (dblBlocks blk)))
+
+-- | Intermediate data structure holding debug-relevant context information
+-- about a block.
+type BlockContext = (CmmBlock, RawCmmDecl)
+
+-- | Extract debug data from a group of procedures. We will prefer
+-- source notes that come from the given module (presumably the module
+-- that we are currently compiling).
+cmmDebugGen :: ModLocation -> RawCmmGroup -> [DebugBlock]
+cmmDebugGen modLoc decls = map (blocksForScope Nothing) topScopes
+  where
+      blockCtxs :: Map.Map CmmTickScope [BlockContext]
+      blockCtxs = blockContexts decls
+
+      -- Analyse tick scope structure: Each one is either a top-level
+      -- tick scope, or the child of another.
+      (topScopes, childScopes)
+         = partitionEithers $ map (\a -> findP a a) $ Map.keys blockCtxs
+      findP tsc GlobalScope = Left tsc -- top scope
+      findP tsc scp | scp' `Map.member` blockCtxs = Right (scp', tsc)
+                    | otherwise                   = findP tsc scp'
+        where -- Note that we only follow the left parent of
+              -- combined scopes. This loses us ticks, which we will
+              -- recover by copying ticks below.
+              scp' | SubScope _ scp' <- scp      = scp'
+                   | CombinedScope scp' _ <- scp = scp'
+                   | otherwise                   = panic "findP impossible"
+
+      scopeMap = foldr (uncurry insertMulti) Map.empty childScopes
+
+      -- This allows us to recover ticks that we lost by flattening
+      -- the graph. Basically, if the parent is A but the child is
+      -- CBA, we know that there is no BA, because it would have taken
+      -- priority - but there might be a B scope, with ticks that
+      -- would not be associated with our child anymore. Note however
+      -- that there might be other children (DB), which we have to
+      -- filter out.
+      --
+      -- We expect this to be called rarely, which is why we are not
+      -- trying too hard to be efficient here. In many cases we won't
+      -- have to construct blockCtxs in the first place.
+      ticksToCopy :: CmmTickScope -> [CmmTickish]
+      ticksToCopy (CombinedScope scp s) = go s
+        where go s | scp `isTickSubScope` s   = [] -- done
+                   | SubScope _ s' <- s       = ticks ++ go s'
+                   | CombinedScope s1 s2 <- s = ticks ++ go s1 ++ go s2
+                   | otherwise                = panic "ticksToCopy impossible"
+                where ticks = bCtxsTicks $ fromMaybe [] $ Map.lookup s blockCtxs
+      ticksToCopy _ = []
+      bCtxsTicks = concatMap (blockTicks . fst)
+
+      -- Finding the "best" source tick is somewhat arbitrary -- we
+      -- select the first source span, while preferring source ticks
+      -- from the same source file. Furthermore, dumps take priority
+      -- (if we generated one, we probably want debug information to
+      -- refer to it).
+      bestSrcTick = minimumBy (comparing rangeRating)
+      rangeRating (SourceNote span _)
+        | srcSpanFile span == thisFile = 1
+        | otherwise                    = 2 :: Int
+      rangeRating note = pprPanic "rangeRating" (ppr note)
+      thisFile = maybe nilFS mkFastString $ ml_hs_file modLoc
+
+      -- Returns block tree for this scope as well as all nested
+      -- scopes. 
Note that if there are multiple blocks in the (exact) + -- same scope we elect one as the "branch" node and add the rest + -- as children. + blocksForScope :: Maybe CmmTickish -> CmmTickScope -> DebugBlock + blocksForScope cstick scope = mkBlock True (head bctxs) + where bctxs = fromJust $ Map.lookup scope blockCtxs + nested = fromMaybe [] $ Map.lookup scope scopeMap + childs = map (mkBlock False) (tail bctxs) ++ + map (blocksForScope stick) nested + + mkBlock :: Bool -> BlockContext -> DebugBlock + mkBlock top (block, prc) + = DebugBlock { dblProcedure = g_entry graph + , dblLabel = label + , dblCLabel = case info of + Just (Statics infoLbl _) -> infoLbl + Nothing + | g_entry graph == label -> entryLbl + | otherwise -> blockLbl label + , dblHasInfoTbl = isJust info + , dblParent = Nothing + , dblTicks = ticks + , dblPosition = Nothing -- see cmmDebugLink + , dblSourceTick = stick + , dblBlocks = blocks + , dblUnwind = [] + } + where (CmmProc infos entryLbl _ graph) = prc + label = entryLabel block + info = mapLookup label infos + blocks | top = seqList childs childs + | otherwise = [] + + -- A source tick scopes over all nested blocks. However + -- their source ticks might take priority. + isSourceTick SourceNote {} = True + isSourceTick _ = False + -- Collect ticks from all blocks inside the tick scope. + -- We attempt to filter out duplicates while we're at it. + ticks = nubBy (flip tickishContains) $ + bCtxsTicks bctxs ++ ticksToCopy scope + stick = case filter isSourceTick ticks of + [] -> cstick + sticks -> Just $! bestSrcTick (sticks ++ maybeToList cstick) + +-- | Build a map of blocks sorted by their tick scopes +-- +-- This involves a pre-order traversal, as we want blocks in rough +-- control flow order (so ticks have a chance to be sorted in the +-- right order). +blockContexts :: RawCmmGroup -> Map.Map CmmTickScope [BlockContext] +blockContexts decls = Map.map reverse $ foldr walkProc Map.empty decls + where walkProc :: RawCmmDecl + -> Map.Map CmmTickScope [BlockContext] + -> Map.Map CmmTickScope [BlockContext] + walkProc CmmData{} m = m + walkProc prc@(CmmProc _ _ _ graph) m + | mapNull blocks = m + | otherwise = snd $ walkBlock prc entry (emptyLbls, m) + where blocks = toBlockMap graph + entry = [mapFind (g_entry graph) blocks] + emptyLbls = setEmpty :: LabelSet + + walkBlock :: RawCmmDecl -> [Block CmmNode C C] + -> (LabelSet, Map.Map CmmTickScope [BlockContext]) + -> (LabelSet, Map.Map CmmTickScope [BlockContext]) + walkBlock _ [] c = c + walkBlock prc (block:blocks) (visited, m) + | lbl `setMember` visited + = walkBlock prc blocks (visited, m) + | otherwise + = walkBlock prc blocks $ + walkBlock prc succs + (lbl `setInsert` visited, + insertMulti scope (block, prc) m) + where CmmEntry lbl scope = firstNode block + (CmmProc _ _ _ graph) = prc + succs = map (flip mapFind (toBlockMap graph)) + (successors (lastNode block)) + mapFind = mapFindWithDefault (error "contextTree: block not found!") + +insertMulti :: Ord k => k -> a -> Map.Map k [a] -> Map.Map k [a] +insertMulti k v = Map.insertWith (const (v:)) k [v] + +cmmDebugLabels :: (i -> Bool) -> GenCmmGroup d g (ListGraph i) -> [Label] +cmmDebugLabels isMeta nats = seqList lbls lbls + where -- Find order in which procedures will be generated by the + -- back-end (that actually matters for DWARF generation). 
+ -- + -- Note that we might encounter blocks that are missing or only + -- consist of meta instructions -- we will declare them missing, + -- which will skip debug data generation without messing up the + -- block hierarchy. + lbls = map blockId $ filter (not . allMeta) $ concatMap getBlocks nats + getBlocks (CmmProc _ _ _ (ListGraph bs)) = bs + getBlocks _other = [] + allMeta (BasicBlock _ instrs) = all isMeta instrs + +-- | Sets position and unwind table fields in the debug block tree according to +-- native generated code. +cmmDebugLink :: [Label] -> LabelMap [UnwindPoint] + -> [DebugBlock] -> [DebugBlock] +cmmDebugLink labels unwindPts blocks = map link blocks + where blockPos :: LabelMap Int + blockPos = mapFromList $ flip zip [0..] labels + link block = block { dblPosition = mapLookup (dblLabel block) blockPos + , dblBlocks = map link (dblBlocks block) + , dblUnwind = fromMaybe mempty + $ mapLookup (dblLabel block) unwindPts + } + +-- | Converts debug blocks into a label map for easier lookups +debugToMap :: [DebugBlock] -> LabelMap DebugBlock +debugToMap = mapUnions . map go + where go b = mapInsert (dblLabel b) b $ mapUnions $ map go (dblBlocks b) + +{- +Note [What is this unwinding business?] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unwinding tables are a variety of debugging information used by debugging tools +to reconstruct the execution history of a program at runtime. These tables +consist of sets of "instructions", one set for every instruction in the program, +which describe how to reconstruct the state of the machine at the point where +the current procedure was called. For instance, consider the following annotated +pseudo-code, + + a_fun: + add rsp, 8 -- unwind: rsp = rsp - 8 + mov rax, 1 -- unwind: rax = unknown + call another_block + sub rsp, 8 -- unwind: rsp = rsp + +We see that attached to each instruction there is an "unwind" annotation, which +provides a relationship between each updated register and its value at the +time of entry to a_fun. This is the sort of information that allows gdb to give +you a stack backtrace given the execution state of your program. This +unwinding information is captured in various ways by various debug information +formats; in the case of DWARF (the only format supported by GHC) it is known as +Call Frame Information (CFI) and can be found in the .debug.frames section of +your object files. + +Currently we only bother to produce unwinding information for registers which +are necessary to reconstruct flow-of-execution. On x86_64 this includes $rbp +(which is the STG stack pointer) and $rsp (the C stack pointer). + +Let's consider how GHC would annotate a C-- program with unwinding information +with a typical C-- procedure as would come from the STG-to-Cmm code generator, + + entry() + { c2fe: + v :: P64 = R2; + if ((Sp + 8) - 32 < SpLim) (likely: False) goto c2ff; else goto c2fg; + c2ff: + R2 = v :: P64; + R1 = test_closure; + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; + c2fg: + I64[Sp - 8] = c2dD; + R1 = v :: P64; + Sp = Sp - 8; // Sp updated here + if (R1 & 7 != 0) goto c2dD; else goto c2dE; + c2dE: + call (I64[R1])(R1) returns to c2dD, args: 8, res: 8, upd: 8; + c2dD: + w :: P64 = R1; + Hp = Hp + 48; + if (Hp > HpLim) (likely: False) goto c2fj; else goto c2fi; + ... + }, + +Let's consider how this procedure will be decorated with unwind information +(largely by GHC.Cmm.LayoutStack). Naturally, when we enter the procedure `entry` the +value of Sp is no different from what it was at its call site. 
Therefore we will +add an `unwind` statement saying this at the beginning of its unwind-annotated +code, + + entry() + { c2fe: + unwind Sp = Just Sp + 0; + v :: P64 = R2; + if ((Sp + 8) - 32 < SpLim) (likely: False) goto c2ff; else goto c2fg; + +After c2fe we may pass to either c2ff or c2fg; let's first consider the +former. In this case there is nothing in particular that we need to do other +than reiterate what we already know about Sp, + + c2ff: + unwind Sp = Just Sp + 0; + R2 = v :: P64; + R1 = test_closure; + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; + +In contrast, c2fg updates Sp midway through its body. To ensure that unwinding +can happen correctly after this point we must include an unwind statement there, +in addition to the usual beginning-of-block statement, + + c2fg: + unwind Sp = Just Sp + 0; + I64[Sp - 8] = c2dD; + R1 = v :: P64; + Sp = Sp - 8; + unwind Sp = Just Sp + 8; + if (R1 & 7 != 0) goto c2dD; else goto c2dE; + +The remaining blocks are simple, + + c2dE: + unwind Sp = Just Sp + 8; + call (I64[R1])(R1) returns to c2dD, args: 8, res: 8, upd: 8; + c2dD: + unwind Sp = Just Sp + 8; + w :: P64 = R1; + Hp = Hp + 48; + if (Hp > HpLim) (likely: False) goto c2fj; else goto c2fi; + ... + }, + + +The flow of unwinding information through the compiler is a bit convoluted: + + * C-- begins life in StgToCmm without any unwind information. This is because we + haven't actually done any register assignment or stack layout yet, so there + is no need for unwind information. + + * GHC.Cmm.LayoutStack figures out how to layout each procedure's stack, and produces + appropriate unwinding nodes for each adjustment of the STG Sp register. + + * The unwind nodes are carried through the sinking pass. Currently this is + guaranteed not to invalidate unwind information since it won't touch stores + to Sp, but this will need revisiting if CmmSink gets smarter in the future. + + * Eventually we make it to the native code generator backend which can then + preserve the unwind nodes in its machine-specific instructions. In so doing + the backend can also modify or add unwinding information; this is necessary, + for instance, in the case of x86-64, where adjustment of $rsp may be + necessary during calls to native foreign code due to the native calling + convention. + + * The NCG then retrieves the final unwinding table for each block from the + backend with extractUnwindPoints. + + * This unwind information is converted to DebugBlocks by Debug.cmmDebugGen + + * These DebugBlocks are then converted to, e.g., DWARF unwinding tables + (by the Dwarf module) and emitted in the final object. + +See also: + Note [Unwinding information in the NCG] in AsmCodeGen, + Note [Unwind pseudo-instruction in Cmm], + Note [Debugging DWARF unwinding info]. + + +Note [Debugging DWARF unwinding info] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For debugging generated unwinding info I've found it most useful to dump the +disassembled binary with objdump -D and dump the debug info with +readelf --debug-dump=frames-interp. 
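+
+For example, for some object file Example.o (the file name here is
+illustrative), that means running:
+
+    $ objdump -D Example.o
+    $ readelf --debug-dump=frames-interp Example.o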
+ +You should get something like this: + + 0000000000000010 <stg_catch_frame_info>: + 10: 48 83 c5 18 add $0x18,%rbp + 14: ff 65 00 jmpq *0x0(%rbp) + +and: + + Contents of the .debug_frame section: + + 00000000 0000000000000014 ffffffff CIE "" cf=1 df=-8 ra=16 + LOC CFA rbp rsp ra + 0000000000000000 rbp+0 v+0 s c+0 + + 00000018 0000000000000024 00000000 FDE cie=00000000 pc=000000000000000f..0000000000000017 + LOC CFA rbp rsp ra + 000000000000000f rbp+0 v+0 s c+0 + 000000000000000f rbp+24 v+0 s c+0 + +To read it http://www.dwarfstd.org/doc/dwarf-2.0.0.pdf has a nice example in +Appendix 5 (page 101 of the pdf) and more details in the relevant section. + +The key thing to keep in mind is that the value at LOC is the value from +*before* the instruction at LOC executes. In other words it answers the +question: if my $rip is at LOC, how do I get the relevant values given the +values obtained through unwinding so far. + +If the readelf --debug-dump=frames-interp output looks wrong, it may also be +useful to look at readelf --debug-dump=frames, which is closer to the +information that GHC generated. + +It's also useful to dump the relevant Cmm with -ddump-cmm -ddump-opt-cmm +-ddump-cmm-proc -ddump-cmm-verbose. Note [Unwind pseudo-instruction in Cmm] +explains how to interpret it. + +Inside gdb there are a couple useful commands for inspecting frames. +For example: + + gdb> info frame <num> + +It shows the values of registers obtained through unwinding. + +Another useful thing to try when debugging the DWARF unwinding is to enable +extra debugging output in GDB: + + gdb> set debug frame 1 + +This makes GDB produce a trace of its internal workings. Having gone this far, +it's just a tiny step to run GDB in GDB. Make sure you install debugging +symbols for gdb if you obtain it through a package manager. + +Keep in mind that the current release of GDB has an instruction pointer handling +heuristic that works well for C-like languages, but doesn't always work for +Haskell. See Note [Info Offset] in Dwarf.Types for more details. + +Note [Unwind pseudo-instruction in Cmm] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the possible CmmNodes is a CmmUnwind pseudo-instruction. It doesn't +generate any assembly, but controls what DWARF unwinding information gets +generated. + +It's important to understand what ranges of code the unwind pseudo-instruction +refers to. +For a sequence of CmmNodes like: + + A // starts at addr X and ends at addr Y-1 + unwind Sp = Just Sp + 16; + B // starts at addr Y and ends at addr Z + +the unwind statement reflects the state after A has executed, but before B +has executed. If you consult the Note [Debugging DWARF unwinding info], the +LOC this information will end up in is Y. +-} + +-- | A label associated with an 'UnwindTable' +data UnwindPoint = UnwindPoint !CLabel !UnwindTable + +instance Outputable UnwindPoint where + ppr (UnwindPoint lbl uws) = + braces $ ppr lbl<>colon + <+> hsep (punctuate comma $ map pprUw $ Map.toList uws) + where + pprUw (g, expr) = ppr g <> char '=' <> ppr expr + +-- | Maps registers to expressions that yield their "old" values +-- further up the stack. Most interesting for the stack pointer @Sp@, +-- but might be useful to document saved registers, too. Note that a +-- register's value will be 'Nothing' when the register's previous +-- value cannot be reconstructed. 
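+--
+-- As an illustrative example (using the constructors defined just below):
+-- after an adjustment @Sp = Sp - 8@, the caller's Sp is the current Sp
+-- plus 8, so the table would map @Sp@ to @Just (UwReg Sp 8)@.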
+type UnwindTable = Map.Map GlobalReg (Maybe UnwindExpr) + +-- | Expressions, used for unwind information +data UnwindExpr = UwConst !Int -- ^ literal value + | UwReg !GlobalReg !Int -- ^ register plus offset + | UwDeref UnwindExpr -- ^ pointer dereferencing + | UwLabel CLabel + | UwPlus UnwindExpr UnwindExpr + | UwMinus UnwindExpr UnwindExpr + | UwTimes UnwindExpr UnwindExpr + deriving (Eq) + +instance Outputable UnwindExpr where + pprPrec _ (UwConst i) = ppr i + pprPrec _ (UwReg g 0) = ppr g + pprPrec p (UwReg g x) = pprPrec p (UwPlus (UwReg g 0) (UwConst x)) + pprPrec _ (UwDeref e) = char '*' <> pprPrec 3 e + pprPrec _ (UwLabel l) = pprPrec 3 l + pprPrec p (UwPlus e0 e1) | p <= 0 + = pprPrec 0 e0 <> char '+' <> pprPrec 0 e1 + pprPrec p (UwMinus e0 e1) | p <= 0 + = pprPrec 1 e0 <> char '-' <> pprPrec 1 e1 + pprPrec p (UwTimes e0 e1) | p <= 1 + = pprPrec 2 e0 <> char '*' <> pprPrec 2 e1 + pprPrec _ other = parens (pprPrec 0 other) + +-- | Conversion of Cmm expressions to unwind expressions. We check for +-- unsupported operator usages and simplify the expression as far as +-- possible. +toUnwindExpr :: CmmExpr -> UnwindExpr +toUnwindExpr (CmmLit (CmmInt i _)) = UwConst (fromIntegral i) +toUnwindExpr (CmmLit (CmmLabel l)) = UwLabel l +toUnwindExpr (CmmRegOff (CmmGlobal g) i) = UwReg g i +toUnwindExpr (CmmReg (CmmGlobal g)) = UwReg g 0 +toUnwindExpr (CmmLoad e _) = UwDeref (toUnwindExpr e) +toUnwindExpr e@(CmmMachOp op [e1, e2]) = + case (op, toUnwindExpr e1, toUnwindExpr e2) of + (MO_Add{}, UwReg r x, UwConst y) -> UwReg r (x + y) + (MO_Sub{}, UwReg r x, UwConst y) -> UwReg r (x - y) + (MO_Add{}, UwConst x, UwReg r y) -> UwReg r (x + y) + (MO_Add{}, UwConst x, UwConst y) -> UwConst (x + y) + (MO_Sub{}, UwConst x, UwConst y) -> UwConst (x - y) + (MO_Mul{}, UwConst x, UwConst y) -> UwConst (x * y) + (MO_Add{}, u1, u2 ) -> UwPlus u1 u2 + (MO_Sub{}, u1, u2 ) -> UwMinus u1 u2 + (MO_Mul{}, u1, u2 ) -> UwTimes u1 u2 + _otherwise -> pprPanic "Unsupported operator in unwind expression!" + (pprExpr e) +toUnwindExpr e + = pprPanic "Unsupported unwind expression!" (ppr e) diff --git a/compiler/GHC/Cmm/Expr.hs b/compiler/GHC/Cmm/Expr.hs new file mode 100644 index 0000000000..3b4f0156a0 --- /dev/null +++ b/compiler/GHC/Cmm/Expr.hs @@ -0,0 +1,619 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE UndecidableInstances #-} + +module GHC.Cmm.Expr + ( CmmExpr(..), cmmExprType, cmmExprWidth, cmmExprAlignment, maybeInvertCmmExpr + , CmmReg(..), cmmRegType, cmmRegWidth + , CmmLit(..), cmmLitType + , LocalReg(..), localRegType + , GlobalReg(..), isArgReg, globalRegType + , spReg, hpReg, spLimReg, hpLimReg, nodeReg + , currentTSOReg, currentNurseryReg, hpAllocReg, cccsReg + , node, baseReg + , VGcPtr(..) + + , DefinerOfRegs, UserOfRegs + , foldRegsDefd, foldRegsUsed + , foldLocalRegsDefd, foldLocalRegsUsed + + , RegSet, LocalRegSet, GlobalRegSet + , emptyRegSet, elemRegSet, extendRegSet, deleteFromRegSet, mkRegSet + , plusRegSet, minusRegSet, timesRegSet, sizeRegSet, nullRegSet + , regSetToList + + , Area(..) 
+    , module GHC.Cmm.MachOp
+    , module GHC.Cmm.Type
+    )
+where
+
+import GhcPrelude
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.MachOp
+import GHC.Cmm.Type
+import DynFlags
+import Outputable (panic)
+import Unique
+
+import Data.Set (Set)
+import qualified Data.Set as Set
+
+import BasicTypes (Alignment, mkAlignment, alignmentOf)
+
+-----------------------------------------------------------------------------
+-- CmmExpr
+-- An expression. Expressions have no side effects.
+-----------------------------------------------------------------------------
+
+data CmmExpr
+  = CmmLit CmmLit               -- Literal
+  | CmmLoad !CmmExpr !CmmType   -- Read memory location
+  | CmmReg !CmmReg              -- Contents of register
+  | CmmMachOp MachOp [CmmExpr]  -- Machine operation (+, -, *, etc.)
+  | CmmStackSlot Area {-# UNPACK #-} !Int
+                                -- addressing expression of a stack slot
+                                -- See Note [CmmStackSlot aliasing]
+  | CmmRegOff !CmmReg Int
+        -- CmmRegOff reg i
+        --        ** is shorthand only, meaning **
+        -- CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
+        --      where rep = typeWidth (cmmRegType reg)
+
+instance Eq CmmExpr where       -- Equality ignores the types
+  CmmLit l1          == CmmLit l2          = l1==l2
+  CmmLoad e1 _       == CmmLoad e2 _       = e1==e2
+  CmmReg r1          == CmmReg r2          = r1==r2
+  CmmRegOff r1 i1    == CmmRegOff r2 i2    = r1==r2 && i1==i2
+  CmmMachOp op1 es1  == CmmMachOp op2 es2  = op1==op2 && es1==es2
+  CmmStackSlot a1 i1 == CmmStackSlot a2 i2 = a1==a2 && i1==i2
+  _e1                == _e2                = False
+
+data CmmReg
+  = CmmLocal  {-# UNPACK #-} !LocalReg
+  | CmmGlobal GlobalReg
+  deriving( Eq, Ord )
+
+-- | A stack area is either the stack slot where a variable is spilled
+-- or the stack space where function arguments and results are passed.
+data Area
+  = Old            -- See Note [Old Area]
+  | Young {-# UNPACK #-} !BlockId  -- Invariant: must be a continuation BlockId
+                   -- See Note [Continuation BlockId] in GHC.Cmm.Node.
+  deriving (Eq, Ord)
+
+{- Note [Old Area]
+~~~~~~~~~~~~~~~~~~
+There is a single call area 'Old', allocated at the extreme old
+end of the stack frame (i.e. just younger than the return address)
+which holds:
+  * incoming (overflow) parameters,
+  * outgoing (overflow) parameters to tail calls,
+  * outgoing (overflow) result values
+  * the update frame (if any)
+
+Its size is the max of all these requirements. On entry, the stack
+pointer will point to the youngest incoming parameter, which is not
+necessarily at the young end of the Old area.
+
+End of note -}
+
+
+{- Note [CmmStackSlot aliasing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When do two CmmStackSlots alias?
+
+ - T[old+N] aliases with U[young(L)+M] for all T, U, L, N and M
+ - T[old+N] aliases with U[old+M] only if the areas actually overlap
+
+Or more informally, different Areas may overlap with each other.
+
+An alternative semantics, that we previously had, was that different
+Areas do not overlap. The problem that led to redefining the
+semantics of stack areas is described below.
+
+e.g. if we had
+
+    x = Sp[old + 8]
+    y = Sp[old + 16]
+
+    Sp[young(L) + 8]  = L
+    Sp[young(L) + 16] = y
+    Sp[young(L) + 24] = x
+    call f() returns to L
+
+if areas semantically do not overlap, then we might optimise this to
+
+    Sp[young(L) + 8]  = L
+    Sp[young(L) + 16] = Sp[old + 8]
+    Sp[young(L) + 24] = Sp[old + 16]
+    call f() returns to L
+
+and now young(L) cannot be allocated at the same place as old, and we
+are doomed to use more stack. 
+ + - old+8 conflicts with young(L)+8 + - old+16 conflicts with young(L)+16 and young(L)+8 + +so young(L)+8 == old+24 and we get + + Sp[-8] = L + Sp[-16] = Sp[8] + Sp[-24] = Sp[0] + Sp -= 24 + call f() returns to L + +However, if areas are defined to be "possibly overlapping" in the +semantics, then we cannot commute any loads/stores of old with +young(L), and we will be able to re-use both old+8 and old+16 for +young(L). + + x = Sp[8] + y = Sp[0] + + Sp[8] = L + Sp[0] = y + Sp[-8] = x + Sp = Sp - 8 + call f() returns to L + +Now, the assignments of y go away, + + x = Sp[8] + Sp[8] = L + Sp[-8] = x + Sp = Sp - 8 + call f() returns to L +-} + +data CmmLit + = CmmInt !Integer Width + -- Interpretation: the 2's complement representation of the value + -- is truncated to the specified size. This is easier than trying + -- to keep the value within range, because we don't know whether + -- it will be used as a signed or unsigned value (the CmmType doesn't + -- distinguish between signed & unsigned). + | CmmFloat Rational Width + | CmmVec [CmmLit] -- Vector literal + | CmmLabel CLabel -- Address of label + | CmmLabelOff CLabel Int -- Address of label + byte offset + + -- Due to limitations in the C backend, the following + -- MUST ONLY be used inside the info table indicated by label2 + -- (label2 must be the info label), and label1 must be an + -- SRT, a slow entrypoint or a large bitmap (see the Mangler) + -- Don't use it at all unless tablesNextToCode. + -- It is also used inside the NCG during when generating + -- position-independent code. + | CmmLabelDiffOff CLabel CLabel Int Width -- label1 - label2 + offset + -- In an expression, the width just has the effect of MO_SS_Conv + -- from wordWidth to the desired width. + -- + -- In a static literal, the supported Widths depend on the + -- architecture: wordWidth is supported on all + -- architectures. Additionally W32 is supported on x86_64 when + -- using the small memory model. + + | CmmBlock {-# UNPACK #-} !BlockId -- Code label + -- Invariant: must be a continuation BlockId + -- See Note [Continuation BlockId] in GHC.Cmm.Node. + + | CmmHighStackMark -- A late-bound constant that stands for the max + -- #bytes of stack space used during a procedure. 
+ -- During the stack-layout pass, CmmHighStackMark + -- is replaced by a CmmInt for the actual number + -- of bytes used + deriving Eq + +cmmExprType :: DynFlags -> CmmExpr -> CmmType +cmmExprType dflags (CmmLit lit) = cmmLitType dflags lit +cmmExprType _ (CmmLoad _ rep) = rep +cmmExprType dflags (CmmReg reg) = cmmRegType dflags reg +cmmExprType dflags (CmmMachOp op args) = machOpResultType dflags op (map (cmmExprType dflags) args) +cmmExprType dflags (CmmRegOff reg _) = cmmRegType dflags reg +cmmExprType dflags (CmmStackSlot _ _) = bWord dflags -- an address +-- Careful though: what is stored at the stack slot may be bigger than +-- an address + +cmmLitType :: DynFlags -> CmmLit -> CmmType +cmmLitType _ (CmmInt _ width) = cmmBits width +cmmLitType _ (CmmFloat _ width) = cmmFloat width +cmmLitType _ (CmmVec []) = panic "cmmLitType: CmmVec []" +cmmLitType cflags (CmmVec (l:ls)) = let ty = cmmLitType cflags l + in if all (`cmmEqType` ty) (map (cmmLitType cflags) ls) + then cmmVec (1+length ls) ty + else panic "cmmLitType: CmmVec" +cmmLitType dflags (CmmLabel lbl) = cmmLabelType dflags lbl +cmmLitType dflags (CmmLabelOff lbl _) = cmmLabelType dflags lbl +cmmLitType _ (CmmLabelDiffOff _ _ _ width) = cmmBits width +cmmLitType dflags (CmmBlock _) = bWord dflags +cmmLitType dflags (CmmHighStackMark) = bWord dflags + +cmmLabelType :: DynFlags -> CLabel -> CmmType +cmmLabelType dflags lbl + | isGcPtrLabel lbl = gcWord dflags + | otherwise = bWord dflags + +cmmExprWidth :: DynFlags -> CmmExpr -> Width +cmmExprWidth dflags e = typeWidth (cmmExprType dflags e) + +-- | Returns an alignment in bytes of a CmmExpr when it's a statically +-- known integer constant, otherwise returns an alignment of 1 byte. +-- The caller is responsible for using with a sensible CmmExpr +-- argument. +cmmExprAlignment :: CmmExpr -> Alignment +cmmExprAlignment (CmmLit (CmmInt intOff _)) = alignmentOf (fromInteger intOff) +cmmExprAlignment _ = mkAlignment 1 +-------- +--- Negation for conditional branches + +maybeInvertCmmExpr :: CmmExpr -> Maybe CmmExpr +maybeInvertCmmExpr (CmmMachOp op args) = do op' <- maybeInvertComparison op + return (CmmMachOp op' args) +maybeInvertCmmExpr _ = Nothing + +----------------------------------------------------------------------------- +-- Local registers +----------------------------------------------------------------------------- + +data LocalReg + = LocalReg {-# UNPACK #-} !Unique CmmType + -- ^ Parameters: + -- 1. Identifier + -- 2. Type + +instance Eq LocalReg where + (LocalReg u1 _) == (LocalReg u2 _) = u1 == u2 + +-- This is non-deterministic but we do not currently support deterministic +-- code-generation. See Note [Unique Determinism and code generation] +-- See Note [No Ord for Unique] +instance Ord LocalReg where + compare (LocalReg u1 _) (LocalReg u2 _) = nonDetCmpUnique u1 u2 + +instance Uniquable LocalReg where + getUnique (LocalReg uniq _) = uniq + +cmmRegType :: DynFlags -> CmmReg -> CmmType +cmmRegType _ (CmmLocal reg) = localRegType reg +cmmRegType dflags (CmmGlobal reg) = globalRegType dflags reg + +cmmRegWidth :: DynFlags -> CmmReg -> Width +cmmRegWidth dflags = typeWidth . 
cmmRegType dflags + +localRegType :: LocalReg -> CmmType +localRegType (LocalReg _ rep) = rep + +----------------------------------------------------------------------------- +-- Register-use information for expressions and other types +----------------------------------------------------------------------------- + +-- | Sets of registers + +-- These are used for dataflow facts, and a common operation is taking +-- the union of two RegSets and then asking whether the union is the +-- same as one of the inputs. UniqSet isn't good here, because +-- sizeUniqSet is O(n) whereas Set.size is O(1), so we use ordinary +-- Sets. + +type RegSet r = Set r +type LocalRegSet = RegSet LocalReg +type GlobalRegSet = RegSet GlobalReg + +emptyRegSet :: RegSet r +nullRegSet :: RegSet r -> Bool +elemRegSet :: Ord r => r -> RegSet r -> Bool +extendRegSet :: Ord r => RegSet r -> r -> RegSet r +deleteFromRegSet :: Ord r => RegSet r -> r -> RegSet r +mkRegSet :: Ord r => [r] -> RegSet r +minusRegSet, plusRegSet, timesRegSet :: Ord r => RegSet r -> RegSet r -> RegSet r +sizeRegSet :: RegSet r -> Int +regSetToList :: RegSet r -> [r] + +emptyRegSet = Set.empty +nullRegSet = Set.null +elemRegSet = Set.member +extendRegSet = flip Set.insert +deleteFromRegSet = flip Set.delete +mkRegSet = Set.fromList +minusRegSet = Set.difference +plusRegSet = Set.union +timesRegSet = Set.intersection +sizeRegSet = Set.size +regSetToList = Set.toList + +class Ord r => UserOfRegs r a where + foldRegsUsed :: DynFlags -> (b -> r -> b) -> b -> a -> b + +foldLocalRegsUsed :: UserOfRegs LocalReg a + => DynFlags -> (b -> LocalReg -> b) -> b -> a -> b +foldLocalRegsUsed = foldRegsUsed + +class Ord r => DefinerOfRegs r a where + foldRegsDefd :: DynFlags -> (b -> r -> b) -> b -> a -> b + +foldLocalRegsDefd :: DefinerOfRegs LocalReg a + => DynFlags -> (b -> LocalReg -> b) -> b -> a -> b +foldLocalRegsDefd = foldRegsDefd + +instance UserOfRegs LocalReg CmmReg where + foldRegsUsed _ f z (CmmLocal reg) = f z reg + foldRegsUsed _ _ z (CmmGlobal _) = z + +instance DefinerOfRegs LocalReg CmmReg where + foldRegsDefd _ f z (CmmLocal reg) = f z reg + foldRegsDefd _ _ z (CmmGlobal _) = z + +instance UserOfRegs GlobalReg CmmReg where + foldRegsUsed _ _ z (CmmLocal _) = z + foldRegsUsed _ f z (CmmGlobal reg) = f z reg + +instance DefinerOfRegs GlobalReg CmmReg where + foldRegsDefd _ _ z (CmmLocal _) = z + foldRegsDefd _ f z (CmmGlobal reg) = f z reg + +instance Ord r => UserOfRegs r r where + foldRegsUsed _ f z r = f z r + +instance Ord r => DefinerOfRegs r r where + foldRegsDefd _ f z r = f z r + +instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r CmmExpr where + -- The (Ord r) in the context is necessary here + -- See Note [Recursive superclasses] in TcInstDcls + foldRegsUsed dflags f !z e = expr z e + where expr z (CmmLit _) = z + expr z (CmmLoad addr _) = foldRegsUsed dflags f z addr + expr z (CmmReg r) = foldRegsUsed dflags f z r + expr z (CmmMachOp _ exprs) = foldRegsUsed dflags f z exprs + expr z (CmmRegOff r _) = foldRegsUsed dflags f z r + expr z (CmmStackSlot _ _) = z + +instance UserOfRegs r a => UserOfRegs r [a] where + foldRegsUsed dflags f set as = foldl' (foldRegsUsed dflags f) set as + {-# INLINABLE foldRegsUsed #-} + +instance DefinerOfRegs r a => DefinerOfRegs r [a] where + foldRegsDefd dflags f set as = foldl' (foldRegsDefd dflags f) set as + {-# INLINABLE foldRegsDefd #-} + +----------------------------------------------------------------------------- +-- Global STG registers 
+-----------------------------------------------------------------------------
+
+data VGcPtr = VGcPtr | VNonGcPtr deriving( Eq, Show )
+
+-----------------------------------------------------------------------------
+-- Global STG registers
+-----------------------------------------------------------------------------
+{-
+Note [Overlapping global registers]
+
+The backend might not faithfully implement the abstraction of the STG
+machine with independent registers for different values of type
+GlobalReg. Specifically, certain pairs of registers (r1, r2) may
+overlap in the sense that a store to r1 invalidates the value in r2,
+and vice versa.
+
+Currently this occurs only on the x86_64 architecture where FloatReg n
+and DoubleReg n are assigned the same microarchitectural register, in
+order to allow functions to receive more Float# or Double# arguments
+in registers (as opposed to on the stack).
+
+There are no specific rules about which registers might overlap with
+which other registers, but presumably it's safe to assume that nothing
+will overlap with special registers like Sp or BaseReg.
+
+Use GHC.Cmm.Utils.regsOverlap to determine whether two GlobalRegs overlap
+on a particular platform. The instance Eq GlobalReg is syntactic
+equality of STG registers and does not take overlap into
+account. However it is still used in UserOfRegs/DefinerOfRegs and
+there are likely still bugs there, beware!
+-}
+
+data GlobalReg
+  -- Argument and return registers
+  = VanillaReg          -- pointers, unboxed ints and chars
+        {-# UNPACK #-} !Int     -- its number
+        VGcPtr
+
+  | FloatReg            -- single-precision floating-point registers
+        {-# UNPACK #-} !Int     -- its number
+
+  | DoubleReg           -- double-precision floating-point registers
+        {-# UNPACK #-} !Int     -- its number
+
+  | LongReg             -- long int registers (64-bit, really)
+        {-# UNPACK #-} !Int     -- its number
+
+  | XmmReg              -- 128-bit SIMD vector register
+        {-# UNPACK #-} !Int     -- its number
+
+  | YmmReg              -- 256-bit SIMD vector register
+        {-# UNPACK #-} !Int     -- its number
+
+  | ZmmReg              -- 512-bit SIMD vector register
+        {-# UNPACK #-} !Int     -- its number
+
+  -- STG registers
+  | Sp                  -- Stack ptr; points to last occupied stack location.
+  | SpLim               -- Stack limit
+  | Hp                  -- Heap ptr; points to last occupied heap location.
+  | HpLim               -- Heap limit register
+  | CCCS                -- Current cost-centre stack
+  | CurrentTSO          -- pointer to current thread's TSO
+  | CurrentNursery      -- pointer to allocation area
+  | HpAlloc             -- allocation count for heap check failure
+
+  -- We keep the address of some commonly-called
+  -- functions in the register table, to keep code
+  -- size down:
+  | EagerBlackholeInfo  -- stg_EAGER_BLACKHOLE_info
+  | GCEnter1            -- stg_gc_enter_1
+  | GCFun               -- stg_gc_fun
+
+  -- Base offset for the register table, used for accessing registers
+  -- which do not have real registers assigned to them. This register
+  -- will only appear after we have expanded GlobalReg into memory accesses
+  -- (where necessary) in the native code generator.
+  | BaseReg
+
+  -- The register used by the platform for the C stack pointer. This is
+  -- a break in the STG abstraction used exclusively to set up stack unwinding
+  -- information.
+  | MachSp
+
+  -- This is a dummy register used to indicate to the stack unwinder where
+  -- a routine would return to.
+  | UnwindReturnReg
+
+  -- Base Register for PIC (position-independent code) calculations
+  -- Only used inside the native code generator. Its exact meaning differs
+  -- from platform to platform (see module PositionIndependentCode). 
+ | PicBaseReg + + deriving( Show ) + +instance Eq GlobalReg where + VanillaReg i _ == VanillaReg j _ = i==j -- Ignore type when seeking clashes + FloatReg i == FloatReg j = i==j + DoubleReg i == DoubleReg j = i==j + LongReg i == LongReg j = i==j + -- NOTE: XMM, YMM, ZMM registers actually are the same registers + -- at least with respect to store at YMM i and then read from XMM i + -- and similarly for ZMM etc. + XmmReg i == XmmReg j = i==j + YmmReg i == YmmReg j = i==j + ZmmReg i == ZmmReg j = i==j + Sp == Sp = True + SpLim == SpLim = True + Hp == Hp = True + HpLim == HpLim = True + CCCS == CCCS = True + CurrentTSO == CurrentTSO = True + CurrentNursery == CurrentNursery = True + HpAlloc == HpAlloc = True + EagerBlackholeInfo == EagerBlackholeInfo = True + GCEnter1 == GCEnter1 = True + GCFun == GCFun = True + BaseReg == BaseReg = True + MachSp == MachSp = True + UnwindReturnReg == UnwindReturnReg = True + PicBaseReg == PicBaseReg = True + _r1 == _r2 = False + +instance Ord GlobalReg where + compare (VanillaReg i _) (VanillaReg j _) = compare i j + -- Ignore type when seeking clashes + compare (FloatReg i) (FloatReg j) = compare i j + compare (DoubleReg i) (DoubleReg j) = compare i j + compare (LongReg i) (LongReg j) = compare i j + compare (XmmReg i) (XmmReg j) = compare i j + compare (YmmReg i) (YmmReg j) = compare i j + compare (ZmmReg i) (ZmmReg j) = compare i j + compare Sp Sp = EQ + compare SpLim SpLim = EQ + compare Hp Hp = EQ + compare HpLim HpLim = EQ + compare CCCS CCCS = EQ + compare CurrentTSO CurrentTSO = EQ + compare CurrentNursery CurrentNursery = EQ + compare HpAlloc HpAlloc = EQ + compare EagerBlackholeInfo EagerBlackholeInfo = EQ + compare GCEnter1 GCEnter1 = EQ + compare GCFun GCFun = EQ + compare BaseReg BaseReg = EQ + compare MachSp MachSp = EQ + compare UnwindReturnReg UnwindReturnReg = EQ + compare PicBaseReg PicBaseReg = EQ + compare (VanillaReg _ _) _ = LT + compare _ (VanillaReg _ _) = GT + compare (FloatReg _) _ = LT + compare _ (FloatReg _) = GT + compare (DoubleReg _) _ = LT + compare _ (DoubleReg _) = GT + compare (LongReg _) _ = LT + compare _ (LongReg _) = GT + compare (XmmReg _) _ = LT + compare _ (XmmReg _) = GT + compare (YmmReg _) _ = LT + compare _ (YmmReg _) = GT + compare (ZmmReg _) _ = LT + compare _ (ZmmReg _) = GT + compare Sp _ = LT + compare _ Sp = GT + compare SpLim _ = LT + compare _ SpLim = GT + compare Hp _ = LT + compare _ Hp = GT + compare HpLim _ = LT + compare _ HpLim = GT + compare CCCS _ = LT + compare _ CCCS = GT + compare CurrentTSO _ = LT + compare _ CurrentTSO = GT + compare CurrentNursery _ = LT + compare _ CurrentNursery = GT + compare HpAlloc _ = LT + compare _ HpAlloc = GT + compare GCEnter1 _ = LT + compare _ GCEnter1 = GT + compare GCFun _ = LT + compare _ GCFun = GT + compare BaseReg _ = LT + compare _ BaseReg = GT + compare MachSp _ = LT + compare _ MachSp = GT + compare UnwindReturnReg _ = LT + compare _ UnwindReturnReg = GT + compare EagerBlackholeInfo _ = LT + compare _ EagerBlackholeInfo = GT + +-- convenient aliases +baseReg, spReg, hpReg, spLimReg, hpLimReg, nodeReg, + currentTSOReg, currentNurseryReg, hpAllocReg, cccsReg :: CmmReg +baseReg = CmmGlobal BaseReg +spReg = CmmGlobal Sp +hpReg = CmmGlobal Hp +hpLimReg = CmmGlobal HpLim +spLimReg = CmmGlobal SpLim +nodeReg = CmmGlobal node +currentTSOReg = CmmGlobal CurrentTSO +currentNurseryReg = CmmGlobal CurrentNursery +hpAllocReg = CmmGlobal HpAlloc +cccsReg = CmmGlobal CCCS + +node :: GlobalReg +node = VanillaReg 1 VGcPtr + +globalRegType :: DynFlags -> GlobalReg -> 
CmmType
+globalRegType dflags (VanillaReg _ VGcPtr) = gcWord dflags
+globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
+globalRegType _ (FloatReg _) = cmmFloat W32
+globalRegType _ (DoubleReg _) = cmmFloat W64
+globalRegType _ (LongReg _) = cmmBits W64
+-- TODO: improve the internal model of SIMD/vectorized registers
+-- the right design SHOULD improve handling of float and double code too.
+-- see remarks in "NOTE [SIMD Design for the future]" in GHC.StgToCmm.Prim
+globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32)
+globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32)
+globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32)
+
+globalRegType dflags Hp = gcWord dflags
+ -- The initialiser for all
+ -- dynamically allocated closures
+globalRegType dflags _ = bWord dflags
+
+isArgReg :: GlobalReg -> Bool
+isArgReg (VanillaReg {}) = True
+isArgReg (FloatReg {}) = True
+isArgReg (DoubleReg {}) = True
+isArgReg (LongReg {}) = True
+isArgReg (XmmReg {}) = True
+isArgReg (YmmReg {}) = True
+isArgReg (ZmmReg {}) = True
+isArgReg _ = False
diff --git a/compiler/GHC/Cmm/Graph.hs b/compiler/GHC/Cmm/Graph.hs
new file mode 100644
index 0000000000..8d19e7fdb9
--- /dev/null
+++ b/compiler/GHC/Cmm/Graph.hs
@@ -0,0 +1,484 @@
+{-# LANGUAGE BangPatterns, GADTs #-}
+
+module GHC.Cmm.Graph
+ ( CmmAGraph, CmmAGraphScoped, CgStmt(..)
+ , (<*>), catAGraphs
+ , mkLabel, mkMiddle, mkLast, outOfLine
+ , lgraphOfAGraph, labelAGraph
+
+ , stackStubExpr
+ , mkNop, mkAssign, mkStore
+ , mkUnsafeCall, mkFinalCall, mkCallReturnsTo
+ , mkJumpReturnsTo
+ , mkJump, mkJumpExtra
+ , mkRawJump
+ , mkCbranch, mkSwitch
+ , mkReturn, mkComment, mkCallEntry, mkBranch
+ , mkUnwind
+ , copyInOflow, copyOutOflow
+ , noExtraStack
+ , toCall, Transfer(..)
+ )
+where
+
+import GhcPrelude hiding ( (<*>) ) -- avoid importing (<*>)
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.CallConv
+import GHC.Cmm.Switch (SwitchTargets)
+
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import DynFlags
+import FastString
+import ForeignCall
+import OrdList
+import GHC.Runtime.Layout (ByteOff)
+import UniqSupply
+import Util
+import Panic
+
+
+-----------------------------------------------------------------------------
+-- Building Graphs
+
+
+-- | CmmAGraph is a chunk of code consisting of:
+--
+-- * ordinary statements (assignments, stores etc.)
+-- * jumps
+-- * labels
+-- * out-of-line labelled blocks
+--
+-- The semantics is that control falls through labels and out-of-line
+-- blocks. Everything after a jump up to the next label is by
+-- definition unreachable code, and will be discarded.
+--
+-- Two CmmAGraphs can be stuck together with <*>, with the meaning that
+-- control flows from the first to the second.
+--
+-- A 'CmmAGraph' can be turned into a 'CmmGraph' (closed at both ends)
+-- by providing a label for the entry point and a tick scope; see
+-- 'labelAGraph'.
+type CmmAGraph = OrdList CgStmt
+-- | Unlabeled graph with tick scope
+type CmmAGraphScoped = (CmmAGraph, CmmTickScope)
+
+data CgStmt
+ = CgLabel BlockId CmmTickScope
+ | CgStmt (CmmNode O O)
+ | CgLast (CmmNode O C)
+ | CgFork BlockId CmmAGraph CmmTickScope
+
+flattenCmmAGraph :: BlockId -> CmmAGraphScoped -> CmmGraph
+flattenCmmAGraph id (stmts_t, tscope) =
+ CmmGraph { g_entry = id,
+ g_graph = GMany NothingO body NothingO }
+ where
+ body = foldr addBlock emptyBody $ flatten id stmts_t tscope []
+
+ --
+ -- flatten: given an entry label and a CmmAGraph, make a list of blocks.
+ --
+ -- NB.
avoid the quadratic-append trap by passing in the tail of the + -- list. This is important for Very Long Functions (e.g. in T783). + -- + flatten :: Label -> CmmAGraph -> CmmTickScope -> [Block CmmNode C C] + -> [Block CmmNode C C] + flatten id g tscope blocks + = flatten1 (fromOL g) block' blocks + where !block' = blockJoinHead (CmmEntry id tscope) emptyBlock + -- + -- flatten0: we are outside a block at this point: any code before + -- the first label is unreachable, so just drop it. + -- + flatten0 :: [CgStmt] -> [Block CmmNode C C] -> [Block CmmNode C C] + flatten0 [] blocks = blocks + + flatten0 (CgLabel id tscope : stmts) blocks + = flatten1 stmts block blocks + where !block = blockJoinHead (CmmEntry id tscope) emptyBlock + + flatten0 (CgFork fork_id stmts_t tscope : rest) blocks + = flatten fork_id stmts_t tscope $ flatten0 rest blocks + + flatten0 (CgLast _ : stmts) blocks = flatten0 stmts blocks + flatten0 (CgStmt _ : stmts) blocks = flatten0 stmts blocks + + -- + -- flatten1: we have a partial block, collect statements until the + -- next last node to make a block, then call flatten0 to get the rest + -- of the blocks + -- + flatten1 :: [CgStmt] -> Block CmmNode C O + -> [Block CmmNode C C] -> [Block CmmNode C C] + + -- The current block falls through to the end of a function or fork: + -- this code should not be reachable, but it may be referenced by + -- other code that is not reachable. We'll remove it later with + -- dead-code analysis, but for now we have to keep the graph + -- well-formed, so we terminate the block with a branch to the + -- beginning of the current block. + flatten1 [] block blocks + = blockJoinTail block (CmmBranch (entryLabel block)) : blocks + + flatten1 (CgLast stmt : stmts) block blocks + = block' : flatten0 stmts blocks + where !block' = blockJoinTail block stmt + + flatten1 (CgStmt stmt : stmts) block blocks + = flatten1 stmts block' blocks + where !block' = blockSnoc block stmt + + flatten1 (CgFork fork_id stmts_t tscope : rest) block blocks + = flatten fork_id stmts_t tscope $ flatten1 rest block blocks + + -- a label here means that we should start a new block, and the + -- current block should fall through to the new block. 
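+ --
+ -- For example (an illustrative sketch, not code from this patch):
+ -- flattening [CgStmt s, CgLabel l2 scp, CgLast x] from entry l1
+ -- produces two blocks,
+ --
+ -- l1: s; goto l2
+ -- l2: x
+ --
+ -- where the explicit branch to l2 is the CmmBranch added below.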
+ flatten1 (CgLabel id tscp : stmts) block blocks + = blockJoinTail block (CmmBranch id) : + flatten1 stmts (blockJoinHead (CmmEntry id tscp) emptyBlock) blocks + + + +---------- AGraph manipulation + +(<*>) :: CmmAGraph -> CmmAGraph -> CmmAGraph +(<*>) = appOL + +catAGraphs :: [CmmAGraph] -> CmmAGraph +catAGraphs = concatOL + +-- | creates a sequence "goto id; id:" as an AGraph +mkLabel :: BlockId -> CmmTickScope -> CmmAGraph +mkLabel bid scp = unitOL (CgLabel bid scp) + +-- | creates an open AGraph from a given node +mkMiddle :: CmmNode O O -> CmmAGraph +mkMiddle middle = unitOL (CgStmt middle) + +-- | creates a closed AGraph from a given node +mkLast :: CmmNode O C -> CmmAGraph +mkLast last = unitOL (CgLast last) + +-- | A labelled code block; should end in a last node +outOfLine :: BlockId -> CmmAGraphScoped -> CmmAGraph +outOfLine l (c,s) = unitOL (CgFork l c s) + +-- | allocate a fresh label for the entry point +lgraphOfAGraph :: CmmAGraphScoped -> UniqSM CmmGraph +lgraphOfAGraph g = do + u <- getUniqueM + return (labelAGraph (mkBlockId u) g) + +-- | use the given BlockId as the label of the entry point +labelAGraph :: BlockId -> CmmAGraphScoped -> CmmGraph +labelAGraph lbl ag = flattenCmmAGraph lbl ag + +---------- No-ops +mkNop :: CmmAGraph +mkNop = nilOL + +mkComment :: FastString -> CmmAGraph +mkComment fs + -- SDM: generating all those comments takes time, this saved about 4% for me + | debugIsOn = mkMiddle $ CmmComment fs + | otherwise = nilOL + +---------- Assignment and store +mkAssign :: CmmReg -> CmmExpr -> CmmAGraph +mkAssign l (CmmReg r) | l == r = mkNop +mkAssign l r = mkMiddle $ CmmAssign l r + +mkStore :: CmmExpr -> CmmExpr -> CmmAGraph +mkStore l r = mkMiddle $ CmmStore l r + +---------- Control transfer +mkJump :: DynFlags -> Convention -> CmmExpr + -> [CmmExpr] + -> UpdFrameOffset + -> CmmAGraph +mkJump dflags conv e actuals updfr_off = + lastWithArgs dflags Jump Old conv actuals updfr_off $ + toCall e Nothing updfr_off 0 + +-- | A jump where the caller says what the live GlobalRegs are. Used +-- for low-level hand-written Cmm. 
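+--
+-- A hypothetical use (lbl and updfr_off are made-up names for
+-- illustration): jump to a hand-written RTS entry point, declaring
+-- only R1 to be live:
+--
+-- > mkRawJump dflags (CmmLit (CmmLabel lbl)) updfr_off [VanillaReg 1 VGcPtr]
+--
+-- Unlike mkJump, no argument marshalling is generated; the register
+-- list is passed through to the CmmCall node unchanged.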
+mkRawJump :: DynFlags -> CmmExpr -> UpdFrameOffset -> [GlobalReg] + -> CmmAGraph +mkRawJump dflags e updfr_off vols = + lastWithArgs dflags Jump Old NativeNodeCall [] updfr_off $ + \arg_space _ -> toCall e Nothing updfr_off 0 arg_space vols + + +mkJumpExtra :: DynFlags -> Convention -> CmmExpr -> [CmmExpr] + -> UpdFrameOffset -> [CmmExpr] + -> CmmAGraph +mkJumpExtra dflags conv e actuals updfr_off extra_stack = + lastWithArgsAndExtraStack dflags Jump Old conv actuals updfr_off extra_stack $ + toCall e Nothing updfr_off 0 + +mkCbranch :: CmmExpr -> BlockId -> BlockId -> Maybe Bool -> CmmAGraph +mkCbranch pred ifso ifnot likely = + mkLast (CmmCondBranch pred ifso ifnot likely) + +mkSwitch :: CmmExpr -> SwitchTargets -> CmmAGraph +mkSwitch e tbl = mkLast $ CmmSwitch e tbl + +mkReturn :: DynFlags -> CmmExpr -> [CmmExpr] -> UpdFrameOffset + -> CmmAGraph +mkReturn dflags e actuals updfr_off = + lastWithArgs dflags Ret Old NativeReturn actuals updfr_off $ + toCall e Nothing updfr_off 0 + +mkBranch :: BlockId -> CmmAGraph +mkBranch bid = mkLast (CmmBranch bid) + +mkFinalCall :: DynFlags + -> CmmExpr -> CCallConv -> [CmmExpr] -> UpdFrameOffset + -> CmmAGraph +mkFinalCall dflags f _ actuals updfr_off = + lastWithArgs dflags Call Old NativeDirectCall actuals updfr_off $ + toCall f Nothing updfr_off 0 + +mkCallReturnsTo :: DynFlags -> CmmExpr -> Convention -> [CmmExpr] + -> BlockId + -> ByteOff + -> UpdFrameOffset + -> [CmmExpr] + -> CmmAGraph +mkCallReturnsTo dflags f callConv actuals ret_lbl ret_off updfr_off extra_stack = do + lastWithArgsAndExtraStack dflags Call (Young ret_lbl) callConv actuals + updfr_off extra_stack $ + toCall f (Just ret_lbl) updfr_off ret_off + +-- Like mkCallReturnsTo, but does not push the return address (it is assumed to be +-- already on the stack). +mkJumpReturnsTo :: DynFlags -> CmmExpr -> Convention -> [CmmExpr] + -> BlockId + -> ByteOff + -> UpdFrameOffset + -> CmmAGraph +mkJumpReturnsTo dflags f callConv actuals ret_lbl ret_off updfr_off = do + lastWithArgs dflags JumpRet (Young ret_lbl) callConv actuals updfr_off $ + toCall f (Just ret_lbl) updfr_off ret_off + +mkUnsafeCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> CmmAGraph +mkUnsafeCall t fs as = mkMiddle $ CmmUnsafeForeignCall t fs as + +-- | Construct a 'CmmUnwind' node for the given register and unwinding +-- expression. +mkUnwind :: GlobalReg -> CmmExpr -> CmmAGraph +mkUnwind r e = mkMiddle $ CmmUnwind [(r, Just e)] + +-------------------------------------------------------------------------- + + + + +-- Why are we inserting extra blocks that simply branch to the successors? +-- Because in addition to the branch instruction, @mkBranch@ will insert +-- a necessary adjustment to the stack pointer. + + +-- For debugging purposes, we can stub out dead stack slots: +stackStubExpr :: Width -> CmmExpr +stackStubExpr w = CmmLit (CmmInt 0 w) + +-- When we copy in parameters, we usually want to put overflow +-- parameters on the stack, but sometimes we want to pass the +-- variables in their spill slots. Therefore, for copying arguments +-- and results, we provide different functions to pass the arguments +-- in an overflow area and to pass them in spill slots. 
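+--
+-- As a sketch of the first flavour (x and y standing for the formals
+-- of the function being compiled):
+--
+-- > let (arg_bytes, arg_regs, copies) = copyInOflow dflags NativeNodeCall Old [x, y] []
+--
+-- 'copies' is prepended to the proc's entry code, and 'arg_regs' says
+-- which GlobalRegs the parameters arrive in.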
+copyInOflow :: DynFlags -> Convention -> Area + -> [CmmFormal] + -> [CmmFormal] + -> (Int, [GlobalReg], CmmAGraph) + +copyInOflow dflags conv area formals extra_stk + = (offset, gregs, catAGraphs $ map mkMiddle nodes) + where (offset, gregs, nodes) = copyIn dflags conv area formals extra_stk + +-- Return the number of bytes used for copying arguments, as well as the +-- instructions to copy the arguments. +copyIn :: DynFlags -> Convention -> Area + -> [CmmFormal] + -> [CmmFormal] + -> (ByteOff, [GlobalReg], [CmmNode O O]) +copyIn dflags conv area formals extra_stk + = (stk_size, [r | (_, RegisterParam r) <- args], map ci (stk_args ++ args)) + where + -- See Note [Width of parameters] + ci (reg, RegisterParam r@(VanillaReg {})) = + let local = CmmLocal reg + global = CmmReg (CmmGlobal r) + width = cmmRegWidth dflags local + expr + | width == wordWidth dflags = global + | width < wordWidth dflags = + CmmMachOp (MO_XX_Conv (wordWidth dflags) width) [global] + | otherwise = panic "Parameter width greater than word width" + + in CmmAssign local expr + + -- Non VanillaRegs + ci (reg, RegisterParam r) = + CmmAssign (CmmLocal reg) (CmmReg (CmmGlobal r)) + + ci (reg, StackParam off) + | isBitsType $ localRegType reg + , typeWidth (localRegType reg) < wordWidth dflags = + let + stack_slot = (CmmLoad (CmmStackSlot area off) (cmmBits $ wordWidth dflags)) + local = CmmLocal reg + width = cmmRegWidth dflags local + expr = CmmMachOp (MO_XX_Conv (wordWidth dflags) width) [stack_slot] + in CmmAssign local expr + + | otherwise = + CmmAssign (CmmLocal reg) (CmmLoad (CmmStackSlot area off) ty) + where ty = localRegType reg + + init_offset = widthInBytes (wordWidth dflags) -- infotable + + (stk_off, stk_args) = assignStack dflags init_offset localRegType extra_stk + + (stk_size, args) = assignArgumentsPos dflags stk_off conv + localRegType formals + +-- Factoring out the common parts of the copyout functions yielded something +-- more complicated: + +data Transfer = Call | JumpRet | Jump | Ret deriving Eq + +copyOutOflow :: DynFlags -> Convention -> Transfer -> Area -> [CmmExpr] + -> UpdFrameOffset + -> [CmmExpr] -- extra stack args + -> (Int, [GlobalReg], CmmAGraph) + +-- Generate code to move the actual parameters into the locations +-- required by the calling convention. This includes a store for the +-- return address. +-- +-- The argument layout function ignores the pointer to the info table, +-- so we slot that in here. When copying-out to a young area, we set +-- the info table for return and adjust the offsets of the other +-- parameters. If this is a call instruction, we adjust the offsets +-- of the other parameters. 
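+--
+-- For a Call to a young area k, for example, the return address is
+-- slotted in ahead of the arguments, giving roughly (a sketch):
+--
+-- Young k: [ CmmBlock k | arg1 | arg2 | ... ]
+--
+-- For JumpRet the return address is assumed to be on the stack
+-- already, so only the offsets are adjusted.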
+copyOutOflow dflags conv transfer area actuals updfr_off extra_stack_stuff + = (stk_size, regs, graph) + where + (regs, graph) = foldr co ([], mkNop) (setRA ++ args ++ stack_params) + + -- See Note [Width of parameters] + co (v, RegisterParam r@(VanillaReg {})) (rs, ms) = + let width = cmmExprWidth dflags v + value + | width == wordWidth dflags = v + | width < wordWidth dflags = + CmmMachOp (MO_XX_Conv width (wordWidth dflags)) [v] + | otherwise = panic "Parameter width greater than word width" + + in (r:rs, mkAssign (CmmGlobal r) value <*> ms) + + -- Non VanillaRegs + co (v, RegisterParam r) (rs, ms) = + (r:rs, mkAssign (CmmGlobal r) v <*> ms) + + -- See Note [Width of parameters] + co (v, StackParam off) (rs, ms) + = (rs, mkStore (CmmStackSlot area off) (value v) <*> ms) + + width v = cmmExprWidth dflags v + value v + | isBitsType $ cmmExprType dflags v + , width v < wordWidth dflags = + CmmMachOp (MO_XX_Conv (width v) (wordWidth dflags)) [v] + | otherwise = v + + (setRA, init_offset) = + case area of + Young id -> -- Generate a store instruction for + -- the return address if making a call + case transfer of + Call -> + ([(CmmLit (CmmBlock id), StackParam init_offset)], + widthInBytes (wordWidth dflags)) + JumpRet -> + ([], + widthInBytes (wordWidth dflags)) + _other -> + ([], 0) + Old -> ([], updfr_off) + + (extra_stack_off, stack_params) = + assignStack dflags init_offset (cmmExprType dflags) extra_stack_stuff + + args :: [(CmmExpr, ParamLocation)] -- The argument and where to put it + (stk_size, args) = assignArgumentsPos dflags extra_stack_off conv + (cmmExprType dflags) actuals + + +-- Note [Width of parameters] +-- +-- Consider passing a small (< word width) primitive like Int8# to a function. +-- It's actually non-trivial to do this without extending/narrowing: +-- * Global registers are considered to have native word width (i.e., 64-bits on +-- x86-64), so CmmLint would complain if we assigned an 8-bit parameter to a +-- global register. +-- * Same problem exists with LLVM IR. +-- * Lowering gets harder since on x86-32 not every register exposes its lower +-- 8 bits (e.g., for %eax we can use %al, but there isn't a corresponding +-- 8-bit register for %edi). So we would either need to extend/narrow anyway, +-- or complicate the calling convention. +-- * Passing a small integer in a stack slot, which has native word width, +-- requires extending to word width when writing to the stack and narrowing +-- when reading off the stack (see #16258). +-- So instead, we always extend every parameter smaller than native word width +-- in copyOutOflow and then truncate it back to the expected width in copyIn. +-- Note that we do this in cmm using MO_XX_Conv to avoid requiring +-- zero-/sign-extending - it's up to a backend to handle this in a most +-- efficient way (e.g., a simple register move or a smaller size store). +-- This convention (of ignoring the upper bits) is different from some C ABIs, +-- e.g. all PowerPC ELF ABIs, that require sign or zero extending parameters. 
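+--
+-- As a concrete sketch (register and width names illustrative, 64-bit):
+-- passing an Int8# argument x becomes
+--
+-- R2 = MO_XX_Conv W8 W64 (x) -- copyOutOflow: widen to word
+-- ...
+-- x' = MO_XX_Conv W64 W8 (R2) -- copyIn: truncate back
+--
+-- with no guarantee about the upper 56 bits in between.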
+-- +-- There was some discussion about this on this PR: +-- https://github.com/ghc-proposals/ghc-proposals/pull/74 + + +mkCallEntry :: DynFlags -> Convention -> [CmmFormal] -> [CmmFormal] + -> (Int, [GlobalReg], CmmAGraph) +mkCallEntry dflags conv formals extra_stk + = copyInOflow dflags conv Old formals extra_stk + +lastWithArgs :: DynFlags -> Transfer -> Area -> Convention -> [CmmExpr] + -> UpdFrameOffset + -> (ByteOff -> [GlobalReg] -> CmmAGraph) + -> CmmAGraph +lastWithArgs dflags transfer area conv actuals updfr_off last = + lastWithArgsAndExtraStack dflags transfer area conv actuals + updfr_off noExtraStack last + +lastWithArgsAndExtraStack :: DynFlags + -> Transfer -> Area -> Convention -> [CmmExpr] + -> UpdFrameOffset -> [CmmExpr] + -> (ByteOff -> [GlobalReg] -> CmmAGraph) + -> CmmAGraph +lastWithArgsAndExtraStack dflags transfer area conv actuals updfr_off + extra_stack last = + copies <*> last outArgs regs + where + (outArgs, regs, copies) = copyOutOflow dflags conv transfer area actuals + updfr_off extra_stack + + +noExtraStack :: [CmmExpr] +noExtraStack = [] + +toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff + -> ByteOff -> [GlobalReg] + -> CmmAGraph +toCall e cont updfr_off res_space arg_space regs = + mkLast $ CmmCall e cont regs arg_space res_space updfr_off diff --git a/compiler/GHC/Cmm/Info.hs b/compiler/GHC/Cmm/Info.hs new file mode 100644 index 0000000000..a10db2b292 --- /dev/null +++ b/compiler/GHC/Cmm/Info.hs @@ -0,0 +1,593 @@ +{-# LANGUAGE CPP #-} +module GHC.Cmm.Info ( + mkEmptyContInfoTable, + cmmToRawCmm, + mkInfoTable, + srtEscape, + + -- info table accessors + closureInfoPtr, + entryCode, + getConstrTag, + cmmGetClosureType, + infoTable, + infoTableConstrTag, + infoTableSrtBitmap, + infoTableClosureType, + infoTablePtrs, + infoTableNonPtrs, + funInfoTable, + funInfoArity, + + -- info table sizes and offsets + stdInfoTableSizeW, + fixedInfoTableSizeW, + profInfoTableSizeW, + maxStdInfoTableSizeW, + maxRetInfoTableSizeW, + stdInfoTableSizeB, + conInfoTableSizeB, + stdSrtBitmapOffset, + stdClosureTypeOffset, + stdPtrsOffset, stdNonPtrsOffset, +) where + +#include "HsVersions.h" + +import GhcPrelude + +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.CLabel +import GHC.Runtime.Layout +import GHC.Data.Bitmap +import Stream (Stream) +import qualified Stream +import GHC.Cmm.Dataflow.Collections + +import GHC.Platform +import Maybes +import DynFlags +import ErrUtils (withTimingSilent) +import Panic +import UniqSupply +import MonadUtils +import Util +import Outputable + +import Data.ByteString (ByteString) +import Data.Bits + +-- When we split at proc points, we need an empty info table. +mkEmptyContInfoTable :: CLabel -> CmmInfoTable +mkEmptyContInfoTable info_lbl + = CmmInfoTable { cit_lbl = info_lbl + , cit_rep = mkStackRep [] + , cit_prof = NoProfilingInfo + , cit_srt = Nothing + , cit_clo = Nothing } + +cmmToRawCmm :: DynFlags -> Stream IO CmmGroup a + -> IO (Stream IO RawCmmGroup a) +cmmToRawCmm dflags cmms + = do { uniqs <- mkSplitUniqSupply 'i' + ; let do_one :: UniqSupply -> [CmmDecl] -> IO (UniqSupply, [RawCmmDecl]) + do_one uniqs cmm = + -- NB. strictness fixes a space leak. DO NOT REMOVE. 
+ withTimingSilent dflags (text "Cmm -> Raw Cmm") + forceRes $ + case initUs uniqs $ concatMapM (mkInfoTable dflags) cmm of + (b,uniqs') -> return (uniqs',b) + ; return (snd <$> Stream.mapAccumL_ do_one uniqs cmms) + } + + where forceRes (uniqs, rawcmms) = + uniqs `seq` foldr (\decl r -> decl `seq` r) () rawcmms + +-- Make a concrete info table, represented as a list of CmmStatic +-- (it can't be simply a list of Word, because the SRT field is +-- represented by a label+offset expression). +-- +-- With tablesNextToCode, the layout is +-- <reversed variable part> +-- <normal forward StgInfoTable, but without +-- an entry point at the front> +-- <code> +-- +-- Without tablesNextToCode, the layout of an info table is +-- <entry label> +-- <normal forward rest of StgInfoTable> +-- <forward variable part> +-- +-- See includes/rts/storage/InfoTables.h +-- +-- For return-points these are as follows +-- +-- Tables next to code: +-- +-- <srt slot> +-- <standard info table> +-- ret-addr --> <entry code (if any)> +-- +-- Not tables-next-to-code: +-- +-- ret-addr --> <ptr to entry code> +-- <standard info table> +-- <srt slot> +-- +-- * The SRT slot is only there if there is SRT info to record + +mkInfoTable :: DynFlags -> CmmDecl -> UniqSM [RawCmmDecl] +mkInfoTable _ (CmmData sec dat) + = return [CmmData sec dat] + +mkInfoTable dflags proc@(CmmProc infos entry_lbl live blocks) + -- + -- in the non-tables-next-to-code case, procs can have at most a + -- single info table associated with the entry label of the proc. + -- + | not (tablesNextToCode dflags) + = case topInfoTable proc of -- must be at most one + -- no info table + Nothing -> + return [CmmProc mapEmpty entry_lbl live blocks] + + Just info@CmmInfoTable { cit_lbl = info_lbl } -> do + (top_decls, (std_info, extra_bits)) <- + mkInfoTableContents dflags info Nothing + let + rel_std_info = map (makeRelativeRefTo dflags info_lbl) std_info + rel_extra_bits = map (makeRelativeRefTo dflags info_lbl) extra_bits + -- + -- Separately emit info table (with the function entry + -- point as first entry) and the entry code + -- + return (top_decls ++ + [CmmProc mapEmpty entry_lbl live blocks, + mkRODataLits info_lbl + (CmmLabel entry_lbl : rel_std_info ++ rel_extra_bits)]) + + -- + -- With tables-next-to-code, we can have many info tables, + -- associated with some of the BlockIds of the proc. For each info + -- table we need to turn it into CmmStatics, and collect any new + -- CmmDecls that arise from doing so. + -- + | otherwise + = do + (top_declss, raw_infos) <- + unzip `fmap` mapM do_one_info (mapToList (info_tbls infos)) + return (concat top_declss ++ + [CmmProc (mapFromList raw_infos) entry_lbl live blocks]) + + where + do_one_info (lbl,itbl) = do + (top_decls, (std_info, extra_bits)) <- + mkInfoTableContents dflags itbl Nothing + let + info_lbl = cit_lbl itbl + rel_std_info = map (makeRelativeRefTo dflags info_lbl) std_info + rel_extra_bits = map (makeRelativeRefTo dflags info_lbl) extra_bits + -- + return (top_decls, (lbl, Statics info_lbl $ map CmmStaticLit $ + reverse rel_extra_bits ++ rel_std_info)) + +----------------------------------------------------- +type InfoTableContents = ( [CmmLit] -- The standard part + , [CmmLit] ) -- The "extra bits" +-- These Lits have *not* had mkRelativeTo applied to them + +mkInfoTableContents :: DynFlags + -> CmmInfoTable + -> Maybe Int -- Override default RTS type tag? 
+ -> UniqSM ([RawCmmDecl], -- Auxiliary top decls + InfoTableContents) -- Info tbl + extra bits + +mkInfoTableContents dflags + info@(CmmInfoTable { cit_lbl = info_lbl + , cit_rep = smrep + , cit_prof = prof + , cit_srt = srt }) + mb_rts_tag + | RTSRep rts_tag rep <- smrep + = mkInfoTableContents dflags info{cit_rep = rep} (Just rts_tag) + -- Completely override the rts_tag that mkInfoTableContents would + -- otherwise compute, with the rts_tag stored in the RTSRep + -- (which in turn came from a handwritten .cmm file) + + | StackRep frame <- smrep + = do { (prof_lits, prof_data) <- mkProfLits dflags prof + ; let (srt_label, srt_bitmap) = mkSRTLit dflags info_lbl srt + ; (liveness_lit, liveness_data) <- mkLivenessBits dflags frame + ; let + std_info = mkStdInfoTable dflags prof_lits rts_tag srt_bitmap liveness_lit + rts_tag | Just tag <- mb_rts_tag = tag + | null liveness_data = rET_SMALL -- Fits in extra_bits + | otherwise = rET_BIG -- Does not; extra_bits is + -- a label + ; return (prof_data ++ liveness_data, (std_info, srt_label)) } + + | HeapRep _ ptrs nonptrs closure_type <- smrep + = do { let layout = packIntsCLit dflags ptrs nonptrs + ; (prof_lits, prof_data) <- mkProfLits dflags prof + ; let (srt_label, srt_bitmap) = mkSRTLit dflags info_lbl srt + ; (mb_srt_field, mb_layout, extra_bits, ct_data) + <- mk_pieces closure_type srt_label + ; let std_info = mkStdInfoTable dflags prof_lits + (mb_rts_tag `orElse` rtsClosureType smrep) + (mb_srt_field `orElse` srt_bitmap) + (mb_layout `orElse` layout) + ; return (prof_data ++ ct_data, (std_info, extra_bits)) } + where + mk_pieces :: ClosureTypeInfo -> [CmmLit] + -> UniqSM ( Maybe CmmLit -- Override the SRT field with this + , Maybe CmmLit -- Override the layout field with this + , [CmmLit] -- "Extra bits" for info table + , [RawCmmDecl]) -- Auxiliary data decls + mk_pieces (Constr con_tag con_descr) _no_srt -- A data constructor + = do { (descr_lit, decl) <- newStringLit con_descr + ; return ( Just (CmmInt (fromIntegral con_tag) + (halfWordWidth dflags)) + , Nothing, [descr_lit], [decl]) } + + mk_pieces Thunk srt_label + = return (Nothing, Nothing, srt_label, []) + + mk_pieces (ThunkSelector offset) _no_srt + = return (Just (CmmInt 0 (halfWordWidth dflags)), + Just (mkWordCLit dflags (fromIntegral offset)), [], []) + -- Layout known (one free var); we use the layout field for offset + + mk_pieces (Fun arity (ArgSpec fun_type)) srt_label + = do { let extra_bits = packIntsCLit dflags fun_type arity : srt_label + ; return (Nothing, Nothing, extra_bits, []) } + + mk_pieces (Fun arity (ArgGen arg_bits)) srt_label + = do { (liveness_lit, liveness_data) <- mkLivenessBits dflags arg_bits + ; let fun_type | null liveness_data = aRG_GEN + | otherwise = aRG_GEN_BIG + extra_bits = [ packIntsCLit dflags fun_type arity ] + ++ (if inlineSRT dflags then [] else [ srt_lit ]) + ++ [ liveness_lit, slow_entry ] + ; return (Nothing, Nothing, extra_bits, liveness_data) } + where + slow_entry = CmmLabel (toSlowEntryLbl info_lbl) + srt_lit = case srt_label of + [] -> mkIntCLit dflags 0 + (lit:_rest) -> ASSERT( null _rest ) lit + + mk_pieces other _ = pprPanic "mk_pieces" (ppr other) + +mkInfoTableContents _ _ _ = panic "mkInfoTableContents" -- NonInfoTable dealt with earlier + +packIntsCLit :: DynFlags -> Int -> Int -> CmmLit +packIntsCLit dflags a b = packHalfWordsCLit dflags + (toStgHalfWord dflags (fromIntegral a)) + (toStgHalfWord dflags (fromIntegral b)) + + +mkSRTLit :: DynFlags + -> CLabel + -> Maybe CLabel + -> ([CmmLit], -- srt_label, if any + CmmLit) -- 
srt_bitmap +mkSRTLit dflags info_lbl (Just lbl) + | inlineSRT dflags + = ([], CmmLabelDiffOff lbl info_lbl 0 (halfWordWidth dflags)) +mkSRTLit dflags _ Nothing = ([], CmmInt 0 (halfWordWidth dflags)) +mkSRTLit dflags _ (Just lbl) = ([CmmLabel lbl], CmmInt 1 (halfWordWidth dflags)) + + +-- | Is the SRT offset field inline in the info table on this platform? +-- +-- See the section "Referring to an SRT from the info table" in +-- Note [SRTs] in GHC.Cmm.Info.Build +inlineSRT :: DynFlags -> Bool +inlineSRT dflags = platformArch (targetPlatform dflags) == ArchX86_64 + && tablesNextToCode dflags + +------------------------------------------------------------------------- +-- +-- Lay out the info table and handle relative offsets +-- +------------------------------------------------------------------------- + +-- This function takes +-- * the standard info table portion (StgInfoTable) +-- * the "extra bits" (StgFunInfoExtraRev etc.) +-- * the entry label +-- * the code +-- and lays them out in memory, producing a list of RawCmmDecl + +------------------------------------------------------------------------- +-- +-- Position independent code +-- +------------------------------------------------------------------------- +-- In order to support position independent code, we mustn't put absolute +-- references into read-only space. Info tables in the tablesNextToCode +-- case must be in .text, which is read-only, so we doctor the CmmLits +-- to use relative offsets instead. + +-- Note that this is done even when the -fPIC flag is not specified, +-- as we want to keep binary compatibility between PIC and non-PIC. + +makeRelativeRefTo :: DynFlags -> CLabel -> CmmLit -> CmmLit + +makeRelativeRefTo dflags info_lbl (CmmLabel lbl) + | tablesNextToCode dflags + = CmmLabelDiffOff lbl info_lbl 0 (wordWidth dflags) +makeRelativeRefTo dflags info_lbl (CmmLabelOff lbl off) + | tablesNextToCode dflags + = CmmLabelDiffOff lbl info_lbl off (wordWidth dflags) +makeRelativeRefTo _ _ lit = lit + + +------------------------------------------------------------------------- +-- +-- Build a liveness mask for the stack layout +-- +------------------------------------------------------------------------- + +-- There are four kinds of things on the stack: +-- +-- - pointer variables (bound in the environment) +-- - non-pointer variables (bound in the environment) +-- - free slots (recorded in the stack free list) +-- - non-pointer data slots (recorded in the stack free list) +-- +-- The first two are represented with a 'Just' of a 'LocalReg'. +-- The last two with one or more 'Nothing' constructors. +-- Each 'Nothing' represents one used word. +-- +-- The head of the stack layout is the top of the stack and +-- the least-significant bit. + +mkLivenessBits :: DynFlags -> Liveness -> UniqSM (CmmLit, [RawCmmDecl]) + -- ^ Returns: + -- 1. The bitmap (literal value or label) + -- 2. Large bitmap CmmData if needed + +mkLivenessBits dflags liveness + | n_bits > mAX_SMALL_BITMAP_SIZE dflags -- does not fit in one word + = do { uniq <- getUniqueM + ; let bitmap_lbl = mkBitmapLabel uniq + ; return (CmmLabel bitmap_lbl, + [mkRODataLits bitmap_lbl lits]) } + + | otherwise -- Fits in one word + = return (mkStgWordCLit dflags bitmap_word, []) + where + n_bits = length liveness + + bitmap :: Bitmap + bitmap = mkBitmap dflags liveness + + small_bitmap = case bitmap of + [] -> toStgWord dflags 0 + [b] -> b + _ -> panic "mkLiveness" + bitmap_word = toStgWord dflags (fromIntegral n_bits) + .|. 
(small_bitmap `shiftL` bITMAP_BITS_SHIFT dflags) + + lits = mkWordCLit dflags (fromIntegral n_bits) + : map (mkStgWordCLit dflags) bitmap + -- The first word is the size. The structure must match + -- StgLargeBitmap in includes/rts/storage/InfoTable.h + +------------------------------------------------------------------------- +-- +-- Generating a standard info table +-- +------------------------------------------------------------------------- + +-- The standard bits of an info table. This part of the info table +-- corresponds to the StgInfoTable type defined in +-- includes/rts/storage/InfoTables.h. +-- +-- Its shape varies with ticky/profiling/tables next to code etc +-- so we can't use constant offsets from Constants + +mkStdInfoTable + :: DynFlags + -> (CmmLit,CmmLit) -- Closure type descr and closure descr (profiling) + -> Int -- Closure RTS tag + -> CmmLit -- SRT length + -> CmmLit -- layout field + -> [CmmLit] + +mkStdInfoTable dflags (type_descr, closure_descr) cl_type srt layout_lit + = -- Parallel revertible-black hole field + prof_info + -- Ticky info (none at present) + -- Debug info (none at present) + ++ [layout_lit, tag, srt] + + where + prof_info + | gopt Opt_SccProfilingOn dflags = [type_descr, closure_descr] + | otherwise = [] + + tag = CmmInt (fromIntegral cl_type) (halfWordWidth dflags) + +------------------------------------------------------------------------- +-- +-- Making string literals +-- +------------------------------------------------------------------------- + +mkProfLits :: DynFlags -> ProfilingInfo -> UniqSM ((CmmLit,CmmLit), [RawCmmDecl]) +mkProfLits dflags NoProfilingInfo = return ((zeroCLit dflags, zeroCLit dflags), []) +mkProfLits _ (ProfilingInfo td cd) + = do { (td_lit, td_decl) <- newStringLit td + ; (cd_lit, cd_decl) <- newStringLit cd + ; return ((td_lit,cd_lit), [td_decl,cd_decl]) } + +newStringLit :: ByteString -> UniqSM (CmmLit, GenCmmDecl CmmStatics info stmt) +newStringLit bytes + = do { uniq <- getUniqueM + ; return (mkByteStringCLit (mkStringLitLabel uniq) bytes) } + + +-- Misc utils + +-- | Value of the srt field of an info table when using an StgLargeSRT +srtEscape :: DynFlags -> StgHalfWord +srtEscape dflags = toStgHalfWord dflags (-1) + +------------------------------------------------------------------------- +-- +-- Accessing fields of an info table +-- +------------------------------------------------------------------------- + +-- | Wrap a 'CmmExpr' in an alignment check when @-falignment-sanitisation@ is +-- enabled. +wordAligned :: DynFlags -> CmmExpr -> CmmExpr +wordAligned dflags e + | gopt Opt_AlignmentSanitisation dflags + = CmmMachOp (MO_AlignmentCheck (wORD_SIZE dflags) (wordWidth dflags)) [e] + | otherwise + = e + +closureInfoPtr :: DynFlags -> CmmExpr -> CmmExpr +-- Takes a closure pointer and returns the info table pointer +closureInfoPtr dflags e = + CmmLoad (wordAligned dflags e) (bWord dflags) + +entryCode :: DynFlags -> CmmExpr -> CmmExpr +-- Takes an info pointer (the first word of a closure) +-- and returns its entry code +entryCode dflags e + | tablesNextToCode dflags = e + | otherwise = CmmLoad e (bWord dflags) + +getConstrTag :: DynFlags -> CmmExpr -> CmmExpr +-- Takes a closure pointer, and return the *zero-indexed* +-- constructor tag obtained from the info table +-- This lives in the SRT field of the info table +-- (constructors don't need SRTs). 
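+--
+-- A sketch of the expression built below (64-bit, tables-next-to-code,
+-- no profiling):
+--
+-- MO_UU_Conv W32 W64 [ I32[ I64[closure_ptr] - 4 ] ]
+--
+-- i.e. follow the info pointer, step back to the srt_bitmap half-word
+-- of the info table, and widen the tag to a full word.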
+getConstrTag dflags closure_ptr
+ = CmmMachOp (MO_UU_Conv (halfWordWidth dflags) (wordWidth dflags)) [infoTableConstrTag dflags info_table]
+ where
+ info_table = infoTable dflags (closureInfoPtr dflags closure_ptr)
+
+cmmGetClosureType :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes a closure pointer, and returns the closure type
+-- obtained from the info table
+cmmGetClosureType dflags closure_ptr
+ = CmmMachOp (MO_UU_Conv (halfWordWidth dflags) (wordWidth dflags)) [infoTableClosureType dflags info_table]
+ where
+ info_table = infoTable dflags (closureInfoPtr dflags closure_ptr)
+
+infoTable :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes an info pointer (the first word of a closure)
+-- and returns a pointer to the first word of the standard-form
+-- info table, excluding the entry-code word (if present)
+infoTable dflags info_ptr
+ | tablesNextToCode dflags = cmmOffsetB dflags info_ptr (- stdInfoTableSizeB dflags)
+ | otherwise = cmmOffsetW dflags info_ptr 1 -- Past the entry code pointer
+
+infoTableConstrTag :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes an info table pointer (from infoTable) and returns the constr tag
+-- field of the info table (same as the srt_bitmap field)
+infoTableConstrTag = infoTableSrtBitmap
+
+infoTableSrtBitmap :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes an info table pointer (from infoTable) and returns the srt_bitmap
+-- field of the info table
+infoTableSrtBitmap dflags info_tbl
+ = CmmLoad (cmmOffsetB dflags info_tbl (stdSrtBitmapOffset dflags)) (bHalfWord dflags)
+
+infoTableClosureType :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes an info table pointer (from infoTable) and returns the closure type
+-- field of the info table.
+infoTableClosureType dflags info_tbl
+ = CmmLoad (cmmOffsetB dflags info_tbl (stdClosureTypeOffset dflags)) (bHalfWord dflags)
+
+infoTablePtrs :: DynFlags -> CmmExpr -> CmmExpr
+infoTablePtrs dflags info_tbl
+ = CmmLoad (cmmOffsetB dflags info_tbl (stdPtrsOffset dflags)) (bHalfWord dflags)
+
+infoTableNonPtrs :: DynFlags -> CmmExpr -> CmmExpr
+infoTableNonPtrs dflags info_tbl
+ = CmmLoad (cmmOffsetB dflags info_tbl (stdNonPtrsOffset dflags)) (bHalfWord dflags)
+
+funInfoTable :: DynFlags -> CmmExpr -> CmmExpr
+-- Takes the info pointer of a function,
+-- and returns a pointer to the first word of the StgFunInfoExtra struct
+-- in the info table.
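+--
+-- With tables-next-to-code the extra struct sits at *lower* addresses
+-- than the standard info table and is laid out reversed, hence the
+-- "Rev" in StgFunInfoExtraRev (a sketch):
+--
+-- | StgFunInfoExtraRev | StgInfoTable | entry code ...
+-- ^ result ^ info_ptr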
+funInfoTable dflags info_ptr + | tablesNextToCode dflags + = cmmOffsetB dflags info_ptr (- stdInfoTableSizeB dflags - sIZEOF_StgFunInfoExtraRev dflags) + | otherwise + = cmmOffsetW dflags info_ptr (1 + stdInfoTableSizeW dflags) + -- Past the entry code pointer + +-- Takes the info pointer of a function, returns the function's arity +funInfoArity :: DynFlags -> CmmExpr -> CmmExpr +funInfoArity dflags iptr + = cmmToWord dflags (cmmLoadIndex dflags rep fun_info (offset `div` rep_bytes)) + where + fun_info = funInfoTable dflags iptr + rep = cmmBits (widthFromBytes rep_bytes) + + (rep_bytes, offset) + | tablesNextToCode dflags = ( pc_REP_StgFunInfoExtraRev_arity pc + , oFFSET_StgFunInfoExtraRev_arity dflags ) + | otherwise = ( pc_REP_StgFunInfoExtraFwd_arity pc + , oFFSET_StgFunInfoExtraFwd_arity dflags ) + + pc = platformConstants dflags + +----------------------------------------------------------------------------- +-- +-- Info table sizes & offsets +-- +----------------------------------------------------------------------------- + +stdInfoTableSizeW :: DynFlags -> WordOff +-- The size of a standard info table varies with profiling/ticky etc, +-- so we can't get it from Constants +-- It must vary in sync with mkStdInfoTable +stdInfoTableSizeW dflags + = fixedInfoTableSizeW + + if gopt Opt_SccProfilingOn dflags + then profInfoTableSizeW + else 0 + +fixedInfoTableSizeW :: WordOff +fixedInfoTableSizeW = 2 -- layout, type + +profInfoTableSizeW :: WordOff +profInfoTableSizeW = 2 + +maxStdInfoTableSizeW :: WordOff +maxStdInfoTableSizeW = + 1 {- entry, when !tablesNextToCode -} + + fixedInfoTableSizeW + + profInfoTableSizeW + +maxRetInfoTableSizeW :: WordOff +maxRetInfoTableSizeW = + maxStdInfoTableSizeW + + 1 {- srt label -} + +stdInfoTableSizeB :: DynFlags -> ByteOff +stdInfoTableSizeB dflags = stdInfoTableSizeW dflags * wORD_SIZE dflags + +stdSrtBitmapOffset :: DynFlags -> ByteOff +-- Byte offset of the SRT bitmap half-word which is +-- in the *higher-addressed* part of the type_lit +stdSrtBitmapOffset dflags = stdInfoTableSizeB dflags - halfWordSize dflags + +stdClosureTypeOffset :: DynFlags -> ByteOff +-- Byte offset of the closure type half-word +stdClosureTypeOffset dflags = stdInfoTableSizeB dflags - wORD_SIZE dflags + +stdPtrsOffset, stdNonPtrsOffset :: DynFlags -> ByteOff +stdPtrsOffset dflags = stdInfoTableSizeB dflags - 2 * wORD_SIZE dflags +stdNonPtrsOffset dflags = stdInfoTableSizeB dflags - 2 * wORD_SIZE dflags + halfWordSize dflags + +conInfoTableSizeB :: DynFlags -> Int +conInfoTableSizeB dflags = stdInfoTableSizeB dflags + wORD_SIZE dflags diff --git a/compiler/GHC/Cmm/Info/Build.hs b/compiler/GHC/Cmm/Info/Build.hs new file mode 100644 index 0000000000..1ba79befcd --- /dev/null +++ b/compiler/GHC/Cmm/Info/Build.hs @@ -0,0 +1,892 @@ +{-# LANGUAGE GADTs, BangPatterns, RecordWildCards, + GeneralizedNewtypeDeriving, NondecreasingIndentation, TupleSections #-} + +module GHC.Cmm.Info.Build + ( CAFSet, CAFEnv, cafAnal + , doSRTs, ModuleSRTInfo, emptySRT + ) where + +import GhcPrelude hiding (succ) + +import Id +import GHC.Cmm.BlockId +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow +import Module +import GHC.Platform +import Digraph +import GHC.Cmm.CLabel +import GHC.Cmm +import GHC.Cmm.Utils +import DynFlags +import Maybes +import Outputable +import GHC.Runtime.Layout +import UniqSupply +import CostCentre +import GHC.StgToCmm.Heap + +import Control.Monad +import Data.Map (Map) 
+import qualified Data.Map as Map +import Data.Set (Set) +import qualified Data.Set as Set +import Data.Tuple +import Control.Monad.Trans.State +import Control.Monad.Trans.Class + + +{- Note [SRTs] + +SRTs are the mechanism by which the garbage collector can determine +the live CAFs in the program. + +Representation +^^^^^^^^^^^^^^ + ++------+ +| info | +| | +-----+---+---+---+ +| -------->|SRT_2| | | | | 0 | +|------| +-----+-|-+-|-+---+ +| | | | +| code | | | +| | v v + +An SRT is simply an object in the program's data segment. It has the +same representation as a static constructor. There are 16 +pre-compiled SRT info tables: stg_SRT_1_info, .. stg_SRT_16_info, +representing SRT objects with 1-16 pointers, respectively. + +The entries of an SRT object point to static closures, which are either +- FUN_STATIC, THUNK_STATIC or CONSTR +- Another SRT (actually just a CONSTR) + +The final field of the SRT is the static link field, used by the +garbage collector to chain together static closures that it visits and +to determine whether a static closure has been visited or not. (see +Note [STATIC_LINK fields]) + +By traversing the transitive closure of an SRT, the GC will reach all +of the CAFs that are reachable from the code associated with this SRT. + +If we need to create an SRT with more than 16 entries, we build a +chain of SRT objects with all but the last having 16 entries. + ++-----+---+- -+---+---+ +|SRT16| | | | | | 0 | ++-----+-|-+- -+-|-+---+ + | | + v v + +----+---+---+---+ + |SRT2| | | | | 0 | + +----+-|-+-|-+---+ + | | + | | + v v + +Referring to an SRT from the info table +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following things have SRTs: + +- Static functions (FUN) +- Static thunks (THUNK), ie. CAFs +- Continuations (RET_SMALL, etc.) + +In each case, the info table points to the SRT. + +- info->srt is zero if there's no SRT, otherwise: +- info->srt == 1 and info->f.srt_offset points to the SRT + +e.g. for a FUN with an SRT: + +StgFunInfoTable +------+ + info->f.srt_offset | ------------> offset to SRT object +StgStdInfoTable +------+ + info->layout.ptrs | ... | + info->layout.nptrs | ... | + info->srt | 1 | + info->type | ... | + |------| + +On x86_64, we optimise the info table representation further. The +offset to the SRT can be stored in 32 bits (all code lives within a +2GB region in x86_64's small memory model), so we can save a word in +the info table by storing the srt_offset in the srt field, which is +half a word. + +On x86_64 with TABLES_NEXT_TO_CODE (except on MachO, due to #15169): + +- info->srt is zero if there's no SRT, otherwise: +- info->srt is an offset from the info pointer to the SRT object + +StgStdInfoTable +------+ + info->layout.ptrs | | + info->layout.nptrs | | + info->srt | ------------> offset to SRT object + |------| + + +EXAMPLE +^^^^^^^ + +f = \x. ... g ... + where + g = \y. ... h ... c1 ... + h = \z. ... c2 ... + +c1 & c2 are CAFs + +g and h are local functions, but they have no static closures. When +we generate code for f, we start with a CmmGroup of four CmmDecls: + + [ f_closure, f_entry, g_entry, h_entry ] + +we process each CmmDecl separately in cpsTop, giving us a list of +CmmDecls. e.g. for f_entry, we might end up with + + [ f_entry, f1_ret, f2_proc ] + +where f1_ret is a return point, and f2_proc is a proc-point. 
We have +a CAFSet for each of these CmmDecls, let's suppose they are + + [ f_entry{g_info}, f1_ret{g_info}, f2_proc{} ] + [ g_entry{h_info, c1_closure} ] + [ h_entry{c2_closure} ] + +Next, we make an SRT for each of these functions: + + f_srt : [g_info] + g_srt : [h_info, c1_closure] + h_srt : [c2_closure] + +Now, for g_info and h_info, we want to refer to the SRTs for g and h +respectively, which we'll label g_srt and h_srt: + + f_srt : [g_srt] + g_srt : [h_srt, c1_closure] + h_srt : [c2_closure] + +Now, when an SRT has a single entry, we don't actually generate an SRT +closure for it, instead we just replace references to it with its +single element. So, since h_srt == c2_closure, we have + + f_srt : [g_srt] + g_srt : [c2_closure, c1_closure] + h_srt : [c2_closure] + +and the only SRT closure we generate is + + g_srt = SRT_2 [c2_closure, c1_closure] + + +Optimisations +^^^^^^^^^^^^^ + +To reduce the code size overhead and the cost of traversing SRTs in +the GC, we want to simplify SRTs where possible. We therefore apply +the following optimisations. Each has a [keyword]; search for the +keyword in the code below to see where the optimisation is +implemented. + +1. [Inline] we never create an SRT with a single entry, instead we + point to the single entry directly from the info table. + + i.e. instead of + + +------+ + | info | + | | +-----+---+---+ + | -------->|SRT_1| | | 0 | + |------| +-----+-|-+---+ + | | | + | code | | + | | v + C + + we can point directly to the closure: + + +------+ + | info | + | | + | -------->C + |------| + | | + | code | + | | + + + Furthermore, the SRT for any code that refers to this info table + can point directly to C. + + The exception to this is when we're doing dynamic linking. In that + case, if the closure is not locally defined then we can't point to + it directly from the info table, because this is the text section + which cannot contain runtime relocations. In this case we skip this + optimisation and generate the singleton SRT, because SRTs are in the + data section and *can* have relocatable references. + +2. [FUN] A static function closure can also be an SRT, we simply put + the SRT entries as fields in the static closure. This makes a lot + of sense: the static references are just like the free variables of + the FUN closure. + + i.e. instead of + + f_closure: + +-----+---+ + | | | 0 | + +- |--+---+ + | +------+ + | | info | f_srt: + | | | +-----+---+---+---+ + | | -------->|SRT_2| | | | + 0 | + `----------->|------| +-----+-|-+-|-+---+ + | | | | + | code | | | + | | v v + + + We can generate: + + f_closure: + +-----+---+---+---+ + | | | | | | | 0 | + +- |--+-|-+-|-+---+ + | | | +------+ + | v v | info | + | | | + | | 0 | + `----------->|------| + | | + | code | + | | + + + (note: we can't do this for THUNKs, because the thunk gets + overwritten when it is entered, so we wouldn't be able to share + this SRT with other info tables that want to refer to it (see + [Common] below). FUNs are immutable so don't have this problem.) + +3. [Common] Identical SRTs can be commoned up. + +4. [Filter] If an SRT A refers to an SRT B and a closure C, and B also + refers to C (perhaps transitively), then we can omit the reference + to C from A. + + +Note that there are many other optimisations that we could do, but +aren't implemented. In general, we could omit any reference from an +SRT if everything reachable from it is also reachable from the other +fields in the SRT. Our [Filter] optimisation is a special case of +this. 
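+
+For example, with
+
+A = {B,C}
+B = {C}
+
+[Filter] drops C from A, giving A = {B}: everything reachable through
+C is already reachable through B.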
+
+Another opportunity we don't exploit is this:
+
+A = {X,Y,Z}
+B = {Y,Z}
+C = {X,B}
+
+Here we could use C = {A} and therefore [Inline] C = A.
+-}
+
+-- ---------------------------------------------------------------------
+{- Note [Invalid optimisation: shortcutting]
+
+You might think that if we have something like
+
+A's SRT = {B}
+B's SRT = {X}
+
+that we could replace the reference to B in A's SRT with X.
+
+A's SRT = {X}
+B's SRT = {X}
+
+and thereby perhaps save a little work at runtime, because we don't
+have to visit B.
+
+But this is NOT valid.
+
+Consider these cases:
+
+0. B can't be a constructor, because constructors don't have SRTs
+
+1. B is a CAF. This is the easy one. Obviously we want A's SRT to
+ point to B, so that it keeps B alive.
+
+2. B is a function. This is the tricky one. The reason we can't
+shortcut in this case is that we aren't allowed to resurrect static
+objects.
+
+== How does this cause a problem? ==
+
+The particular case that cropped up when we tried this was #15544.
+- A is a thunk
+- B is a static function
+- X is a CAF
+- suppose we GC when A is alive, and B is not otherwise reachable.
+- B is "collected", meaning that it doesn't make it onto the static
+ objects list during this GC, but nothing bad happens yet.
+- Next, suppose we enter A, and then call B. (remember that A refers to B)
+ At the entry point to B, we GC. This puts B on the stack, as part of the
+ RET_FUN stack frame that gets pushed when we GC at a function entry point.
+- This GC will now reach B
+- But because B was previously "collected", it breaks the assumption
+ that static objects are never resurrected. See Note [STATIC_LINK
+ fields] in rts/sm/Storage.h for why this is bad.
+- In practice, the GC thinks that B has already been visited, and so
+ doesn't visit X, and catastrophe ensues.
+
+== Isn't this caused by the RET_FUN business? ==
+
+Maybe, but could you prove that RET_FUN is the only way that
+resurrection can occur?
+
+So, no shortcutting.
+-}
+
+-- ---------------------------------------------------------------------
+-- Label types
+
+-- Labels that come from cafAnal can be:
+-- - _closure labels for static functions or CAFs
+-- - _info labels for dynamic functions, thunks, or continuations
+-- - _entry labels for functions or thunks
+--
+-- Meanwhile the labels on top-level blocks are _entry labels.
+--
+-- To put everything in the same namespace we convert all labels to
+-- closure labels using toClosureLbl. Note that some of these
+-- labels will not actually exist; that's ok because we're going to
+-- map them to SRTEntry later, which ranges over labels that do exist.
+--
+newtype CAFLabel = CAFLabel CLabel
+ deriving (Eq,Ord,Outputable)
+
+type CAFSet = Set CAFLabel
+type CAFEnv = LabelMap CAFSet
+
+mkCAFLabel :: CLabel -> CAFLabel
+mkCAFLabel lbl = CAFLabel (toClosureLbl lbl)
+
+-- This is a label that we can put in an SRT. It *must* be a closure label,
+-- pointing to either a FUN_STATIC, THUNK_STATIC, or CONSTR.
+newtype SRTEntry = SRTEntry CLabel
+ deriving (Eq, Ord, Outputable)
+
+-- ---------------------------------------------------------------------
+-- CAF analysis
+
+-- |
+-- For each code block:
+-- - collect the references reachable from this code block to FUN,
+-- THUNK or RET labels for which hasCAF == True
+--
+-- This gives us a `CAFEnv`: a mapping from code block to sets of labels
+--
+cafAnal
+ :: LabelSet -- The blocks representing continuations, ie. those
+ -- that will get RET info tables.
These labels will + -- get their own SRTs, so we don't aggregate CAFs from + -- references to these labels, we just use the label. + -> CLabel -- The top label of the proc + -> CmmGraph + -> CAFEnv +cafAnal contLbls topLbl cmmGraph = + analyzeCmmBwd cafLattice + (cafTransfers contLbls (g_entry cmmGraph) topLbl) cmmGraph mapEmpty + + +cafLattice :: DataflowLattice CAFSet +cafLattice = DataflowLattice Set.empty add + where + add (OldFact old) (NewFact new) = + let !new' = old `Set.union` new + in changedIf (Set.size new' > Set.size old) new' + + +cafTransfers :: LabelSet -> Label -> CLabel -> TransferFun CAFSet +cafTransfers contLbls entry topLbl + (BlockCC eNode middle xNode) fBase = + let joined = cafsInNode xNode $! live' + !result = foldNodesBwdOO cafsInNode middle joined + + facts = mapMaybe successorFact (successors xNode) + live' = joinFacts cafLattice facts + + successorFact s + -- If this is a loop back to the entry, we can refer to the + -- entry label. + | s == entry = Just (add topLbl Set.empty) + -- If this is a continuation, we want to refer to the + -- SRT for the continuation's info table + | s `setMember` contLbls + = Just (Set.singleton (mkCAFLabel (infoTblLbl s))) + -- Otherwise, takes the CAF references from the destination + | otherwise + = lookupFact s fBase + + cafsInNode :: CmmNode e x -> CAFSet -> CAFSet + cafsInNode node set = foldExpDeep addCaf node set + + addCaf expr !set = + case expr of + CmmLit (CmmLabel c) -> add c set + CmmLit (CmmLabelOff c _) -> add c set + CmmLit (CmmLabelDiffOff c1 c2 _ _) -> add c1 $! add c2 set + _ -> set + add l s | hasCAF l = Set.insert (mkCAFLabel l) s + | otherwise = s + + in mapSingleton (entryLabel eNode) result + + +-- ----------------------------------------------------------------------------- +-- ModuleSRTInfo + +data ModuleSRTInfo = ModuleSRTInfo + { thisModule :: Module + -- ^ Current module being compiled. Required for calling labelDynamic. + , dedupSRTs :: Map (Set SRTEntry) SRTEntry + -- ^ previous SRTs we've emitted, so we can de-duplicate. + -- Used to implement the [Common] optimisation. + , flatSRTs :: Map SRTEntry (Set SRTEntry) + -- ^ The reverse mapping, so that we can remove redundant + -- entries. e.g. if we have an SRT [a,b,c], and we know that b + -- points to [c,d], we can omit c and emit [a,b]. + -- Used to implement the [Filter] optimisation. + } +instance Outputable ModuleSRTInfo where + ppr ModuleSRTInfo{..} = + text "ModuleSRTInfo:" <+> ppr dedupSRTs <+> ppr flatSRTs + +emptySRT :: Module -> ModuleSRTInfo +emptySRT mod = + ModuleSRTInfo + { thisModule = mod + , dedupSRTs = Map.empty + , flatSRTs = Map.empty } + +-- ----------------------------------------------------------------------------- +-- Constructing SRTs + +{- Implementation notes + +- In each CmmDecl there is a mapping info_tbls from Label -> CmmInfoTable + +- The entry in info_tbls corresponding to g_entry is the closure info + table, the rest are continuations. + +- Each entry in info_tbls possibly needs an SRT. We need to make a + label for each of these. 
+
+- We get the CAFSet for each entry from the CAFEnv
+
+-}
+
+-- | Return a (Label,CLabel) pair for each labelled block of a CmmDecl,
+-- where the label is
+-- - the info label for a continuation or dynamic closure
+-- - the closure label for a top-level function (not a CAF)
+getLabelledBlocks :: CmmDecl -> [(Label, CAFLabel)]
+getLabelledBlocks (CmmData _ _) = []
+getLabelledBlocks (CmmProc top_info _ _ _) =
+ [ (blockId, mkCAFLabel (cit_lbl info))
+ | (blockId, info) <- mapToList (info_tbls top_info)
+ , let rep = cit_rep info
+ , not (isStaticRep rep) || not (isThunkRep rep)
+ ]
+
+
+-- | Put the labelled blocks that we will be annotating with SRTs into
+-- dependency order. This is so that we can process them one at a
+-- time, resolving references to earlier blocks to point to their
+-- SRTs. CAFs themselves are not included here; see getCAFs below.
+depAnalSRTs
+ :: CAFEnv
+ -> [CmmDecl]
+ -> [SCC (Label, CAFLabel, Set CAFLabel)]
+depAnalSRTs cafEnv decls =
+ srtTrace "depAnalSRTs" (ppr graph) graph
+ where
+ labelledBlocks = concatMap getLabelledBlocks decls
+ labelToBlock = Map.fromList (map swap labelledBlocks)
+ graph = stronglyConnCompFromEdgedVerticesOrd
+ [ let cafs' = Set.delete lbl cafs in
+ DigraphNode (l,lbl,cafs') l
+ (mapMaybe (flip Map.lookup labelToBlock) (Set.toList cafs'))
+ | (l, lbl) <- labelledBlocks
+ , Just cafs <- [mapLookup l cafEnv] ]
+
+
+-- | Get (Label, CAFLabel, Set CAFLabel) for each block that represents a CAF.
+-- These are treated differently from other labelled blocks:
+-- - we never shortcut a reference to a CAF to the contents of its
+-- SRT, since the point of SRTs is to keep CAFs alive.
+-- - CAFs therefore don't take part in the dependency analysis in depAnalSRTs;
+-- instead we generate their SRTs after everything else.
+getCAFs :: CAFEnv -> [CmmDecl] -> [(Label, CAFLabel, Set CAFLabel)]
+getCAFs cafEnv decls =
+ [ (g_entry g, mkCAFLabel topLbl, cafs)
+ | CmmProc top_info topLbl _ g <- decls
+ , Just info <- [mapLookup (g_entry g) (info_tbls top_info)]
+ , let rep = cit_rep info
+ , isStaticRep rep && isThunkRep rep
+ , Just cafs <- [mapLookup (g_entry g) cafEnv]
+ ]
+
+
+-- | Get the list of blocks that correspond to the entry points for
+-- FUN_STATIC closures. These are the blocks for which, if we have an
+-- SRT, we can merge it with the static closure. [FUN]
+getStaticFuns :: [CmmDecl] -> [(BlockId, CLabel)]
+getStaticFuns decls =
+ [ (g_entry g, lbl)
+ | CmmProc top_info _ _ g <- decls
+ , Just info <- [mapLookup (g_entry g) (info_tbls top_info)]
+ , Just (id, _) <- [cit_clo info]
+ , let rep = cit_rep info
+ , isStaticRep rep && isFunRep rep
+ , let lbl = mkLocalClosureLabel (idName id) (idCafInfo id)
+ ]
+
+
+-- | Maps labels from 'cafAnal' to the final CLabel that will appear
+-- in the SRT.
+-- - closures with singleton SRTs resolve to their single entry
+-- - closures with larger SRTs map to the label for that SRT
+-- - CAFs must not map to anything!
+-- - if a label maps to Nothing, we found that this label's SRT
+-- is empty, so we don't need to refer to it from other SRTs.
+type SRTMap = Map CAFLabel (Maybe SRTEntry)

+-- | resolve a CAFLabel to its SRTEntry using the SRTMap
+resolveCAF :: SRTMap -> CAFLabel -> Maybe SRTEntry
+resolveCAF srtMap lbl@(CAFLabel l) =
+ Map.findWithDefault (Just (SRTEntry (toClosureLbl l))) lbl srtMap
+
+
+-- | Attach SRTs to all info tables in the CmmDecls, and add SRT
+-- declarations to the ModuleSRTInfo.
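+--
+-- A sketch of the intended use (names are illustrative), threading the
+-- ModuleSRTInfo through successive groups so that the [Common] and
+-- [Filter] optimisations can see SRTs emitted earlier:
+--
+-- > (srtInfo', decls') <- doSRTs dflags (emptySRT this_mod) [(cafEnv, decls)]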
+-- +doSRTs + :: DynFlags + -> ModuleSRTInfo + -> [(CAFEnv, [CmmDecl])] + -> IO (ModuleSRTInfo, [CmmDecl]) + +doSRTs dflags moduleSRTInfo tops = do + us <- mkSplitUniqSupply 'u' + + -- Ignore the original grouping of decls, and combine all the + -- CAFEnvs into a single CAFEnv. + let (cafEnvs, declss) = unzip tops + cafEnv = mapUnions cafEnvs + decls = concat declss + staticFuns = mapFromList (getStaticFuns decls) + + -- Put the decls in dependency order. Why? So that we can implement + -- [Inline] and [Filter]. If we need to refer to an SRT that has + -- a single entry, we use the entry itself, which means that we + -- don't need to generate the singleton SRT in the first place. But + -- to do this we need to process blocks before things that depend on + -- them. + let + sccs = depAnalSRTs cafEnv decls + cafsWithSRTs = getCAFs cafEnv decls + + -- On each strongly-connected group of decls, construct the SRT + -- closures and the SRT fields for info tables. + let result :: + [ ( [CmmDecl] -- generated SRTs + , [(Label, CLabel)] -- SRT fields for info tables + , [(Label, [SRTEntry])] -- SRTs to attach to static functions + ) ] + ((result, _srtMap), moduleSRTInfo') = + initUs_ us $ + flip runStateT moduleSRTInfo $ + flip runStateT Map.empty $ do + nonCAFs <- mapM (doSCC dflags staticFuns) sccs + cAFs <- forM cafsWithSRTs $ \(l, cafLbl, cafs) -> + oneSRT dflags staticFuns [l] [cafLbl] True{-is a CAF-} cafs + return (nonCAFs ++ cAFs) + + (declss, pairs, funSRTs) = unzip3 result + + -- Next, update the info tables with the SRTs + let + srtFieldMap = mapFromList (concat pairs) + funSRTMap = mapFromList (concat funSRTs) + decls' = concatMap (updInfoSRTs dflags srtFieldMap funSRTMap) decls + + return (moduleSRTInfo', concat declss ++ decls') + + +-- | Build the SRT for a strongly-connected component of blocks +doSCC + :: DynFlags + -> LabelMap CLabel -- which blocks are static function entry points + -> SCC (Label, CAFLabel, Set CAFLabel) + -> StateT SRTMap + (StateT ModuleSRTInfo UniqSM) + ( [CmmDecl] -- generated SRTs + , [(Label, CLabel)] -- SRT fields for info tables + , [(Label, [SRTEntry])] -- SRTs to attach to static functions + ) + +doSCC dflags staticFuns (AcyclicSCC (l, cafLbl, cafs)) = + oneSRT dflags staticFuns [l] [cafLbl] False cafs + +doSCC dflags staticFuns (CyclicSCC nodes) = do + -- build a single SRT for the whole cycle, see Note [recursive SRTs] + let (blockids, lbls, cafsets) = unzip3 nodes + cafs = Set.unions cafsets + oneSRT dflags staticFuns blockids lbls False cafs + + +{- Note [recursive SRTs] + +If the dependency analyser has found us a recursive group of +declarations, then we build a single SRT for the whole group, on the +grounds that everything in the group is reachable from everything +else, so we lose nothing by having a single SRT. + +However, there are a couple of wrinkles to be aware of. + +* The Set CAFLabel for this SRT will contain labels in the group +itself. The SRTMap will therefore not contain entries for these labels +yet, so we can't turn them into SRTEntries using resolveCAF. BUT we +can just remove recursive references from the Set CAFLabel before +generating the SRT - the SRT will still contain all the CAFLabels that +we need to refer to from this group's SRT. + +* That is, EXCEPT for static function closures. For the same reason +described in Note [Invalid optimisation: shortcutting], we cannot omit +references to static function closures. 
+   - But, since we will merge the SRT with one of the static function
+     closures (see [FUN]), we can omit references to *that* static
+     function closure from the SRT.
+-}
+
+-- | Build an SRT for a set of blocks
+oneSRT
+  :: DynFlags
+  -> LabelMap CLabel -- which blocks are static function entry points
+  -> [Label]         -- blocks in this set
+  -> [CAFLabel]      -- labels for those blocks
+  -> Bool            -- True <=> this SRT is for a CAF
+  -> Set CAFLabel    -- SRT for this set
+  -> StateT SRTMap
+       (StateT ModuleSRTInfo UniqSM)
+       ( [CmmDecl]             -- SRT objects we built
+       , [(Label, CLabel)]     -- SRT fields for these blocks' itbls
+       , [(Label, [SRTEntry])] -- SRTs to attach to static functions
+       )
+
+oneSRT dflags staticFuns blockids lbls isCAF cafs = do
+  srtMap <- get
+  topSRT <- lift get
+  let
+    -- Can we merge this SRT with a FUN_STATIC closure?
+    (maybeFunClosure, otherFunLabels) =
+      case [ (l,b) | b <- blockids, Just l <- [mapLookup b staticFuns] ] of
+        [] -> (Nothing, [])
+        ((l,b):xs) -> (Just (l,b), map (mkCAFLabel . fst) xs)
+
+    -- Remove recursive references from the SRT, except for (all but
+    -- one of the) static functions. See Note [recursive SRTs].
+    nonRec = cafs `Set.difference`
+      (Set.fromList lbls `Set.difference` Set.fromList otherFunLabels)
+
+    -- First resolve all the CAFLabels to SRTEntries
+    -- Implements the [Inline] optimisation.
+    resolved = mapMaybe (resolveCAF srtMap) (Set.toList nonRec)
+
+    -- The set of all SRTEntries in SRTs that we refer to from here.
+    allBelow =
+      Set.unions [ lbls | caf <- resolved
+                 , Just lbls <- [Map.lookup caf (flatSRTs topSRT)] ]
+
+    -- Remove SRTEntries that are also in an SRT that we refer to.
+    -- Implements the [Filter] optimisation.
+    filtered = Set.difference (Set.fromList resolved) allBelow
+
+  srtTrace "oneSRT:"
+    (ppr cafs <+> ppr resolved <+> ppr allBelow <+> ppr filtered) $ return ()
+
+  let
+    isStaticFun = isJust maybeFunClosure
+
+    -- For a label without a closure (e.g. a continuation), we must
+    -- update the SRTMap for the label to point to a closure. It's
+    -- important that we don't do this for static functions or CAFs,
+    -- see Note [Invalid optimisation: shortcutting].
+    updateSRTMap srtEntry =
+      when (not isCAF && (not isStaticFun || isNothing srtEntry)) $ do
+        let newSRTMap = Map.fromList [(cafLbl, srtEntry) | cafLbl <- lbls]
+        put (Map.union newSRTMap srtMap)
+
+    this_mod = thisModule topSRT
+
+  case Set.toList filtered of
+    [] -> do
+      srtTrace "oneSRT: empty" (ppr lbls) $ return ()
+      updateSRTMap Nothing
+      return ([], [], [])
+
+    -- [Inline] - when we have only one entry there is no need to
+    -- build an SRT object at all, instead we put the singleton SRT
+    -- entry in the info table.
+    [one@(SRTEntry lbl)]
+      | -- Info tables refer to SRTs by offset (as noted in the section
+        -- "Referring to an SRT from the info table" of Note [SRTs]). However,
+        -- when dynamic linking is used we cannot guarantee that the offset
+        -- between the SRT and the info table will fit in the offset field.
+        -- Consequently we build a singleton SRT in this case.
+        not (labelDynamic dflags this_mod lbl)
+
+        -- MachO relocations can't express offsets between compilation units at
+        -- all, so we are always forced to build a singleton SRT in this case.
+        && (not (osMachOTarget $ platformOS $ targetPlatform dflags)
+           || isLocalCLabel this_mod lbl) -> do
+
+        -- If we have a static function closure, then it becomes the
+        -- SRT object, and everything else points to it.
(the only way + -- we could have multiple labels here is if this is a + -- recursive group, see Note [recursive SRTs]) + case maybeFunClosure of + Just (staticFunLbl,staticFunBlock) -> return ([], withLabels, []) + where + withLabels = + [ (b, if b == staticFunBlock then lbl else staticFunLbl) + | b <- blockids ] + Nothing -> do + updateSRTMap (Just one) + return ([], map (,lbl) blockids, []) + + cafList -> + -- Check whether an SRT with the same entries has been emitted already. + -- Implements the [Common] optimisation. + case Map.lookup filtered (dedupSRTs topSRT) of + Just srtEntry@(SRTEntry srtLbl) -> do + srtTrace "oneSRT [Common]" (ppr lbls <+> ppr srtLbl) $ return () + updateSRTMap (Just srtEntry) + return ([], map (,srtLbl) blockids, []) + Nothing -> do + -- No duplicates: we have to build a new SRT object + srtTrace "oneSRT: new" (ppr lbls <+> ppr filtered) $ return () + (decls, funSRTs, srtEntry) <- + case maybeFunClosure of + Just (fun,block) -> + return ( [], [(block, cafList)], SRTEntry fun ) + Nothing -> do + (decls, entry) <- lift . lift $ buildSRTChain dflags cafList + return (decls, [], entry) + updateSRTMap (Just srtEntry) + let allBelowThis = Set.union allBelow filtered + oldFlatSRTs = flatSRTs topSRT + newFlatSRTs = Map.insert srtEntry allBelowThis oldFlatSRTs + newDedupSRTs = Map.insert filtered srtEntry (dedupSRTs topSRT) + lift (put (topSRT { dedupSRTs = newDedupSRTs + , flatSRTs = newFlatSRTs })) + let SRTEntry lbl = srtEntry + return (decls, map (,lbl) blockids, funSRTs) + + +-- | build a static SRT object (or a chain of objects) from a list of +-- SRTEntries. +buildSRTChain + :: DynFlags + -> [SRTEntry] + -> UniqSM + ( [CmmDecl] -- The SRT object(s) + , SRTEntry -- label to use in the info table + ) +buildSRTChain _ [] = panic "buildSRT: empty" +buildSRTChain dflags cafSet = + case splitAt mAX_SRT_SIZE cafSet of + (these, []) -> do + (decl,lbl) <- buildSRT dflags these + return ([decl], lbl) + (these,those) -> do + (rest, rest_lbl) <- buildSRTChain dflags (head these : those) + (decl,lbl) <- buildSRT dflags (rest_lbl : tail these) + return (decl:rest, lbl) + where + mAX_SRT_SIZE = 16 + + +buildSRT :: DynFlags -> [SRTEntry] -> UniqSM (CmmDecl, SRTEntry) +buildSRT dflags refs = do + id <- getUniqueM + let + lbl = mkSRTLabel id + srt_n_info = mkSRTInfoLabel (length refs) + fields = + mkStaticClosure dflags srt_n_info dontCareCCS + [ CmmLabel lbl | SRTEntry lbl <- refs ] + [] -- no padding + [mkIntCLit dflags 0] -- link field + [] -- no saved info + return (mkDataLits (Section Data lbl) lbl fields, SRTEntry lbl) + + +-- | Update info tables with references to their SRTs. Also generate +-- static closures, splicing in SRT fields as necessary. +updInfoSRTs + :: DynFlags + -> LabelMap CLabel -- SRT labels for each block + -> LabelMap [SRTEntry] -- SRTs to merge into FUN_STATIC closures + -> CmmDecl + -> [CmmDecl] + +updInfoSRTs dflags srt_env funSRTEnv (CmmProc top_info top_l live g) + | Just (_,closure) <- maybeStaticClosure = [ proc, closure ] + | otherwise = [ proc ] + where + proc = CmmProc top_info { info_tbls = newTopInfo } top_l live g + newTopInfo = mapMapWithKey updInfoTbl (info_tbls top_info) + updInfoTbl l info_tbl + | l == g_entry g, Just (inf, _) <- maybeStaticClosure = inf + | otherwise = info_tbl { cit_srt = mapLookup l srt_env } + + -- Generate static closures [FUN]. Note that this also generates + -- static closures for thunks (CAFs), because it's easier to treat + -- them uniformly in the code generator. 
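+    --
+    -- A made-up example (an illustration only, not GHC output): a
+    -- FUN_STATIC closure f_closure whose SRT came out as two entries
+    -- [g_closure, h_closure] gets those entries added to its payload as
+    -- extra pointer fields, and its representation is widened to match:
+    --
+    --     HeapRep sta ptrs nptrs ty  -->  HeapRep sta (ptrs + 2) nptrs ty
+    --
+    -- so no separate SRT object is emitted for f; maybeStaticClosure
+    -- below implements this.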
+ maybeStaticClosure :: Maybe (CmmInfoTable, CmmDecl) + maybeStaticClosure + | Just info_tbl@CmmInfoTable{..} <- + mapLookup (g_entry g) (info_tbls top_info) + , Just (id, ccs) <- cit_clo + , isStaticRep cit_rep = + let + (newInfo, srtEntries) = case mapLookup (g_entry g) funSRTEnv of + Nothing -> + -- if we don't add SRT entries to this closure, then we + -- want to set the srt field in its info table as usual + (info_tbl { cit_srt = mapLookup (g_entry g) srt_env }, []) + Just srtEntries -> srtTrace "maybeStaticFun" (ppr res) + (info_tbl { cit_rep = new_rep }, res) + where res = [ CmmLabel lbl | SRTEntry lbl <- srtEntries ] + fields = mkStaticClosureFields dflags info_tbl ccs (idCafInfo id) + srtEntries + new_rep = case cit_rep of + HeapRep sta ptrs nptrs ty -> + HeapRep sta (ptrs + length srtEntries) nptrs ty + _other -> panic "maybeStaticFun" + lbl = mkLocalClosureLabel (idName id) (idCafInfo id) + in + Just (newInfo, mkDataLits (Section Data lbl) lbl fields) + | otherwise = Nothing + +updInfoSRTs _ _ _ t = [t] + + +srtTrace :: String -> SDoc -> b -> b +-- srtTrace = pprTrace +srtTrace _ _ b = b diff --git a/compiler/GHC/Cmm/LayoutStack.hs b/compiler/GHC/Cmm/LayoutStack.hs new file mode 100644 index 0000000000..f6dda7728c --- /dev/null +++ b/compiler/GHC/Cmm/LayoutStack.hs @@ -0,0 +1,1236 @@ +{-# LANGUAGE BangPatterns, RecordWildCards, GADTs #-} +module GHC.Cmm.LayoutStack ( + cmmLayoutStack, setInfoTableStackMap + ) where + +import GhcPrelude hiding ((<*>)) + +import GHC.StgToCmm.Utils ( callerSaveVolatileRegs, newTemp ) -- XXX layering violation +import GHC.StgToCmm.Foreign ( saveThreadState, loadThreadState ) -- XXX layering violation + +import BasicTypes +import GHC.Cmm +import GHC.Cmm.Info +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm.Utils +import GHC.Cmm.Graph +import ForeignCall +import GHC.Cmm.Liveness +import GHC.Cmm.ProcPoint +import GHC.Runtime.Layout +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import UniqSupply +import Maybes +import UniqFM +import Util + +import DynFlags +import FastString +import Outputable hiding ( isEmpty ) +import qualified Data.Set as Set +import Control.Monad.Fix +import Data.Array as Array +import Data.Bits +import Data.List (nub) + +{- Note [Stack Layout] + +The job of this pass is to + + - replace references to abstract stack Areas with fixed offsets from Sp. + + - replace the CmmHighStackMark constant used in the stack check with + the maximum stack usage of the proc. + + - save any variables that are live across a call, and reload them as + necessary. + +Before stack allocation, local variables remain live across native +calls (CmmCall{ cmm_cont = Just _ }), and after stack allocation local +variables are clobbered by native calls. + +We want to do stack allocation so that as far as possible + - stack use is minimized, and + - unnecessary stack saves and loads are avoided. + +The algorithm we use is a variant of linear-scan register allocation, +where the stack is our register file. + +We proceed in two passes, see Note [Two pass approach] for why they are not easy +to merge into one. + +Pass 1: + + - First, we do a liveness analysis, which annotates every block with + the variables live on entry to the block. + + - We traverse blocks in reverse postorder DFS; that is, we visit at + least one predecessor of a block before the block itself. 
The + stack layout flowing from the predecessor of the block will + determine the stack layout on entry to the block. + + - We maintain a data structure + + Map Label StackMap + + which describes the contents of the stack and the stack pointer on + entry to each block that is a successor of a block that we have + visited. + + - For each block we visit: + + - Look up the StackMap for this block. + + - If this block is a proc point (or a call continuation, if we aren't + splitting proc points), we need to reload all the live variables from the + stack - but this is done in Pass 2, which calculates more precise liveness + information (see description of Pass 2). + + - Walk forwards through the instructions: + - At an assignment x = Sp[loc] + - Record the fact that Sp[loc] contains x, so that we won't + need to save x if it ever needs to be spilled. + - At an assignment x = E + - If x was previously on the stack, it isn't any more + - At the last node, if it is a call or a jump to a proc point + - Lay out the stack frame for the call (see setupStackFrame) + - emit instructions to save all the live variables + - Remember the StackMaps for all the successors + - emit an instruction to adjust Sp + - If the last node is a branch, then the current StackMap is the + StackMap for the successors. + + - Manifest Sp: replace references to stack areas in this block + with real Sp offsets. We cannot do this until we have laid out + the stack area for the successors above. + + In this phase we also eliminate redundant stores to the stack; + see elimStackStores. + + - There is one important gotcha: sometimes we'll encounter a control + transfer to a block that we've already processed (a join point), + and in that case we might need to rearrange the stack to match + what the block is expecting. (exactly the same as in linear-scan + register allocation, except here we have the luxury of an infinite + supply of temporary variables). + + - Finally, we update the magic CmmHighStackMark constant with the + stack usage of the function, and eliminate the whole stack check + if there was no stack use. (in fact this is done as part of the + main traversal, by feeding the high-water-mark output back in as + an input. I hate cyclic programming, but it's just too convenient + sometimes.) + + There are plenty of tricky details: update frames, proc points, return + addresses, foreign calls, and some ad-hoc optimisations that are + convenient to do here and effective in common cases. Comments in the + code below explain these. + +Pass 2: + +- Calculate live registers, but taking into account that nothing is live at the + entry to a proc point. + +- At each proc point and call continuation insert reloads of live registers from + the stack (they were saved by Pass 1). + + +Note [Two pass approach] + +The main reason for Pass 2 is being able to insert only the reloads that are +needed and the fact that the two passes need different liveness information. +Let's consider an example: + + ..... + \ / + D <- proc point + / \ + E F + \ / + G <- proc point + | + X + +Pass 1 needs liveness assuming that local variables are preserved across calls. +This is important because it needs to save any local registers to the stack +(e.g., if register a is used in block X, it must be saved before any native +call). 
+However, for Pass 2, where we want to reload registers from stack (in a proc +point), this is overly conservative and would lead us to generate reloads in D +for things used in X, even though we're going to generate reloads in G anyway +(since it's also a proc point). +So Pass 2 calculates liveness knowing that nothing is live at the entry to a +proc point. This means that in D we only need to reload things used in E or F. +This can be quite important, for an extreme example see testcase for #3294. + +Merging the two passes is not trivial - Pass 2 is a backward rewrite and Pass 1 +is a forward one. Furthermore, Pass 1 is creating code that uses local registers +(saving them before a call), which the liveness analysis for Pass 2 must see to +be correct. + +-} + + +-- All stack locations are expressed as positive byte offsets from the +-- "base", which is defined to be the address above the return address +-- on the stack on entry to this CmmProc. +-- +-- Lower addresses have higher StackLocs. +-- +type StackLoc = ByteOff + +{- + A StackMap describes the stack at any given point. At a continuation + it has a particular layout, like this: + + | | <- base + |-------------| + | ret0 | <- base + 8 + |-------------| + . upd frame . <- base + sm_ret_off + |-------------| + | | + . vars . + . (live/dead) . + | | <- base + sm_sp - sm_args + |-------------| + | ret1 | + . ret vals . <- base + sm_sp (<--- Sp points here) + |-------------| + +Why do we include the final return address (ret0) in our stack map? I +have absolutely no idea, but it seems to be done that way consistently +in the rest of the code generator, so I played along here. --SDM + +Note that we will be constructing an info table for the continuation +(ret1), which needs to describe the stack down to, but not including, +the update frame (or ret0, if there is no update frame). +-} + +data StackMap = StackMap + { sm_sp :: StackLoc + -- ^ the offset of Sp relative to the base on entry + -- to this block. + , sm_args :: ByteOff + -- ^ the number of bytes of arguments in the area for this block + -- Defn: the offset of young(L) relative to the base is given by + -- (sm_sp - sm_args) of the StackMap for block L. + , sm_ret_off :: ByteOff + -- ^ Number of words of stack that we do not describe with an info + -- table, because it contains an update frame. + , sm_regs :: UniqFM (LocalReg,StackLoc) + -- ^ regs on the stack + } + +instance Outputable StackMap where + ppr StackMap{..} = + text "Sp = " <> int sm_sp $$ + text "sm_args = " <> int sm_args $$ + text "sm_ret_off = " <> int sm_ret_off $$ + text "sm_regs = " <> pprUFM sm_regs ppr + + +cmmLayoutStack :: DynFlags -> ProcPointSet -> ByteOff -> CmmGraph + -> UniqSM (CmmGraph, LabelMap StackMap) +cmmLayoutStack dflags procpoints entry_args + graph@(CmmGraph { g_entry = entry }) + = do + -- We need liveness info. Dead assignments are removed later + -- by the sinking pass. 
+ let liveness = cmmLocalLiveness dflags graph + blocks = revPostorder graph + + (final_stackmaps, _final_high_sp, new_blocks) <- + mfix $ \ ~(rec_stackmaps, rec_high_sp, _new_blocks) -> + layout dflags procpoints liveness entry entry_args + rec_stackmaps rec_high_sp blocks + + blocks_with_reloads <- + insertReloadsAsNeeded dflags procpoints final_stackmaps entry new_blocks + new_blocks' <- mapM (lowerSafeForeignCall dflags) blocks_with_reloads + return (ofBlockList entry new_blocks', final_stackmaps) + +-- ----------------------------------------------------------------------------- +-- Pass 1 +-- ----------------------------------------------------------------------------- + +layout :: DynFlags + -> LabelSet -- proc points + -> LabelMap CmmLocalLive -- liveness + -> BlockId -- entry + -> ByteOff -- stack args on entry + + -> LabelMap StackMap -- [final] stack maps + -> ByteOff -- [final] Sp high water mark + + -> [CmmBlock] -- [in] blocks + + -> UniqSM + ( LabelMap StackMap -- [out] stack maps + , ByteOff -- [out] Sp high water mark + , [CmmBlock] -- [out] new blocks + ) + +layout dflags procpoints liveness entry entry_args final_stackmaps final_sp_high blocks + = go blocks init_stackmap entry_args [] + where + (updfr, cont_info) = collectContInfo blocks + + init_stackmap = mapSingleton entry StackMap{ sm_sp = entry_args + , sm_args = entry_args + , sm_ret_off = updfr + , sm_regs = emptyUFM + } + + go [] acc_stackmaps acc_hwm acc_blocks + = return (acc_stackmaps, acc_hwm, acc_blocks) + + go (b0 : bs) acc_stackmaps acc_hwm acc_blocks + = do + let (entry0@(CmmEntry entry_lbl tscope), middle0, last0) = blockSplit b0 + + let stack0@StackMap { sm_sp = sp0 } + = mapFindWithDefault + (pprPanic "no stack map for" (ppr entry_lbl)) + entry_lbl acc_stackmaps + + -- (a) Update the stack map to include the effects of + -- assignments in this block + let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0 + + -- (b) Look at the last node and if we are making a call or + -- jumping to a proc point, we must save the live + -- variables, adjust Sp, and construct the StackMaps for + -- each of the successor blocks. See handleLastNode for + -- details. + (middle1, sp_off, last1, fixup_blocks, out) + <- handleLastNode dflags procpoints liveness cont_info + acc_stackmaps stack1 tscope middle0 last0 + + -- (c) Manifest Sp: run over the nodes in the block and replace + -- CmmStackSlot with CmmLoad from Sp with a concrete offset. + -- + -- our block: + -- middle0 -- the original middle nodes + -- middle1 -- live variable saves from handleLastNode + -- Sp = Sp + sp_off -- Sp adjustment goes here + -- last1 -- the last node + -- + let middle_pre = blockToList $ foldl' blockSnoc middle0 middle1 + + let final_blocks = + manifestSp dflags final_stackmaps stack0 sp0 final_sp_high + entry0 middle_pre sp_off last1 fixup_blocks + + let acc_stackmaps' = mapUnion acc_stackmaps out + + -- If this block jumps to the GC, then we do not take its + -- stack usage into account for the high-water mark. + -- Otherwise, if the only stack usage is in the stack-check + -- failure block itself, we will do a redundant stack + -- check. The stack has a buffer designed to accommodate + -- the largest amount of stack needed for calling the GC. 
+ -- + this_sp_hwm | isGcJump last0 = 0 + | otherwise = sp0 - sp_off + + hwm' = maximum (acc_hwm : this_sp_hwm : map sm_sp (mapElems out)) + + go bs acc_stackmaps' hwm' (final_blocks ++ acc_blocks) + + +-- ----------------------------------------------------------------------------- + +-- Not foolproof, but GCFun is the culprit we most want to catch +isGcJump :: CmmNode O C -> Bool +isGcJump (CmmCall { cml_target = CmmReg (CmmGlobal l) }) + = l == GCFun || l == GCEnter1 +isGcJump _something_else = False + +-- ----------------------------------------------------------------------------- + +-- This doesn't seem right somehow. We need to find out whether this +-- proc will push some update frame material at some point, so that we +-- can avoid using that area of the stack for spilling. The +-- updfr_space field of the CmmProc *should* tell us, but it doesn't +-- (I think maybe it gets filled in later when we do proc-point +-- splitting). +-- +-- So we'll just take the max of all the cml_ret_offs. This could be +-- unnecessarily pessimistic, but probably not in the code we +-- generate. + +collectContInfo :: [CmmBlock] -> (ByteOff, LabelMap ByteOff) +collectContInfo blocks + = (maximum ret_offs, mapFromList (catMaybes mb_argss)) + where + (mb_argss, ret_offs) = mapAndUnzip get_cont blocks + + get_cont :: Block CmmNode x C -> (Maybe (Label, ByteOff), ByteOff) + get_cont b = + case lastNode b of + CmmCall { cml_cont = Just l, .. } + -> (Just (l, cml_ret_args), cml_ret_off) + CmmForeignCall { .. } + -> (Just (succ, ret_args), ret_off) + _other -> (Nothing, 0) + + +-- ----------------------------------------------------------------------------- +-- Updating the StackMap from middle nodes + +-- Look for loads from stack slots, and update the StackMap. This is +-- purely for optimisation reasons, so that we can avoid saving a +-- variable back to a different stack slot if it is already on the +-- stack. +-- +-- This happens a lot: for example when function arguments are passed +-- on the stack and need to be immediately saved across a call, we +-- want to just leave them where they are on the stack. +-- +procMiddle :: LabelMap StackMap -> CmmNode e x -> StackMap -> StackMap +procMiddle stackmaps node sm + = case node of + CmmAssign (CmmLocal r) (CmmLoad (CmmStackSlot area off) _) + -> sm { sm_regs = addToUFM (sm_regs sm) r (r,loc) } + where loc = getStackLoc area off stackmaps + CmmAssign (CmmLocal r) _other + -> sm { sm_regs = delFromUFM (sm_regs sm) r } + _other + -> sm + +getStackLoc :: Area -> ByteOff -> LabelMap StackMap -> StackLoc +getStackLoc Old n _ = n +getStackLoc (Young l) n stackmaps = + case mapLookup l stackmaps of + Nothing -> pprPanic "getStackLoc" (ppr l) + Just sm -> sm_sp sm - sm_args sm + n + + +-- ----------------------------------------------------------------------------- +-- Handling stack allocation for a last node + +-- We take a single last node and turn it into: +-- +-- C1 (some statements) +-- Sp = Sp + N +-- C2 (some more statements) +-- call f() -- the actual last node +-- +-- plus possibly some more blocks (we may have to add some fixup code +-- between the last node and the continuation). +-- +-- C1: is the code for saving the variables across this last node onto +-- the stack, if the continuation is a call or jumps to a proc point. +-- +-- C2: if the last node is a safe foreign call, we have to inject some +-- extra code that goes *after* the Sp adjustment. 
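+
+-- To make the Sp arithmetic concrete, here is a small worked example with
+-- made-up numbers (an illustration, not compiler output). Suppose Sp is
+-- currently at sp0 = 40, and the continuation's StackMap cont has
+-- sm_sp = 24 and sm_args = 8, with 8 bytes of arguments pushed for the
+-- call. Then, per spOffsetForCall below,
+--
+--     spOffsetForCall 40 cont 8  =  40 - (24 - 8 + 8)  =  16
+--
+-- so C1 is followed by "Sp = Sp + 16" before the call.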
+ +handleLastNode + :: DynFlags -> ProcPointSet -> LabelMap CmmLocalLive -> LabelMap ByteOff + -> LabelMap StackMap -> StackMap -> CmmTickScope + -> Block CmmNode O O + -> CmmNode O C + -> UniqSM + ( [CmmNode O O] -- nodes to go *before* the Sp adjustment + , ByteOff -- amount to adjust Sp + , CmmNode O C -- new last node + , [CmmBlock] -- new blocks + , LabelMap StackMap -- stackmaps for the continuations + ) + +handleLastNode dflags procpoints liveness cont_info stackmaps + stack0@StackMap { sm_sp = sp0 } tscp middle last + = case last of + -- At each return / tail call, + -- adjust Sp to point to the last argument pushed, which + -- is cml_args, after popping any other junk from the stack. + CmmCall{ cml_cont = Nothing, .. } -> do + let sp_off = sp0 - cml_args + return ([], sp_off, last, [], mapEmpty) + + -- At each CmmCall with a continuation: + CmmCall{ cml_cont = Just cont_lbl, .. } -> + return $ lastCall cont_lbl cml_args cml_ret_args cml_ret_off + + CmmForeignCall{ succ = cont_lbl, .. } -> do + return $ lastCall cont_lbl (wORD_SIZE dflags) ret_args ret_off + -- one word of args: the return address + + CmmBranch {} -> handleBranches + CmmCondBranch {} -> handleBranches + CmmSwitch {} -> handleBranches + + where + -- Calls and ForeignCalls are handled the same way: + lastCall :: BlockId -> ByteOff -> ByteOff -> ByteOff + -> ( [CmmNode O O] + , ByteOff + , CmmNode O C + , [CmmBlock] + , LabelMap StackMap + ) + lastCall lbl cml_args cml_ret_args cml_ret_off + = ( assignments + , spOffsetForCall sp0 cont_stack cml_args + , last + , [] -- no new blocks + , mapSingleton lbl cont_stack ) + where + (assignments, cont_stack) = prepareStack lbl cml_ret_args cml_ret_off + + + prepareStack lbl cml_ret_args cml_ret_off + | Just cont_stack <- mapLookup lbl stackmaps + -- If we have already seen this continuation before, then + -- we just have to make the stack look the same: + = (fixupStack stack0 cont_stack, cont_stack) + -- Otherwise, we have to allocate the stack frame + | otherwise + = (save_assignments, new_cont_stack) + where + (new_cont_stack, save_assignments) + = setupStackFrame dflags lbl liveness cml_ret_off cml_ret_args stack0 + + + -- For other last nodes (branches), if any of the targets is a + -- proc point, we have to set up the stack to match what the proc + -- point is expecting. + -- + handleBranches :: UniqSM ( [CmmNode O O] + , ByteOff + , CmmNode O C + , [CmmBlock] + , LabelMap StackMap ) + + handleBranches + -- Note [diamond proc point] + | Just l <- futureContinuation middle + , (nub $ filter (`setMember` procpoints) $ successors last) == [l] + = do + let cont_args = mapFindWithDefault 0 l cont_info + (assigs, cont_stack) = prepareStack l cont_args (sm_ret_off stack0) + out = mapFromList [ (l', cont_stack) + | l' <- successors last ] + return ( assigs + , spOffsetForCall sp0 cont_stack (wORD_SIZE dflags) + , last + , [] + , out) + + | otherwise = do + pps <- mapM handleBranch (successors last) + let lbl_map :: LabelMap Label + lbl_map = mapFromList [ (l,tmp) | (l,tmp,_,_) <- pps ] + fix_lbl l = mapFindWithDefault l l lbl_map + return ( [] + , 0 + , mapSuccessors fix_lbl last + , concat [ blk | (_,_,_,blk) <- pps ] + , mapFromList [ (l, sm) | (l,_,sm,_) <- pps ] ) + + -- For each successor of this block + handleBranch :: BlockId -> UniqSM (BlockId, BlockId, StackMap, [CmmBlock]) + handleBranch l + -- (a) if the successor already has a stackmap, we need to + -- shuffle the current stack to make it look the same. + -- We have to insert a new block to make this happen. 
+ | Just stack2 <- mapLookup l stackmaps + = do + let assigs = fixupStack stack0 stack2 + (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs + return (l, tmp_lbl, stack2, block) + + -- (b) if the successor is a proc point, save everything + -- on the stack. + | l `setMember` procpoints + = do + let cont_args = mapFindWithDefault 0 l cont_info + (stack2, assigs) = + setupStackFrame dflags l liveness (sm_ret_off stack0) + cont_args stack0 + (tmp_lbl, block) <- makeFixupBlock dflags sp0 l stack2 tscp assigs + return (l, tmp_lbl, stack2, block) + + -- (c) otherwise, the current StackMap is the StackMap for + -- the continuation. But we must remember to remove any + -- variables from the StackMap that are *not* live at + -- the destination, because this StackMap might be used + -- by fixupStack if this is a join point. + | otherwise = return (l, l, stack1, []) + where live = mapFindWithDefault (panic "handleBranch") l liveness + stack1 = stack0 { sm_regs = filterUFM is_live (sm_regs stack0) } + is_live (r,_) = r `elemRegSet` live + + +makeFixupBlock :: DynFlags -> ByteOff -> Label -> StackMap + -> CmmTickScope -> [CmmNode O O] + -> UniqSM (Label, [CmmBlock]) +makeFixupBlock dflags sp0 l stack tscope assigs + | null assigs && sp0 == sm_sp stack = return (l, []) + | otherwise = do + tmp_lbl <- newBlockId + let sp_off = sp0 - sm_sp stack + block = blockJoin (CmmEntry tmp_lbl tscope) + ( maybeAddSpAdj dflags sp0 sp_off + $ blockFromList assigs ) + (CmmBranch l) + return (tmp_lbl, [block]) + + +-- Sp is currently pointing to current_sp, +-- we want it to point to +-- (sm_sp cont_stack - sm_args cont_stack + args) +-- so the difference is +-- sp0 - (sm_sp cont_stack - sm_args cont_stack + args) +spOffsetForCall :: ByteOff -> StackMap -> ByteOff -> ByteOff +spOffsetForCall current_sp cont_stack args + = current_sp - (sm_sp cont_stack - sm_args cont_stack + args) + + +-- | create a sequence of assignments to establish the new StackMap, +-- given the old StackMap. +fixupStack :: StackMap -> StackMap -> [CmmNode O O] +fixupStack old_stack new_stack = concatMap move new_locs + where + old_map = sm_regs old_stack + new_locs = stackSlotRegs new_stack + + move (r,n) + | Just (_,m) <- lookupUFM old_map r, n == m = [] + | otherwise = [CmmStore (CmmStackSlot Old n) + (CmmReg (CmmLocal r))] + + + +setupStackFrame + :: DynFlags + -> BlockId -- label of continuation + -> LabelMap CmmLocalLive -- liveness + -> ByteOff -- updfr + -> ByteOff -- bytes of return values on stack + -> StackMap -- current StackMap + -> (StackMap, [CmmNode O O]) + +setupStackFrame dflags lbl liveness updfr_off ret_args stack0 + = (cont_stack, assignments) + where + -- get the set of LocalRegs live in the continuation + live = mapFindWithDefault Set.empty lbl liveness + + -- the stack from the base to updfr_off is off-limits. + -- our new stack frame contains: + -- * saved live variables + -- * the return address [young(C) + 8] + -- * the args for the call, + -- which are replaced by the return values at the return + -- point. 
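+
+    -- A made-up numeric example (illustration only): with updfr_off = 8
+    -- and 16 bytes of live variables to save, allocate below might
+    -- produce stack1 with sm_sp = 24; with ret_args = 8 the continuation
+    -- then gets cont_stack with sm_sp = 32, sm_args = 8, sm_ret_off = 8.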
+ + -- everything up to updfr_off is off-limits + -- stack1 contains updfr_off, plus everything we need to save + (stack1, assignments) = allocate dflags updfr_off live stack0 + + -- And the Sp at the continuation is: + -- sm_sp stack1 + ret_args + cont_stack = stack1{ sm_sp = sm_sp stack1 + ret_args + , sm_args = ret_args + , sm_ret_off = updfr_off + } + + +-- ----------------------------------------------------------------------------- +-- Note [diamond proc point] +-- +-- This special case looks for the pattern we get from a typical +-- tagged case expression: +-- +-- Sp[young(L1)] = L1 +-- if (R1 & 7) != 0 goto L1 else goto L2 +-- L2: +-- call [R1] returns to L1 +-- L1: live: {y} +-- x = R1 +-- +-- If we let the generic case handle this, we get +-- +-- Sp[-16] = L1 +-- if (R1 & 7) != 0 goto L1a else goto L2 +-- L2: +-- Sp[-8] = y +-- Sp = Sp - 16 +-- call [R1] returns to L1 +-- L1a: +-- Sp[-8] = y +-- Sp = Sp - 16 +-- goto L1 +-- L1: +-- x = R1 +-- +-- The code for saving the live vars is duplicated in each branch, and +-- furthermore there is an extra jump in the fast path (assuming L1 is +-- a proc point, which it probably is if there is a heap check). +-- +-- So to fix this we want to set up the stack frame before the +-- conditional jump. How do we know when to do this, and when it is +-- safe? The basic idea is, when we see the assignment +-- +-- Sp[young(L)] = L +-- +-- we know that +-- * we are definitely heading for L +-- * there can be no more reads from another stack area, because young(L) +-- overlaps with it. +-- +-- We don't necessarily know that everything live at L is live now +-- (some might be assigned between here and the jump to L). So we +-- simplify and only do the optimisation when we see +-- +-- (1) a block containing an assignment of a return address L +-- (2) ending in a branch where one (and only) continuation goes to L, +-- and no other continuations go to proc points. +-- +-- then we allocate the stack frame for L at the end of the block, +-- before the branch. +-- +-- We could generalise (2), but that would make it a bit more +-- complicated to handle, and this currently catches the common case. + +futureContinuation :: Block CmmNode O O -> Maybe BlockId +futureContinuation middle = foldBlockNodesB f middle Nothing + where f :: CmmNode a b -> Maybe BlockId -> Maybe BlockId + f (CmmStore (CmmStackSlot (Young l) _) (CmmLit (CmmBlock _))) _ + = Just l + f _ r = r + +-- ----------------------------------------------------------------------------- +-- Saving live registers + +-- | Given a set of live registers and a StackMap, save all the registers +-- on the stack and return the new StackMap and the assignments to do +-- the saving. +-- +allocate :: DynFlags -> ByteOff -> LocalRegSet -> StackMap + -> (StackMap, [CmmNode O O]) +allocate dflags ret_off live stackmap@StackMap{ sm_sp = sp0 + , sm_regs = regs0 } + = + -- we only have to save regs that are not already in a slot + let to_save = filter (not . (`elemUFM` regs0)) (Set.elems live) + regs1 = filterUFM (\(r,_) -> elemRegSet r live) regs0 + in + + -- make a map of the stack + let stack = reverse $ Array.elems $ + accumArray (\_ x -> x) Empty (1, toWords dflags (max sp0 ret_off)) $ + ret_words ++ live_words + where ret_words = + [ (x, Occupied) + | x <- [ 1 .. toWords dflags ret_off] ] + live_words = + [ (toWords dflags x, Occupied) + | (r,off) <- nonDetEltsUFM regs1, + -- See Note [Unique Determinism and code generation] + let w = localRegBytes dflags r, + x <- [ off, off - wORD_SIZE dflags .. 
off - w + 1] ] + in + + -- Pass over the stack: find slots to save all the new live variables, + -- choosing the oldest slots first (hence a foldr). + let + save slot ([], stack, n, assigs, regs) -- no more regs to save + = ([], slot:stack, plusW dflags n 1, assigs, regs) + save slot (to_save, stack, n, assigs, regs) + = case slot of + Occupied -> (to_save, Occupied:stack, plusW dflags n 1, assigs, regs) + Empty + | Just (stack', r, to_save') <- + select_save to_save (slot:stack) + -> let assig = CmmStore (CmmStackSlot Old n') + (CmmReg (CmmLocal r)) + n' = plusW dflags n 1 + in + (to_save', stack', n', assig : assigs, (r,(r,n')):regs) + + | otherwise + -> (to_save, slot:stack, plusW dflags n 1, assigs, regs) + + -- we should do better here: right now we'll fit the smallest first, + -- but it would make more sense to fit the biggest first. + select_save :: [LocalReg] -> [StackSlot] + -> Maybe ([StackSlot], LocalReg, [LocalReg]) + select_save regs stack = go regs [] + where go [] _no_fit = Nothing + go (r:rs) no_fit + | Just rest <- dropEmpty words stack + = Just (replicate words Occupied ++ rest, r, rs++no_fit) + | otherwise + = go rs (r:no_fit) + where words = localRegWords dflags r + + -- fill in empty slots as much as possible + (still_to_save, save_stack, n, save_assigs, save_regs) + = foldr save (to_save, [], 0, [], []) stack + + -- push any remaining live vars on the stack + (push_sp, push_assigs, push_regs) + = foldr push (n, [], []) still_to_save + where + push r (n, assigs, regs) + = (n', assig : assigs, (r,(r,n')) : regs) + where + n' = n + localRegBytes dflags r + assig = CmmStore (CmmStackSlot Old n') + (CmmReg (CmmLocal r)) + + trim_sp + | not (null push_regs) = push_sp + | otherwise + = plusW dflags n (- length (takeWhile isEmpty save_stack)) + + final_regs = regs1 `addListToUFM` push_regs + `addListToUFM` save_regs + + in + -- XXX should be an assert + if ( n /= max sp0 ret_off ) then pprPanic "allocate" (ppr n <+> ppr sp0 <+> ppr ret_off) else + + if (trim_sp .&. (wORD_SIZE dflags - 1)) /= 0 then pprPanic "allocate2" (ppr trim_sp <+> ppr final_regs <+> ppr push_sp) else + + ( stackmap { sm_regs = final_regs , sm_sp = trim_sp } + , push_assigs ++ save_assigs ) + + +-- ----------------------------------------------------------------------------- +-- Manifesting Sp + +-- | Manifest Sp: turn all the CmmStackSlots into CmmLoads from Sp. The +-- block looks like this: +-- +-- middle_pre -- the middle nodes +-- Sp = Sp + sp_off -- Sp adjustment goes here +-- last -- the last node +-- +-- And we have some extra blocks too (that don't contain Sp adjustments) +-- +-- The adjustment for middle_pre will be different from that for +-- middle_post, because the Sp adjustment intervenes. +-- +manifestSp + :: DynFlags + -> LabelMap StackMap -- StackMaps for other blocks + -> StackMap -- StackMap for this block + -> ByteOff -- Sp on entry to the block + -> ByteOff -- SpHigh + -> CmmNode C O -- first node + -> [CmmNode O O] -- middle + -> ByteOff -- sp_off + -> CmmNode O C -- last node + -> [CmmBlock] -- new blocks + -> [CmmBlock] -- final blocks with Sp manifest + +manifestSp dflags stackmaps stack0 sp0 sp_high + first middle_pre sp_off last fixup_blocks + = final_block : fixup_blocks' + where + area_off = getAreaOff stackmaps + + adj_pre_sp, adj_post_sp :: CmmNode e x -> CmmNode e x + adj_pre_sp = mapExpDeep (areaToSp dflags sp0 sp_high area_off) + adj_post_sp = mapExpDeep (areaToSp dflags (sp0 - sp_off) sp_high area_off) + + final_middle = maybeAddSpAdj dflags sp0 sp_off + . 
blockFromList
+                 . map adj_pre_sp
+                 . elimStackStores stack0 stackmaps area_off
+                 $ middle_pre
+    final_last = optStackCheck (adj_post_sp last)
+
+    final_block = blockJoin first final_middle final_last
+
+    fixup_blocks' = map (mapBlock3' (id, adj_post_sp, id)) fixup_blocks
+
+getAreaOff :: LabelMap StackMap -> (Area -> StackLoc)
+getAreaOff _ Old = 0
+getAreaOff stackmaps (Young l) =
+  case mapLookup l stackmaps of
+    Just sm -> sm_sp sm - sm_args sm
+    Nothing -> pprPanic "getAreaOff" (ppr l)
+
+
+maybeAddSpAdj
+  :: DynFlags -> ByteOff -> ByteOff -> Block CmmNode O O -> Block CmmNode O O
+maybeAddSpAdj dflags sp0 sp_off block =
+  add_initial_unwind $ add_adj_unwind $ adj block
+  where
+    adj block
+      | sp_off /= 0
+      = block `blockSnoc` CmmAssign spReg (cmmOffset dflags spExpr sp_off)
+      | otherwise = block
+    -- Add unwind pseudo-instruction at the beginning of each block to
+    -- document Sp level for debugging
+    add_initial_unwind block
+      | debugLevel dflags > 0
+      = CmmUnwind [(Sp, Just sp_unwind)] `blockCons` block
+      | otherwise
+      = block
+      where sp_unwind = CmmRegOff spReg (sp0 - wORD_SIZE dflags)
+
+    -- Add unwind pseudo-instruction right after the Sp adjustment
+    -- if there is one.
+    add_adj_unwind block
+      | debugLevel dflags > 0
+      , sp_off /= 0
+      = block `blockSnoc` CmmUnwind [(Sp, Just sp_unwind)]
+      | otherwise
+      = block
+      where sp_unwind = CmmRegOff spReg (sp0 - wORD_SIZE dflags - sp_off)
+
+{- Note [SP old/young offsets]
+
+Sp(L) is the Sp offset on entry to block L relative to the base of the
+OLD area.
+
+SpArgs(L) is the size of the young area for L, i.e. the number of
+arguments.
+
+ - in block L, each reference to [old + N] turns into
+   [Sp + Sp(L) - N]
+
+ - in block L, each reference to [young(L') + N] turns into
+   [Sp + Sp(L) - Sp(L') + SpArgs(L') - N]
+
+ - be careful with the last node of each block: Sp has already been adjusted
+   to be Sp + Sp(L) - Sp(L')
+-}
+
+areaToSp :: DynFlags -> ByteOff -> ByteOff -> (Area -> StackLoc) -> CmmExpr -> CmmExpr
+
+areaToSp dflags sp_old _sp_hwm area_off (CmmStackSlot area n)
+  = cmmOffset dflags spExpr (sp_old - area_off area - n)
+    -- Replace (CmmStackSlot area n) with an offset from Sp
+
+areaToSp dflags _ sp_hwm _ (CmmLit CmmHighStackMark)
+  = mkIntExpr dflags sp_hwm
+    -- Replace CmmHighStackMark with the number of bytes of stack used,
+    -- the sp_hwm. See Note [Stack usage] in GHC.StgToCmm.Heap
+
+areaToSp dflags _ _ _ (CmmMachOp (MO_U_Lt _) args)
+  | falseStackCheck args
+  = zeroExpr dflags
+areaToSp dflags _ _ _ (CmmMachOp (MO_U_Ge _) args)
+  | falseStackCheck args
+  = mkIntExpr dflags 1
+    -- Replace a stack-overflow test that cannot fail with a no-op
+    -- See Note [Always false stack check]
+
+areaToSp _ _ _ _ other = other
+
+-- | Determine whether a stack check cannot fail.
+falseStackCheck :: [CmmExpr] -> Bool
+falseStackCheck [ CmmMachOp (MO_Sub _)
+                    [ CmmRegOff (CmmGlobal Sp) x_off
+                    , CmmLit (CmmInt y_lit _)]
+                , CmmReg (CmmGlobal SpLim)]
+  = fromIntegral x_off >= y_lit
+falseStackCheck _ = False
+
+-- Note [Always false stack check]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-- We can optimise stack checks of the form
+--
+--   if ((Sp + x) - y < SpLim) then .. else ..
+--
+-- where x and y are non-negative integer byte offsets. Since we know
+-- that SpLim <= Sp (remember the stack grows downwards), this test must
+-- yield False if (x >= y), so we can rewrite the comparison to False.
+-- A subsequent sinking pass will later drop the dead code.
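+--
+-- A concrete instance with made-up offsets: for
+--
+--   if ((Sp + 64) - 16 < SpLim) then .. else ..
+--
+-- we have x = 64 >= y = 16, so (Sp + 64) - 16 = Sp + 48 >= Sp >= SpLim,
+-- and the condition can never be true.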
+-- Optimising this away depends on knowing that SpLim <= Sp, so it is +-- really the job of the stack layout algorithm, hence we do it now. +-- +-- The control flow optimiser may negate a conditional to increase +-- the likelihood of a fallthrough if the branch is not taken. But +-- not every conditional is inverted as the control flow optimiser +-- places some requirements on the predecessors of both branch targets. +-- So we better look for the inverted comparison too. + +optStackCheck :: CmmNode O C -> CmmNode O C +optStackCheck n = -- Note [Always false stack check] + case n of + CmmCondBranch (CmmLit (CmmInt 0 _)) _true false _ -> CmmBranch false + CmmCondBranch (CmmLit (CmmInt _ _)) true _false _ -> CmmBranch true + other -> other + + +-- ----------------------------------------------------------------------------- + +-- | Eliminate stores of the form +-- +-- Sp[area+n] = r +-- +-- when we know that r is already in the same slot as Sp[area+n]. We +-- could do this in a later optimisation pass, but that would involve +-- a separate analysis and we already have the information to hand +-- here. It helps clean up some extra stack stores in common cases. +-- +-- Note that we may have to modify the StackMap as we walk through the +-- code using procMiddle, since an assignment to a variable in the +-- StackMap will invalidate its mapping there. +-- +elimStackStores :: StackMap + -> LabelMap StackMap + -> (Area -> ByteOff) + -> [CmmNode O O] + -> [CmmNode O O] +elimStackStores stackmap stackmaps area_off nodes + = go stackmap nodes + where + go _stackmap [] = [] + go stackmap (n:ns) + = case n of + CmmStore (CmmStackSlot area m) (CmmReg (CmmLocal r)) + | Just (_,off) <- lookupUFM (sm_regs stackmap) r + , area_off area + m == off + -> go stackmap ns + _otherwise + -> n : go (procMiddle stackmaps n stackmap) ns + + +-- ----------------------------------------------------------------------------- +-- Update info tables to include stack liveness + + +setInfoTableStackMap :: DynFlags -> LabelMap StackMap -> CmmDecl -> CmmDecl +setInfoTableStackMap dflags stackmaps (CmmProc top_info@TopInfo{..} l v g) + = CmmProc top_info{ info_tbls = mapMapWithKey fix_info info_tbls } l v g + where + fix_info lbl info_tbl@CmmInfoTable{ cit_rep = StackRep _ } = + info_tbl { cit_rep = StackRep (get_liveness lbl) } + fix_info _ other = other + + get_liveness :: BlockId -> Liveness + get_liveness lbl + = case mapLookup lbl stackmaps of + Nothing -> pprPanic "setInfoTableStackMap" (ppr lbl <+> ppr info_tbls) + Just sm -> stackMapToLiveness dflags sm + +setInfoTableStackMap _ _ d = d + + +stackMapToLiveness :: DynFlags -> StackMap -> Liveness +stackMapToLiveness dflags StackMap{..} = + reverse $ Array.elems $ + accumArray (\_ x -> x) True (toWords dflags sm_ret_off + 1, + toWords dflags (sm_sp - sm_args)) live_words + where + live_words = [ (toWords dflags off, False) + | (r,off) <- nonDetEltsUFM sm_regs + , isGcPtrType (localRegType r) ] + -- See Note [Unique Determinism and code generation] + +-- ----------------------------------------------------------------------------- +-- Pass 2 +-- ----------------------------------------------------------------------------- + +insertReloadsAsNeeded + :: DynFlags + -> ProcPointSet + -> LabelMap StackMap + -> BlockId + -> [CmmBlock] + -> UniqSM [CmmBlock] +insertReloadsAsNeeded dflags procpoints final_stackmaps entry blocks = do + toBlockList . 
fst <$>
+      rewriteCmmBwd liveLattice rewriteCC (ofBlockList entry blocks) mapEmpty
+  where
+    rewriteCC :: RewriteFun CmmLocalLive
+    rewriteCC (BlockCC e_node middle0 x_node) fact_base0 = do
+        let entry_label = entryLabel e_node
+            stackmap = case mapLookup entry_label final_stackmaps of
+                Just sm -> sm
+                Nothing -> panic "insertReloadsAsNeeded: rewriteCC: stackmap"
+
+            -- Merge the liveness from successor blocks and analyse the last
+            -- node.
+            joined = gen_kill dflags x_node $!
+                         joinOutFacts liveLattice x_node fact_base0
+            -- What is live at the start of middle0.
+            live_at_middle0 = foldNodesBwdOO (gen_kill dflags) middle0 joined
+
+            -- If this is a procpoint we need to add the reloads, but only if
+            -- they're actually live. Furthermore, nothing is live at the entry
+            -- to a proc point.
+            (middle1, live_with_reloads)
+                | entry_label `setMember` procpoints
+                = let reloads = insertReloads dflags stackmap live_at_middle0
+                  in (foldr blockCons middle0 reloads, emptyRegSet)
+                | otherwise
+                = (middle0, live_at_middle0)
+
+            -- Final liveness for this block.
+            !fact_base2 = mapSingleton entry_label live_with_reloads
+
+        return (BlockCC e_node middle1 x_node, fact_base2)
+
+insertReloads :: DynFlags -> StackMap -> CmmLocalLive -> [CmmNode O O]
+insertReloads dflags stackmap live =
+    [ CmmAssign (CmmLocal reg)
+                -- This cmmOffset basically corresponds to manifesting
+                -- @CmmStackSlot Old sp_off@, see Note [SP old/young offsets]
+                (CmmLoad (cmmOffset dflags spExpr (sp_off - reg_off))
+                         (localRegType reg))
+    | (reg, reg_off) <- stackSlotRegs stackmap
+    , reg `elemRegSet` live
+    ]
+  where
+    sp_off = sm_sp stackmap
+
+-- -----------------------------------------------------------------------------
+-- Lowering safe foreign calls
+
+{-
+Note [Lower safe foreign calls]
+
+We start with
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | r1 = foo(x,y,z) returns to L1
+ '-----------------------
+ L1:
+   R1 = r1 -- copyIn, inserted by mkSafeCall
+   ...
+
+the stack layout algorithm will arrange to save and reload everything
+live across the call. Our job now is to expand the call so we get
+
+   Sp[young(L1)] = L1
+ ,-----------------------
+ | SAVE_THREAD_STATE()
+ | token = suspendThread(BaseReg, interruptible)
+ | r = foo(x,y,z)
+ | BaseReg = resumeThread(token)
+ | LOAD_THREAD_STATE()
+ | R1 = r  -- copyOut
+ | jump Sp[0]
+ '-----------------------
+ L1:
+   r = R1 -- copyIn, inserted by mkSafeCall
+   ...
+
+Note the copyOut, which saves the results in the places that L1 is
+expecting them (see Note [safe foreign call convention]). Note also
+that the safe foreign call is replaced by an unsafe one in the Cmm graph.
+-}
+
+lowerSafeForeignCall :: DynFlags -> CmmBlock -> UniqSM CmmBlock
+lowerSafeForeignCall dflags block
+  | (entry@(CmmEntry _ tscp), middle, CmmForeignCall { .. }) <- blockSplit block
+  = do
+    -- Both 'id' and 'new_base' are KindNonPtr because they're
+    -- RTS-only objects and are not subject to garbage collection
+    id <- newTemp (bWord dflags)
+    new_base <- newTemp (cmmRegType dflags baseReg)
+    let (caller_save, caller_load) = callerSaveVolatileRegs dflags
+    save_state_code <- saveThreadState dflags
+    load_state_code <- loadThreadState dflags
+    let suspend = save_state_code <*>
+                  caller_save <*>
+                  mkMiddle (callSuspendThread dflags id intrbl)
+        midCall = mkUnsafeCall tgt res args
+        resume  = mkMiddle (callResumeThread new_base id) <*>
+                  -- Assign the result to BaseReg: we
+                  -- might now have a different Capability!
+ mkAssign baseReg (CmmReg (CmmLocal new_base)) <*> + caller_load <*> + load_state_code + + (_, regs, copyout) = + copyOutOflow dflags NativeReturn Jump (Young succ) + (map (CmmReg . CmmLocal) res) + ret_off [] + + -- NB. after resumeThread returns, the top-of-stack probably contains + -- the stack frame for succ, but it might not: if the current thread + -- received an exception during the call, then the stack might be + -- different. Hence we continue by jumping to the top stack frame, + -- not by jumping to succ. + jump = CmmCall { cml_target = entryCode dflags $ + CmmLoad spExpr (bWord dflags) + , cml_cont = Just succ + , cml_args_regs = regs + , cml_args = widthInBytes (wordWidth dflags) + , cml_ret_args = ret_args + , cml_ret_off = ret_off } + + graph' <- lgraphOfAGraph ( suspend <*> + midCall <*> + resume <*> + copyout <*> + mkLast jump, tscp) + + case toBlockList graph' of + [one] -> let (_, middle', last) = blockSplit one + in return (blockJoin entry (middle `blockAppend` middle') last) + _ -> panic "lowerSafeForeignCall0" + + -- Block doesn't end in a safe foreign call: + | otherwise = return block + + +foreignLbl :: FastString -> CmmExpr +foreignLbl name = CmmLit (CmmLabel (mkForeignLabel name Nothing ForeignLabelInExternalPackage IsFunction)) + +callSuspendThread :: DynFlags -> LocalReg -> Bool -> CmmNode O O +callSuspendThread dflags id intrbl = + CmmUnsafeForeignCall + (ForeignTarget (foreignLbl (fsLit "suspendThread")) + (ForeignConvention CCallConv [AddrHint, NoHint] [AddrHint] CmmMayReturn)) + [id] [baseExpr, mkIntExpr dflags (fromEnum intrbl)] + +callResumeThread :: LocalReg -> LocalReg -> CmmNode O O +callResumeThread new_base id = + CmmUnsafeForeignCall + (ForeignTarget (foreignLbl (fsLit "resumeThread")) + (ForeignConvention CCallConv [AddrHint] [AddrHint] CmmMayReturn)) + [new_base] [CmmReg (CmmLocal id)] + +-- ----------------------------------------------------------------------------- + +plusW :: DynFlags -> ByteOff -> WordOff -> ByteOff +plusW dflags b w = b + w * wORD_SIZE dflags + +data StackSlot = Occupied | Empty + -- Occupied: a return address or part of an update frame + +instance Outputable StackSlot where + ppr Occupied = text "XXX" + ppr Empty = text "---" + +dropEmpty :: WordOff -> [StackSlot] -> Maybe [StackSlot] +dropEmpty 0 ss = Just ss +dropEmpty n (Empty : ss) = dropEmpty (n-1) ss +dropEmpty _ _ = Nothing + +isEmpty :: StackSlot -> Bool +isEmpty Empty = True +isEmpty _ = False + +localRegBytes :: DynFlags -> LocalReg -> ByteOff +localRegBytes dflags r + = roundUpToWords dflags (widthInBytes (typeWidth (localRegType r))) + +localRegWords :: DynFlags -> LocalReg -> WordOff +localRegWords dflags = toWords dflags . localRegBytes dflags + +toWords :: DynFlags -> ByteOff -> WordOff +toWords dflags x = x `quot` wORD_SIZE dflags + + +stackSlotRegs :: StackMap -> [(LocalReg, StackLoc)] +stackSlotRegs sm = nonDetEltsUFM (sm_regs sm) + -- See Note [Unique Determinism and code generation] diff --git a/compiler/GHC/Cmm/Lexer.x b/compiler/GHC/Cmm/Lexer.x new file mode 100644 index 0000000000..d8f15b916c --- /dev/null +++ b/compiler/GHC/Cmm/Lexer.x @@ -0,0 +1,368 @@ +----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow, 2004-2006 +-- +-- Lexer for concrete Cmm. We try to stay close to the C-- spec, but there +-- are a few minor differences: +-- +-- * extra keywords for our macros, and float32/float64 types +-- * global registers (Sp,Hp, etc.) 
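+--
+-- For example (an illustrative fragment, not from a real .cmm file), the
+-- input
+--
+--     if (Sp < SpLim) { goto stk_chk; }
+--
+-- is lexed to CmmT_if, CmmT_SpecChar '(', CmmT_GlobalReg Sp,
+-- CmmT_SpecChar '<', CmmT_GlobalReg SpLim, CmmT_SpecChar ')',
+-- CmmT_SpecChar '{', CmmT_goto, CmmT_Name "stk_chk", CmmT_SpecChar ';',
+-- CmmT_SpecChar '}'.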
+-- +----------------------------------------------------------------------------- + +{ +module GHC.Cmm.Lexer ( + CmmToken(..), cmmlex, + ) where + +import GhcPrelude + +import GHC.Cmm.Expr + +import Lexer +import GHC.Cmm.Monad +import SrcLoc +import UniqFM +import StringBuffer +import FastString +import Ctype +import Util +--import TRACE + +import Data.Word +import Data.Char +} + +$whitechar = [\ \t\n\r\f\v\xa0] -- \xa0 is Unicode no-break space +$white_no_nl = $whitechar # \n + +$ascdigit = 0-9 +$unidigit = \x01 -- Trick Alex into handling Unicode. See alexGetChar. +$digit = [$ascdigit $unidigit] +$octit = 0-7 +$hexit = [$digit A-F a-f] + +$unilarge = \x03 -- Trick Alex into handling Unicode. See alexGetChar. +$asclarge = [A-Z \xc0-\xd6 \xd8-\xde] +$large = [$asclarge $unilarge] + +$unismall = \x04 -- Trick Alex into handling Unicode. See alexGetChar. +$ascsmall = [a-z \xdf-\xf6 \xf8-\xff] +$small = [$ascsmall $unismall \_] + +$namebegin = [$large $small \. \$ \@] +$namechar = [$namebegin $digit] + +@decimal = $digit+ +@octal = $octit+ +@hexadecimal = $hexit+ +@exponent = [eE] [\-\+]? @decimal + +@floating_point = @decimal \. @decimal @exponent? | @decimal @exponent + +@escape = \\ ([abfnrt\\\'\"\?] | x $hexit{1,2} | $octit{1,3}) +@strchar = ($printable # [\"\\]) | @escape + +cmm :- + +$white_no_nl+ ; +^\# pragma .* \n ; -- Apple GCC 3.3 CPP generates pragmas in its output + +^\# (line)? { begin line_prag } + +-- single-line line pragmas, of the form +-- # <line> "<file>" <extra-stuff> \n +<line_prag> $digit+ { setLine line_prag1 } +<line_prag1> \" [^\"]* \" { setFile line_prag2 } +<line_prag2> .* { pop } + +<0> { + \n ; + + [\:\;\{\}\[\]\(\)\=\`\~\/\*\%\-\+\&\^\|\>\<\,\!] { special_char } + + ".." { kw CmmT_DotDot } + "::" { kw CmmT_DoubleColon } + ">>" { kw CmmT_Shr } + "<<" { kw CmmT_Shl } + ">=" { kw CmmT_Ge } + "<=" { kw CmmT_Le } + "==" { kw CmmT_Eq } + "!=" { kw CmmT_Ne } + "&&" { kw CmmT_BoolAnd } + "||" { kw CmmT_BoolOr } + + "True" { kw CmmT_True } + "False" { kw CmmT_False } + "likely" { kw CmmT_likely} + + P@decimal { global_regN (\n -> VanillaReg n VGcPtr) } + R@decimal { global_regN (\n -> VanillaReg n VNonGcPtr) } + F@decimal { global_regN FloatReg } + D@decimal { global_regN DoubleReg } + L@decimal { global_regN LongReg } + Sp { global_reg Sp } + SpLim { global_reg SpLim } + Hp { global_reg Hp } + HpLim { global_reg HpLim } + CCCS { global_reg CCCS } + CurrentTSO { global_reg CurrentTSO } + CurrentNursery { global_reg CurrentNursery } + HpAlloc { global_reg HpAlloc } + BaseReg { global_reg BaseReg } + MachSp { global_reg MachSp } + UnwindReturnReg { global_reg UnwindReturnReg } + + $namebegin $namechar* { name } + + 0 @octal { tok_octal } + @decimal { tok_decimal } + 0[xX] @hexadecimal { tok_hexadecimal } + @floating_point { strtoken tok_float } + + \" @strchar* \" { strtoken tok_string } +} + +{ +data CmmToken + = CmmT_SpecChar Char + | CmmT_DotDot + | CmmT_DoubleColon + | CmmT_Shr + | CmmT_Shl + | CmmT_Ge + | CmmT_Le + | CmmT_Eq + | CmmT_Ne + | CmmT_BoolAnd + | CmmT_BoolOr + | CmmT_CLOSURE + | CmmT_INFO_TABLE + | CmmT_INFO_TABLE_RET + | CmmT_INFO_TABLE_FUN + | CmmT_INFO_TABLE_CONSTR + | CmmT_INFO_TABLE_SELECTOR + | CmmT_else + | CmmT_export + | CmmT_section + | CmmT_goto + | CmmT_if + | CmmT_call + | CmmT_jump + | CmmT_foreign + | CmmT_never + | CmmT_prim + | CmmT_reserve + | CmmT_return + | CmmT_returns + | CmmT_import + | CmmT_switch + | CmmT_case + | CmmT_default + | CmmT_push + | CmmT_unwind + | CmmT_bits8 + | CmmT_bits16 + | CmmT_bits32 + | CmmT_bits64 + | 
CmmT_bits128 + | CmmT_bits256 + | CmmT_bits512 + | CmmT_float32 + | CmmT_float64 + | CmmT_gcptr + | CmmT_GlobalReg GlobalReg + | CmmT_Name FastString + | CmmT_String String + | CmmT_Int Integer + | CmmT_Float Rational + | CmmT_EOF + | CmmT_False + | CmmT_True + | CmmT_likely + deriving (Show) + +-- ----------------------------------------------------------------------------- +-- Lexer actions + +type Action = RealSrcSpan -> StringBuffer -> Int -> PD (RealLocated CmmToken) + +begin :: Int -> Action +begin code _span _str _len = do liftP (pushLexState code); lexToken + +pop :: Action +pop _span _buf _len = liftP popLexState >> lexToken + +special_char :: Action +special_char span buf _len = return (L span (CmmT_SpecChar (currentChar buf))) + +kw :: CmmToken -> Action +kw tok span _buf _len = return (L span tok) + +global_regN :: (Int -> GlobalReg) -> Action +global_regN con span buf len + = return (L span (CmmT_GlobalReg (con (fromIntegral n)))) + where buf' = stepOn buf + n = parseUnsignedInteger buf' (len-1) 10 octDecDigit + +global_reg :: GlobalReg -> Action +global_reg r span _buf _len = return (L span (CmmT_GlobalReg r)) + +strtoken :: (String -> CmmToken) -> Action +strtoken f span buf len = + return (L span $! (f $! lexemeToString buf len)) + +name :: Action +name span buf len = + case lookupUFM reservedWordsFM fs of + Just tok -> return (L span tok) + Nothing -> return (L span (CmmT_Name fs)) + where + fs = lexemeToFastString buf len + +reservedWordsFM = listToUFM $ + map (\(x, y) -> (mkFastString x, y)) [ + ( "CLOSURE", CmmT_CLOSURE ), + ( "INFO_TABLE", CmmT_INFO_TABLE ), + ( "INFO_TABLE_RET", CmmT_INFO_TABLE_RET ), + ( "INFO_TABLE_FUN", CmmT_INFO_TABLE_FUN ), + ( "INFO_TABLE_CONSTR", CmmT_INFO_TABLE_CONSTR ), + ( "INFO_TABLE_SELECTOR",CmmT_INFO_TABLE_SELECTOR ), + ( "else", CmmT_else ), + ( "export", CmmT_export ), + ( "section", CmmT_section ), + ( "goto", CmmT_goto ), + ( "if", CmmT_if ), + ( "call", CmmT_call ), + ( "jump", CmmT_jump ), + ( "foreign", CmmT_foreign ), + ( "never", CmmT_never ), + ( "prim", CmmT_prim ), + ( "reserve", CmmT_reserve ), + ( "return", CmmT_return ), + ( "returns", CmmT_returns ), + ( "import", CmmT_import ), + ( "switch", CmmT_switch ), + ( "case", CmmT_case ), + ( "default", CmmT_default ), + ( "push", CmmT_push ), + ( "unwind", CmmT_unwind ), + ( "bits8", CmmT_bits8 ), + ( "bits16", CmmT_bits16 ), + ( "bits32", CmmT_bits32 ), + ( "bits64", CmmT_bits64 ), + ( "bits128", CmmT_bits128 ), + ( "bits256", CmmT_bits256 ), + ( "bits512", CmmT_bits512 ), + ( "float32", CmmT_float32 ), + ( "float64", CmmT_float64 ), +-- New forms + ( "b8", CmmT_bits8 ), + ( "b16", CmmT_bits16 ), + ( "b32", CmmT_bits32 ), + ( "b64", CmmT_bits64 ), + ( "b128", CmmT_bits128 ), + ( "b256", CmmT_bits256 ), + ( "b512", CmmT_bits512 ), + ( "f32", CmmT_float32 ), + ( "f64", CmmT_float64 ), + ( "gcptr", CmmT_gcptr ), + ( "likely", CmmT_likely), + ( "True", CmmT_True ), + ( "False", CmmT_False ) + ] + +tok_decimal span buf len + = return (L span (CmmT_Int $! parseUnsignedInteger buf len 10 octDecDigit)) + +tok_octal span buf len + = return (L span (CmmT_Int $! parseUnsignedInteger (offsetBytes 1 buf) (len-1) 8 octDecDigit)) + +tok_hexadecimal span buf len + = return (L span (CmmT_Int $! parseUnsignedInteger (offsetBytes 2 buf) (len-2) 16 hexDigit)) + +tok_float str = CmmT_Float $! 
readRational str + +tok_string str = CmmT_String (read str) + -- urk, not quite right, but it'll do for now + +-- ----------------------------------------------------------------------------- +-- Line pragmas + +setLine :: Int -> Action +setLine code span buf len = do + let line = parseUnsignedInteger buf len 10 octDecDigit + liftP $ do + setSrcLoc (mkRealSrcLoc (srcSpanFile span) (fromIntegral line - 1) 1) + -- subtract one: the line number refers to the *following* line + -- trace ("setLine " ++ show line) $ do + popLexState >> pushLexState code + lexToken + +setFile :: Int -> Action +setFile code span buf len = do + let file = lexemeToFastString (stepOn buf) (len-2) + liftP $ do + setSrcLoc (mkRealSrcLoc file (srcSpanEndLine span) (srcSpanEndCol span)) + popLexState >> pushLexState code + lexToken + +-- ----------------------------------------------------------------------------- +-- This is the top-level function: called from the parser each time a +-- new token is to be read from the input. + +cmmlex :: (Located CmmToken -> PD a) -> PD a +cmmlex cont = do + (L span tok) <- lexToken + --trace ("token: " ++ show tok) $ do + cont (L (RealSrcSpan span) tok) + +lexToken :: PD (RealLocated CmmToken) +lexToken = do + inp@(loc1,buf) <- getInput + sc <- liftP getLexState + case alexScan inp sc of + AlexEOF -> do let span = mkRealSrcSpan loc1 loc1 + liftP (setLastToken span 0) + return (L span CmmT_EOF) + AlexError (loc2,_) -> liftP $ failLocMsgP loc1 loc2 "lexical error" + AlexSkip inp2 _ -> do + setInput inp2 + lexToken + AlexToken inp2@(end,_buf2) len t -> do + setInput inp2 + let span = mkRealSrcSpan loc1 end + span `seq` liftP (setLastToken span len) + t span buf len + +-- ----------------------------------------------------------------------------- +-- Monad stuff + +-- Stuff that Alex needs to know about our input type: +type AlexInput = (RealSrcLoc,StringBuffer) + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (_,s) = prevChar s '\n' + +-- backwards compatibility for Alex 2.x +alexGetChar :: AlexInput -> Maybe (Char,AlexInput) +alexGetChar inp = case alexGetByte inp of + Nothing -> Nothing + Just (b,i) -> c `seq` Just (c,i) + where c = chr $ fromIntegral b + +alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) +alexGetByte (loc,s) + | atEnd s = Nothing + | otherwise = b `seq` loc' `seq` s' `seq` Just (b, (loc', s')) + where c = currentChar s + b = fromIntegral $ ord $ c + loc' = advanceSrcLoc loc c + s' = stepOn s + +getInput :: PD AlexInput +getInput = PD $ \_ s@PState{ loc=l, buffer=b } -> POk s (l,b) + +setInput :: AlexInput -> PD () +setInput (l,b) = PD $ \_ s -> POk s{ loc=l, buffer=b } () +} diff --git a/compiler/GHC/Cmm/Lint.hs b/compiler/GHC/Cmm/Lint.hs new file mode 100644 index 0000000000..d70fed3b9e --- /dev/null +++ b/compiler/GHC/Cmm/Lint.hs @@ -0,0 +1,261 @@ +----------------------------------------------------------------------------- +-- +-- (c) The University of Glasgow 2011 +-- +-- CmmLint: checking the correctness of Cmm statements and expressions +-- +----------------------------------------------------------------------------- +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE GADTs #-} +module GHC.Cmm.Lint ( + cmmLint, cmmLintGraph + ) where + +import GhcPrelude + +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Liveness +import GHC.Cmm.Switch (switchTargetsToList) +import GHC.Cmm.Ppr () -- For Outputable instances +import 
Outputable +import DynFlags + +import Control.Monad (ap) + +-- Things to check: +-- - invariant on CmmBlock in GHC.Cmm.Expr (see comment there) +-- - check for branches to blocks that don't exist +-- - check types + +-- ----------------------------------------------------------------------------- +-- Exported entry points: + +cmmLint :: (Outputable d, Outputable h) + => DynFlags -> GenCmmGroup d h CmmGraph -> Maybe SDoc +cmmLint dflags tops = runCmmLint dflags (mapM_ (lintCmmDecl dflags)) tops + +cmmLintGraph :: DynFlags -> CmmGraph -> Maybe SDoc +cmmLintGraph dflags g = runCmmLint dflags (lintCmmGraph dflags) g + +runCmmLint :: Outputable a => DynFlags -> (a -> CmmLint b) -> a -> Maybe SDoc +runCmmLint dflags l p = + case unCL (l p) dflags of + Left err -> Just (vcat [text "Cmm lint error:", + nest 2 err, + text "Program was:", + nest 2 (ppr p)]) + Right _ -> Nothing + +lintCmmDecl :: DynFlags -> GenCmmDecl h i CmmGraph -> CmmLint () +lintCmmDecl dflags (CmmProc _ lbl _ g) + = addLintInfo (text "in proc " <> ppr lbl) $ lintCmmGraph dflags g +lintCmmDecl _ (CmmData {}) + = return () + + +lintCmmGraph :: DynFlags -> CmmGraph -> CmmLint () +lintCmmGraph dflags g = + cmmLocalLiveness dflags g `seq` mapM_ (lintCmmBlock labels) blocks + -- cmmLiveness throws an error if there are registers + -- live on entry to the graph (i.e. undefined + -- variables) + where + blocks = toBlockList g + labels = setFromList (map entryLabel blocks) + + +lintCmmBlock :: LabelSet -> CmmBlock -> CmmLint () +lintCmmBlock labels block + = addLintInfo (text "in basic block " <> ppr (entryLabel block)) $ do + let (_, middle, last) = blockSplit block + mapM_ lintCmmMiddle (blockToList middle) + lintCmmLast labels last + +-- ----------------------------------------------------------------------------- +-- lintCmmExpr + +-- Checks whether a CmmExpr is "type-correct", and check for obvious-looking +-- byte/word mismatches. + +lintCmmExpr :: CmmExpr -> CmmLint CmmType +lintCmmExpr (CmmLoad expr rep) = do + _ <- lintCmmExpr expr + -- Disabled, if we have the inlining phase before the lint phase, + -- we can have funny offsets due to pointer tagging. -- EZY + -- when (widthInBytes (typeWidth rep) >= wORD_SIZE) $ + -- cmmCheckWordAddress expr + return rep +lintCmmExpr expr@(CmmMachOp op args) = do + dflags <- getDynFlags + tys <- mapM lintCmmExpr args + if map (typeWidth . cmmExprType dflags) args == machOpArgReps dflags op + then cmmCheckMachOp op args tys + else cmmLintMachOpErr expr (map (cmmExprType dflags) args) (machOpArgReps dflags op) +lintCmmExpr (CmmRegOff reg offset) + = do dflags <- getDynFlags + let rep = typeWidth (cmmRegType dflags reg) + lintCmmExpr (CmmMachOp (MO_Add rep) + [CmmReg reg, CmmLit (CmmInt (fromIntegral offset) rep)]) +lintCmmExpr expr = + do dflags <- getDynFlags + return (cmmExprType dflags expr) + +-- Check for some common byte/word mismatches (eg. Sp + 1) +cmmCheckMachOp :: MachOp -> [CmmExpr] -> [CmmType] -> CmmLint CmmType +cmmCheckMachOp op [lit@(CmmLit (CmmInt { })), reg@(CmmReg _)] tys + = cmmCheckMachOp op [reg, lit] tys +cmmCheckMachOp op _ tys + = do dflags <- getDynFlags + return (machOpResultType dflags op tys) + +{- +isOffsetOp :: MachOp -> Bool +isOffsetOp (MO_Add _) = True +isOffsetOp (MO_Sub _) = True +isOffsetOp _ = False + +-- This expression should be an address from which a word can be loaded: +-- check for funny-looking sub-word offsets. 
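+-- As an illustrative sketch (assuming a 64-bit target, so that
+-- wORD_SIZE dflags == 8): the kind of address this check would flag is
+--
+--      CmmMachOp (MO_Add W64) [CmmReg spReg, CmmLit (CmmInt 1 W64)]
+--
+-- i.e. Sp + 1, where a byte offset has almost certainly been used in
+-- place of a word offset.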
+_cmmCheckWordAddress :: CmmExpr -> CmmLint () +_cmmCheckWordAddress e@(CmmMachOp op [arg, CmmLit (CmmInt i _)]) + | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral (wORD_SIZE dflags) /= 0 + = cmmLintDubiousWordOffset e +_cmmCheckWordAddress e@(CmmMachOp op [CmmLit (CmmInt i _), arg]) + | isOffsetOp op && notNodeReg arg && i `rem` fromIntegral (wORD_SIZE dflags) /= 0 + = cmmLintDubiousWordOffset e +_cmmCheckWordAddress _ + = return () + +-- No warnings for unaligned arithmetic with the node register, +-- which is used to extract fields from tagged constructor closures. +notNodeReg :: CmmExpr -> Bool +notNodeReg (CmmReg reg) | reg == nodeReg = False +notNodeReg _ = True +-} + +lintCmmMiddle :: CmmNode O O -> CmmLint () +lintCmmMiddle node = case node of + CmmComment _ -> return () + CmmTick _ -> return () + CmmUnwind{} -> return () + + CmmAssign reg expr -> do + dflags <- getDynFlags + erep <- lintCmmExpr expr + let reg_ty = cmmRegType dflags reg + if (erep `cmmEqType_ignoring_ptrhood` reg_ty) + then return () + else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty + + CmmStore l r -> do + _ <- lintCmmExpr l + _ <- lintCmmExpr r + return () + + CmmUnsafeForeignCall target _formals actuals -> do + lintTarget target + mapM_ lintCmmExpr actuals + + +lintCmmLast :: LabelSet -> CmmNode O C -> CmmLint () +lintCmmLast labels node = case node of + CmmBranch id -> checkTarget id + + CmmCondBranch e t f _ -> do + dflags <- getDynFlags + mapM_ checkTarget [t,f] + _ <- lintCmmExpr e + checkCond dflags e + + CmmSwitch e ids -> do + dflags <- getDynFlags + mapM_ checkTarget $ switchTargetsToList ids + erep <- lintCmmExpr e + if (erep `cmmEqType_ignoring_ptrhood` bWord dflags) + then return () + else cmmLintErr (text "switch scrutinee is not a word: " <> + ppr e <> text " :: " <> ppr erep) + + CmmCall { cml_target = target, cml_cont = cont } -> do + _ <- lintCmmExpr target + maybe (return ()) checkTarget cont + + CmmForeignCall tgt _ args succ _ _ _ -> do + lintTarget tgt + mapM_ lintCmmExpr args + checkTarget succ + where + checkTarget id + | setMember id labels = return () + | otherwise = cmmLintErr (text "Branch to nonexistent id" <+> ppr id) + + +lintTarget :: ForeignTarget -> CmmLint () +lintTarget (ForeignTarget e _) = lintCmmExpr e >> return () +lintTarget (PrimTarget {}) = return () + + +checkCond :: DynFlags -> CmmExpr -> CmmLint () +checkCond _ (CmmMachOp mop _) | isComparisonMachOp mop = return () +checkCond dflags (CmmLit (CmmInt x t)) | x == 0 || x == 1, t == wordWidth dflags = return () -- constant values +checkCond _ expr + = cmmLintErr (hang (text "expression is not a conditional:") 2 + (ppr expr)) + +-- ----------------------------------------------------------------------------- +-- CmmLint monad + +-- just a basic error monad: + +newtype CmmLint a = CmmLint { unCL :: DynFlags -> Either SDoc a } + deriving (Functor) + +instance Applicative CmmLint where + pure a = CmmLint (\_ -> Right a) + (<*>) = ap + +instance Monad CmmLint where + CmmLint m >>= k = CmmLint $ \dflags -> + case m dflags of + Left e -> Left e + Right a -> unCL (k a) dflags + +instance HasDynFlags CmmLint where + getDynFlags = CmmLint (\dflags -> Right dflags) + +cmmLintErr :: SDoc -> CmmLint a +cmmLintErr msg = CmmLint (\_ -> Left msg) + +addLintInfo :: SDoc -> CmmLint a -> CmmLint a +addLintInfo info thing = CmmLint $ \dflags -> + case unCL thing dflags of + Left err -> Left (hang info 2 err) + Right a -> Right a + +cmmLintMachOpErr :: CmmExpr -> [CmmType] -> [Width] -> CmmLint a +cmmLintMachOpErr expr argsRep 
opExpectsRep + = cmmLintErr (text "in MachOp application: " $$ + nest 2 (ppr expr) $$ + (text "op is expecting: " <+> ppr opExpectsRep) $$ + (text "arguments provide: " <+> ppr argsRep)) + +cmmLintAssignErr :: CmmNode e x -> CmmType -> CmmType -> CmmLint a +cmmLintAssignErr stmt e_ty r_ty + = cmmLintErr (text "in assignment: " $$ + nest 2 (vcat [ppr stmt, + text "Reg ty:" <+> ppr r_ty, + text "Rhs ty:" <+> ppr e_ty])) + + +{- +cmmLintDubiousWordOffset :: CmmExpr -> CmmLint a +cmmLintDubiousWordOffset expr + = cmmLintErr (text "offset is not a multiple of words: " $$ + nest 2 (ppr expr)) +-} + diff --git a/compiler/GHC/Cmm/Liveness.hs b/compiler/GHC/Cmm/Liveness.hs new file mode 100644 index 0000000000..2b598f52e5 --- /dev/null +++ b/compiler/GHC/Cmm/Liveness.hs @@ -0,0 +1,93 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module GHC.Cmm.Liveness + ( CmmLocalLive + , cmmLocalLiveness + , cmmGlobalLiveness + , liveLattice + , gen_kill + ) +where + +import GhcPrelude + +import DynFlags +import GHC.Cmm.BlockId +import GHC.Cmm +import GHC.Cmm.Ppr.Expr () -- For Outputable instances +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow +import GHC.Cmm.Dataflow.Label + +import Maybes +import Outputable + +----------------------------------------------------------------------------- +-- Calculating what variables are live on entry to a basic block +----------------------------------------------------------------------------- + +-- | The variables live on entry to a block +type CmmLive r = RegSet r +type CmmLocalLive = CmmLive LocalReg + +-- | The dataflow lattice +liveLattice :: Ord r => DataflowLattice (CmmLive r) +{-# SPECIALIZE liveLattice :: DataflowLattice (CmmLive LocalReg) #-} +{-# SPECIALIZE liveLattice :: DataflowLattice (CmmLive GlobalReg) #-} +liveLattice = DataflowLattice emptyRegSet add + where + add (OldFact old) (NewFact new) = + let !join = plusRegSet old new + in changedIf (sizeRegSet join > sizeRegSet old) join + +-- | A mapping from block labels to the variables live on entry +type BlockEntryLiveness r = LabelMap (CmmLive r) + +----------------------------------------------------------------------------- +-- | Calculated liveness info for a CmmGraph +----------------------------------------------------------------------------- + +cmmLocalLiveness :: DynFlags -> CmmGraph -> BlockEntryLiveness LocalReg +cmmLocalLiveness dflags graph = + check $ analyzeCmmBwd liveLattice (xferLive dflags) graph mapEmpty + where + entry = g_entry graph + check facts = + noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts + +cmmGlobalLiveness :: DynFlags -> CmmGraph -> BlockEntryLiveness GlobalReg +cmmGlobalLiveness dflags graph = + analyzeCmmBwd liveLattice (xferLive dflags) graph mapEmpty + +-- | On entry to the procedure, there had better not be any LocalReg's live-in. +noLiveOnEntry :: BlockId -> CmmLive LocalReg -> a -> a +noLiveOnEntry bid in_fact x = + if nullRegSet in_fact then x + else pprPanic "LocalReg's live-in to graph" (ppr bid <+> ppr in_fact) + +gen_kill + :: (DefinerOfRegs r n, UserOfRegs r n) + => DynFlags -> n -> CmmLive r -> CmmLive r +gen_kill dflags node set = + let !afterKill = foldRegsDefd dflags deleteFromRegSet set node + in foldRegsUsed dflags extendRegSet afterKill node +{-# INLINE gen_kill #-} + +xferLive + :: forall r. 
+ ( UserOfRegs r (CmmNode O O) + , DefinerOfRegs r (CmmNode O O) + , UserOfRegs r (CmmNode O C) + , DefinerOfRegs r (CmmNode O C) + ) + => DynFlags -> TransferFun (CmmLive r) +xferLive dflags (BlockCC eNode middle xNode) fBase = + let joined = gen_kill dflags xNode $! joinOutFacts liveLattice xNode fBase + !result = foldNodesBwdOO (gen_kill dflags) middle joined + in mapSingleton (entryLabel eNode) result +{-# SPECIALIZE xferLive :: DynFlags -> TransferFun (CmmLive LocalReg) #-} +{-# SPECIALIZE xferLive :: DynFlags -> TransferFun (CmmLive GlobalReg) #-} diff --git a/compiler/GHC/Cmm/MachOp.hs b/compiler/GHC/Cmm/MachOp.hs new file mode 100644 index 0000000000..234001545c --- /dev/null +++ b/compiler/GHC/Cmm/MachOp.hs @@ -0,0 +1,664 @@ +module GHC.Cmm.MachOp + ( MachOp(..) + , pprMachOp, isCommutableMachOp, isAssociativeMachOp + , isComparisonMachOp, maybeIntComparison, machOpResultType + , machOpArgReps, maybeInvertComparison, isFloatComparison + + -- MachOp builders + , mo_wordAdd, mo_wordSub, mo_wordEq, mo_wordNe,mo_wordMul, mo_wordSQuot + , mo_wordSRem, mo_wordSNeg, mo_wordUQuot, mo_wordURem + , mo_wordSGe, mo_wordSLe, mo_wordSGt, mo_wordSLt, mo_wordUGe + , mo_wordULe, mo_wordUGt, mo_wordULt + , mo_wordAnd, mo_wordOr, mo_wordXor, mo_wordNot + , mo_wordShl, mo_wordSShr, mo_wordUShr + , mo_u_8To32, mo_s_8To32, mo_u_16To32, mo_s_16To32 + , mo_u_8ToWord, mo_s_8ToWord, mo_u_16ToWord, mo_s_16ToWord + , mo_u_32ToWord, mo_s_32ToWord + , mo_32To8, mo_32To16, mo_WordTo8, mo_WordTo16, mo_WordTo32, mo_WordTo64 + + -- CallishMachOp + , CallishMachOp(..), callishMachOpHints + , pprCallishMachOp + , machOpMemcpyishAlign + + -- Atomic read-modify-write + , AtomicMachOp(..) + ) +where + +import GhcPrelude + +import GHC.Cmm.Type +import Outputable +import DynFlags + +----------------------------------------------------------------------------- +-- MachOp +----------------------------------------------------------------------------- + +{- | +Machine-level primops; ones which we can reasonably delegate to the +native code generators to handle. + +Most operations are parameterised by the 'Width' that they operate on. +Some operations have separate signed and unsigned versions, and float +and integer versions. +-} + +data MachOp + -- Integer operations (insensitive to signed/unsigned) + = MO_Add Width + | MO_Sub Width + | MO_Eq Width + | MO_Ne Width + | MO_Mul Width -- low word of multiply + + -- Signed multiply/divide + | MO_S_MulMayOflo Width -- nonzero if signed multiply overflows + | MO_S_Quot Width -- signed / (same semantics as IntQuotOp) + | MO_S_Rem Width -- signed % (same semantics as IntRemOp) + | MO_S_Neg Width -- unary - + + -- Unsigned multiply/divide + | MO_U_MulMayOflo Width -- nonzero if unsigned multiply overflows + | MO_U_Quot Width -- unsigned / (same semantics as WordQuotOp) + | MO_U_Rem Width -- unsigned % (same semantics as WordRemOp) + + -- Signed comparisons + | MO_S_Ge Width + | MO_S_Le Width + | MO_S_Gt Width + | MO_S_Lt Width + + -- Unsigned comparisons + | MO_U_Ge Width + | MO_U_Le Width + | MO_U_Gt Width + | MO_U_Lt Width + + -- Floating point arithmetic + | MO_F_Add Width + | MO_F_Sub Width + | MO_F_Neg Width -- unary - + | MO_F_Mul Width + | MO_F_Quot Width + + -- Floating point comparison + | MO_F_Eq Width + | MO_F_Ne Width + | MO_F_Ge Width + | MO_F_Le Width + | MO_F_Gt Width + | MO_F_Lt Width + + -- Bitwise operations. Not all of these may be supported + -- at all sizes, and only integral Widths are valid. 
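+  -- For example (informative only): per machOpArgReps below, the value
+  -- operand of a shift has the given width but the shift amount is
+  -- always word-sized, so MO_Shl W32 shifts a 32-bit value by a
+  -- word-sized count.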
+ | MO_And Width + | MO_Or Width + | MO_Xor Width + | MO_Not Width + | MO_Shl Width + | MO_U_Shr Width -- unsigned shift right + | MO_S_Shr Width -- signed shift right + + -- Conversions. Some of these will be NOPs. + -- Floating-point conversions use the signed variant. + | MO_SF_Conv Width Width -- Signed int -> Float + | MO_FS_Conv Width Width -- Float -> Signed int + | MO_SS_Conv Width Width -- Signed int -> Signed int + | MO_UU_Conv Width Width -- unsigned int -> unsigned int + | MO_XX_Conv Width Width -- int -> int; puts no requirements on the + -- contents of upper bits when extending; + -- narrowing is simply truncation; the only + -- expectation is that we can recover the + -- original value by applying the opposite + -- MO_XX_Conv, e.g., + -- MO_XX_CONV W64 W8 (MO_XX_CONV W8 W64 x) + -- is equivalent to just x. + | MO_FF_Conv Width Width -- Float -> Float + + -- Vector element insertion and extraction operations + | MO_V_Insert Length Width -- Insert scalar into vector + | MO_V_Extract Length Width -- Extract scalar from vector + + -- Integer vector operations + | MO_V_Add Length Width + | MO_V_Sub Length Width + | MO_V_Mul Length Width + + -- Signed vector multiply/divide + | MO_VS_Quot Length Width + | MO_VS_Rem Length Width + | MO_VS_Neg Length Width + + -- Unsigned vector multiply/divide + | MO_VU_Quot Length Width + | MO_VU_Rem Length Width + + -- Floating point vector element insertion and extraction operations + | MO_VF_Insert Length Width -- Insert scalar into vector + | MO_VF_Extract Length Width -- Extract scalar from vector + + -- Floating point vector operations + | MO_VF_Add Length Width + | MO_VF_Sub Length Width + | MO_VF_Neg Length Width -- unary negation + | MO_VF_Mul Length Width + | MO_VF_Quot Length Width + + -- Alignment check (for -falignment-sanitisation) + | MO_AlignmentCheck Int Width + deriving (Eq, Show) + +pprMachOp :: MachOp -> SDoc +pprMachOp mo = text (show mo) + + + +-- ----------------------------------------------------------------------------- +-- Some common MachReps + +-- A 'wordRep' is a machine word on the target architecture +-- Specifically, it is the size of an Int#, Word#, Addr# +-- and the unit of allocation on the stack and the heap +-- Any pointer is also guaranteed to be a wordRep. 
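+-- As an illustrative sketch (not part of this interface): a use site
+-- would typically build
+--
+--      CmmMachOp (mo_wordAdd dflags) [e1, e2]
+--
+-- rather than hard-coding, say, CmmMachOp (MO_Add W64) [e1, e2], so that
+-- the same code-generator source works on both 32- and 64-bit targets.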
+ +mo_wordAdd, mo_wordSub, mo_wordEq, mo_wordNe,mo_wordMul, mo_wordSQuot + , mo_wordSRem, mo_wordSNeg, mo_wordUQuot, mo_wordURem + , mo_wordSGe, mo_wordSLe, mo_wordSGt, mo_wordSLt, mo_wordUGe + , mo_wordULe, mo_wordUGt, mo_wordULt + , mo_wordAnd, mo_wordOr, mo_wordXor, mo_wordNot, mo_wordShl, mo_wordSShr, mo_wordUShr + , mo_u_8ToWord, mo_s_8ToWord, mo_u_16ToWord, mo_s_16ToWord, mo_u_32ToWord, mo_s_32ToWord + , mo_WordTo8, mo_WordTo16, mo_WordTo32, mo_WordTo64 + :: DynFlags -> MachOp + +mo_u_8To32, mo_s_8To32, mo_u_16To32, mo_s_16To32 + , mo_32To8, mo_32To16 + :: MachOp + +mo_wordAdd dflags = MO_Add (wordWidth dflags) +mo_wordSub dflags = MO_Sub (wordWidth dflags) +mo_wordEq dflags = MO_Eq (wordWidth dflags) +mo_wordNe dflags = MO_Ne (wordWidth dflags) +mo_wordMul dflags = MO_Mul (wordWidth dflags) +mo_wordSQuot dflags = MO_S_Quot (wordWidth dflags) +mo_wordSRem dflags = MO_S_Rem (wordWidth dflags) +mo_wordSNeg dflags = MO_S_Neg (wordWidth dflags) +mo_wordUQuot dflags = MO_U_Quot (wordWidth dflags) +mo_wordURem dflags = MO_U_Rem (wordWidth dflags) + +mo_wordSGe dflags = MO_S_Ge (wordWidth dflags) +mo_wordSLe dflags = MO_S_Le (wordWidth dflags) +mo_wordSGt dflags = MO_S_Gt (wordWidth dflags) +mo_wordSLt dflags = MO_S_Lt (wordWidth dflags) + +mo_wordUGe dflags = MO_U_Ge (wordWidth dflags) +mo_wordULe dflags = MO_U_Le (wordWidth dflags) +mo_wordUGt dflags = MO_U_Gt (wordWidth dflags) +mo_wordULt dflags = MO_U_Lt (wordWidth dflags) + +mo_wordAnd dflags = MO_And (wordWidth dflags) +mo_wordOr dflags = MO_Or (wordWidth dflags) +mo_wordXor dflags = MO_Xor (wordWidth dflags) +mo_wordNot dflags = MO_Not (wordWidth dflags) +mo_wordShl dflags = MO_Shl (wordWidth dflags) +mo_wordSShr dflags = MO_S_Shr (wordWidth dflags) +mo_wordUShr dflags = MO_U_Shr (wordWidth dflags) + +mo_u_8To32 = MO_UU_Conv W8 W32 +mo_s_8To32 = MO_SS_Conv W8 W32 +mo_u_16To32 = MO_UU_Conv W16 W32 +mo_s_16To32 = MO_SS_Conv W16 W32 + +mo_u_8ToWord dflags = MO_UU_Conv W8 (wordWidth dflags) +mo_s_8ToWord dflags = MO_SS_Conv W8 (wordWidth dflags) +mo_u_16ToWord dflags = MO_UU_Conv W16 (wordWidth dflags) +mo_s_16ToWord dflags = MO_SS_Conv W16 (wordWidth dflags) +mo_s_32ToWord dflags = MO_SS_Conv W32 (wordWidth dflags) +mo_u_32ToWord dflags = MO_UU_Conv W32 (wordWidth dflags) + +mo_WordTo8 dflags = MO_UU_Conv (wordWidth dflags) W8 +mo_WordTo16 dflags = MO_UU_Conv (wordWidth dflags) W16 +mo_WordTo32 dflags = MO_UU_Conv (wordWidth dflags) W32 +mo_WordTo64 dflags = MO_UU_Conv (wordWidth dflags) W64 + +mo_32To8 = MO_UU_Conv W32 W8 +mo_32To16 = MO_UU_Conv W32 W16 + + +-- ---------------------------------------------------------------------------- +-- isCommutableMachOp + +{- | +Returns 'True' if the MachOp has commutable arguments. This is used +in the platform-independent Cmm optimisations. + +If in doubt, return 'False'. This generates worse code on the +native routes, but is otherwise harmless. +-} +isCommutableMachOp :: MachOp -> Bool +isCommutableMachOp mop = + case mop of + MO_Add _ -> True + MO_Eq _ -> True + MO_Ne _ -> True + MO_Mul _ -> True + MO_S_MulMayOflo _ -> True + MO_U_MulMayOflo _ -> True + MO_And _ -> True + MO_Or _ -> True + MO_Xor _ -> True + MO_F_Add _ -> True + MO_F_Mul _ -> True + _other -> False + +-- ---------------------------------------------------------------------------- +-- isAssociativeMachOp + +{- | +Returns 'True' if the MachOp is associative (i.e. @(x+y)+z == x+(y+z)@) +This is used in the platform-independent Cmm optimisations. + +If in doubt, return 'False'. 
This generates worse code on the
+native routes, but is otherwise harmless.
+-}
+isAssociativeMachOp :: MachOp -> Bool
+isAssociativeMachOp mop =
+  case mop of
+        MO_Add {} -> True       -- NB: does not include
+        MO_Mul {} -> True       -- floating point!
+        MO_And {} -> True
+        MO_Or  {} -> True
+        MO_Xor {} -> True
+        _other    -> False
+
+
+-- ----------------------------------------------------------------------------
+-- isComparisonMachOp
+
+{- |
+Returns 'True' if the MachOp is a comparison.
+
+If in doubt, return 'False'.  This generates worse code on the
+native routes, but is otherwise harmless.
+-}
+isComparisonMachOp :: MachOp -> Bool
+isComparisonMachOp mop =
+  case mop of
+    MO_Eq   _  -> True
+    MO_Ne   _  -> True
+    MO_S_Ge _  -> True
+    MO_S_Le _  -> True
+    MO_S_Gt _  -> True
+    MO_S_Lt _  -> True
+    MO_U_Ge _  -> True
+    MO_U_Le _  -> True
+    MO_U_Gt _  -> True
+    MO_U_Lt _  -> True
+    MO_F_Eq {} -> True
+    MO_F_Ne {} -> True
+    MO_F_Ge {} -> True
+    MO_F_Le {} -> True
+    MO_F_Gt {} -> True
+    MO_F_Lt {} -> True
+    _other     -> False
+
+{- |
+Returns @Just w@ if the operation is an integer comparison with width
+@w@, or @Nothing@ otherwise.
+-}
+maybeIntComparison :: MachOp -> Maybe Width
+maybeIntComparison mop =
+  case mop of
+    MO_Eq   w -> Just w
+    MO_Ne   w -> Just w
+    MO_S_Ge w -> Just w
+    MO_S_Le w -> Just w
+    MO_S_Gt w -> Just w
+    MO_S_Lt w -> Just w
+    MO_U_Ge w -> Just w
+    MO_U_Le w -> Just w
+    MO_U_Gt w -> Just w
+    MO_U_Lt w -> Just w
+    _ -> Nothing
+
+isFloatComparison :: MachOp -> Bool
+isFloatComparison mop =
+  case mop of
+    MO_F_Eq {} -> True
+    MO_F_Ne {} -> True
+    MO_F_Ge {} -> True
+    MO_F_Le {} -> True
+    MO_F_Gt {} -> True
+    MO_F_Lt {} -> True
+    _other     -> False
+
+-- -----------------------------------------------------------------------------
+-- Inverting conditions
+
+-- Sometimes it's useful to be able to invert the sense of a
+-- condition.  Not all conditional tests are invertible: in
+-- particular, floating point conditionals cannot be inverted, because
+-- there exist floating-point values which return False for both senses
+-- of a condition (eg. !(NaN > NaN) && !(NaN /<= NaN)).
+
+maybeInvertComparison :: MachOp -> Maybe MachOp
+maybeInvertComparison op
+  = case op of  -- None of these Just cases include floating point
+        MO_Eq r   -> Just (MO_Ne r)
+        MO_Ne r   -> Just (MO_Eq r)
+        MO_U_Lt r -> Just (MO_U_Ge r)
+        MO_U_Gt r -> Just (MO_U_Le r)
+        MO_U_Le r -> Just (MO_U_Gt r)
+        MO_U_Ge r -> Just (MO_U_Lt r)
+        MO_S_Lt r -> Just (MO_S_Ge r)
+        MO_S_Gt r -> Just (MO_S_Le r)
+        MO_S_Le r -> Just (MO_S_Gt r)
+        MO_S_Ge r -> Just (MO_S_Lt r)
+        _other    -> Nothing
+
+-- ----------------------------------------------------------------------------
+-- machOpResultType
+
+{- |
+Returns the MachRep of the result of a MachOp.
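+
+For example, a comparison such as MO_S_Lt produces a word-sized result
+(see comparisonResultRep below), whereas MO_Add preserves the type of its
+first argument, so GC-pointer-hood is not lost.  (This is an informative
+summary of the cases below.)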
+-} +machOpResultType :: DynFlags -> MachOp -> [CmmType] -> CmmType +machOpResultType dflags mop tys = + case mop of + MO_Add {} -> ty1 -- Preserve GC-ptr-hood + MO_Sub {} -> ty1 -- of first arg + MO_Mul r -> cmmBits r + MO_S_MulMayOflo r -> cmmBits r + MO_S_Quot r -> cmmBits r + MO_S_Rem r -> cmmBits r + MO_S_Neg r -> cmmBits r + MO_U_MulMayOflo r -> cmmBits r + MO_U_Quot r -> cmmBits r + MO_U_Rem r -> cmmBits r + + MO_Eq {} -> comparisonResultRep dflags + MO_Ne {} -> comparisonResultRep dflags + MO_S_Ge {} -> comparisonResultRep dflags + MO_S_Le {} -> comparisonResultRep dflags + MO_S_Gt {} -> comparisonResultRep dflags + MO_S_Lt {} -> comparisonResultRep dflags + + MO_U_Ge {} -> comparisonResultRep dflags + MO_U_Le {} -> comparisonResultRep dflags + MO_U_Gt {} -> comparisonResultRep dflags + MO_U_Lt {} -> comparisonResultRep dflags + + MO_F_Add r -> cmmFloat r + MO_F_Sub r -> cmmFloat r + MO_F_Mul r -> cmmFloat r + MO_F_Quot r -> cmmFloat r + MO_F_Neg r -> cmmFloat r + MO_F_Eq {} -> comparisonResultRep dflags + MO_F_Ne {} -> comparisonResultRep dflags + MO_F_Ge {} -> comparisonResultRep dflags + MO_F_Le {} -> comparisonResultRep dflags + MO_F_Gt {} -> comparisonResultRep dflags + MO_F_Lt {} -> comparisonResultRep dflags + + MO_And {} -> ty1 -- Used for pointer masking + MO_Or {} -> ty1 + MO_Xor {} -> ty1 + MO_Not r -> cmmBits r + MO_Shl r -> cmmBits r + MO_U_Shr r -> cmmBits r + MO_S_Shr r -> cmmBits r + + MO_SS_Conv _ to -> cmmBits to + MO_UU_Conv _ to -> cmmBits to + MO_XX_Conv _ to -> cmmBits to + MO_FS_Conv _ to -> cmmBits to + MO_SF_Conv _ to -> cmmFloat to + MO_FF_Conv _ to -> cmmFloat to + + MO_V_Insert l w -> cmmVec l (cmmBits w) + MO_V_Extract _ w -> cmmBits w + + MO_V_Add l w -> cmmVec l (cmmBits w) + MO_V_Sub l w -> cmmVec l (cmmBits w) + MO_V_Mul l w -> cmmVec l (cmmBits w) + + MO_VS_Quot l w -> cmmVec l (cmmBits w) + MO_VS_Rem l w -> cmmVec l (cmmBits w) + MO_VS_Neg l w -> cmmVec l (cmmBits w) + + MO_VU_Quot l w -> cmmVec l (cmmBits w) + MO_VU_Rem l w -> cmmVec l (cmmBits w) + + MO_VF_Insert l w -> cmmVec l (cmmFloat w) + MO_VF_Extract _ w -> cmmFloat w + + MO_VF_Add l w -> cmmVec l (cmmFloat w) + MO_VF_Sub l w -> cmmVec l (cmmFloat w) + MO_VF_Mul l w -> cmmVec l (cmmFloat w) + MO_VF_Quot l w -> cmmVec l (cmmFloat w) + MO_VF_Neg l w -> cmmVec l (cmmFloat w) + + MO_AlignmentCheck _ _ -> ty1 + where + (ty1:_) = tys + +comparisonResultRep :: DynFlags -> CmmType +comparisonResultRep = bWord -- is it? + + +-- ----------------------------------------------------------------------------- +-- machOpArgReps + +-- | This function is used for debugging only: we can check whether an +-- application of a MachOp is "type-correct" by checking that the MachReps of +-- its arguments are the same as the MachOp expects. This is used when +-- linting a CmmExpr. 
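+--
+-- As a sketch of that check (see lintCmmExpr in GHC.Cmm.Lint): an
+-- application (CmmMachOp op args) is accepted only when
+--
+--      map (typeWidth . cmmExprType dflags) args == machOpArgReps dflags op
+--
+-- holds for the DynFlags in force.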
+ +machOpArgReps :: DynFlags -> MachOp -> [Width] +machOpArgReps dflags op = + case op of + MO_Add r -> [r,r] + MO_Sub r -> [r,r] + MO_Eq r -> [r,r] + MO_Ne r -> [r,r] + MO_Mul r -> [r,r] + MO_S_MulMayOflo r -> [r,r] + MO_S_Quot r -> [r,r] + MO_S_Rem r -> [r,r] + MO_S_Neg r -> [r] + MO_U_MulMayOflo r -> [r,r] + MO_U_Quot r -> [r,r] + MO_U_Rem r -> [r,r] + + MO_S_Ge r -> [r,r] + MO_S_Le r -> [r,r] + MO_S_Gt r -> [r,r] + MO_S_Lt r -> [r,r] + + MO_U_Ge r -> [r,r] + MO_U_Le r -> [r,r] + MO_U_Gt r -> [r,r] + MO_U_Lt r -> [r,r] + + MO_F_Add r -> [r,r] + MO_F_Sub r -> [r,r] + MO_F_Mul r -> [r,r] + MO_F_Quot r -> [r,r] + MO_F_Neg r -> [r] + MO_F_Eq r -> [r,r] + MO_F_Ne r -> [r,r] + MO_F_Ge r -> [r,r] + MO_F_Le r -> [r,r] + MO_F_Gt r -> [r,r] + MO_F_Lt r -> [r,r] + + MO_And r -> [r,r] + MO_Or r -> [r,r] + MO_Xor r -> [r,r] + MO_Not r -> [r] + MO_Shl r -> [r, wordWidth dflags] + MO_U_Shr r -> [r, wordWidth dflags] + MO_S_Shr r -> [r, wordWidth dflags] + + MO_SS_Conv from _ -> [from] + MO_UU_Conv from _ -> [from] + MO_XX_Conv from _ -> [from] + MO_SF_Conv from _ -> [from] + MO_FS_Conv from _ -> [from] + MO_FF_Conv from _ -> [from] + + MO_V_Insert l r -> [typeWidth (vec l (cmmBits r)),r,wordWidth dflags] + MO_V_Extract l r -> [typeWidth (vec l (cmmBits r)),wordWidth dflags] + + MO_V_Add _ r -> [r,r] + MO_V_Sub _ r -> [r,r] + MO_V_Mul _ r -> [r,r] + + MO_VS_Quot _ r -> [r,r] + MO_VS_Rem _ r -> [r,r] + MO_VS_Neg _ r -> [r] + + MO_VU_Quot _ r -> [r,r] + MO_VU_Rem _ r -> [r,r] + + MO_VF_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags] + MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags] + + MO_VF_Add _ r -> [r,r] + MO_VF_Sub _ r -> [r,r] + MO_VF_Mul _ r -> [r,r] + MO_VF_Quot _ r -> [r,r] + MO_VF_Neg _ r -> [r] + + MO_AlignmentCheck _ r -> [r] + +----------------------------------------------------------------------------- +-- CallishMachOp +----------------------------------------------------------------------------- + +-- CallishMachOps tend to be implemented by foreign calls in some backends, +-- so we separate them out. In Cmm, these can only occur in a +-- statement position, in contrast to an ordinary MachOp which can occur +-- anywhere in an expression. +data CallishMachOp + = MO_F64_Pwr + | MO_F64_Sin + | MO_F64_Cos + | MO_F64_Tan + | MO_F64_Sinh + | MO_F64_Cosh + | MO_F64_Tanh + | MO_F64_Asin + | MO_F64_Acos + | MO_F64_Atan + | MO_F64_Asinh + | MO_F64_Acosh + | MO_F64_Atanh + | MO_F64_Log + | MO_F64_Log1P + | MO_F64_Exp + | MO_F64_ExpM1 + | MO_F64_Fabs + | MO_F64_Sqrt + | MO_F32_Pwr + | MO_F32_Sin + | MO_F32_Cos + | MO_F32_Tan + | MO_F32_Sinh + | MO_F32_Cosh + | MO_F32_Tanh + | MO_F32_Asin + | MO_F32_Acos + | MO_F32_Atan + | MO_F32_Asinh + | MO_F32_Acosh + | MO_F32_Atanh + | MO_F32_Log + | MO_F32_Log1P + | MO_F32_Exp + | MO_F32_ExpM1 + | MO_F32_Fabs + | MO_F32_Sqrt + + | MO_UF_Conv Width + + | MO_S_Mul2 Width + | MO_S_QuotRem Width + | MO_U_QuotRem Width + | MO_U_QuotRem2 Width + | MO_Add2 Width + | MO_AddWordC Width + | MO_SubWordC Width + | MO_AddIntC Width + | MO_SubIntC Width + | MO_U_Mul2 Width + + | MO_ReadBarrier + | MO_WriteBarrier + | MO_Touch -- Keep variables live (when using interior pointers) + + -- Prefetch + | MO_Prefetch_Data Int -- Prefetch hint. May change program performance but not + -- program behavior. + -- the Int can be 0-3. Needs to be known at compile time + -- to interact with code generation correctly. 
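+                       -- As a hedged, backend-specific illustration
+                       -- (assuming the x86 back end): level 0 maps to a
+                       -- non-temporal prefetch and level 3 to the
+                       -- strongest locality hint (prefetcht0).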
+                       -- TODO: add support for prefetch WRITES,
+                       -- currently only exposes prefetch reads, which
+                       -- would cover the majority of use cases in GHC anyway
+
+
+  -- These three MachOps are parameterised by the known alignment
+  -- of the destination and source (for memcpy/memmove) pointers.
+  -- This information may be used for optimisation in backends.
+  | MO_Memcpy Int
+  | MO_Memset Int
+  | MO_Memmove Int
+  | MO_Memcmp Int
+
+  | MO_PopCnt Width
+  | MO_Pdep Width
+  | MO_Pext Width
+  | MO_Clz Width
+  | MO_Ctz Width
+
+  | MO_BSwap Width
+  | MO_BRev Width
+
+  -- Atomic read-modify-write.
+  | MO_AtomicRMW Width AtomicMachOp
+  | MO_AtomicRead Width
+  | MO_AtomicWrite Width
+  | MO_Cmpxchg Width
+  deriving (Eq, Show)
+
+-- | The operation to perform atomically.
+data AtomicMachOp =
+      AMO_Add
+    | AMO_Sub
+    | AMO_And
+    | AMO_Nand
+    | AMO_Or
+    | AMO_Xor
+      deriving (Eq, Show)
+
+pprCallishMachOp :: CallishMachOp -> SDoc
+pprCallishMachOp mo = text (show mo)
+
+callishMachOpHints :: CallishMachOp -> ([ForeignHint], [ForeignHint])
+callishMachOpHints op = case op of
+  MO_Memcpy _  -> ([], [AddrHint,AddrHint,NoHint])
+  MO_Memset _  -> ([], [AddrHint,NoHint,NoHint])
+  MO_Memmove _ -> ([], [AddrHint,AddrHint,NoHint])
+  MO_Memcmp _  -> ([], [AddrHint, AddrHint, NoHint])
+  _            -> ([],[])
+  -- empty lists indicate NoHint
+
+-- | The alignment of a 'memcpy'-ish operation.
+machOpMemcpyishAlign :: CallishMachOp -> Maybe Int
+machOpMemcpyishAlign op = case op of
+  MO_Memcpy  align -> Just align
+  MO_Memset  align -> Just align
+  MO_Memmove align -> Just align
+  MO_Memcmp  align -> Just align
+  _                -> Nothing
diff --git a/compiler/GHC/Cmm/Monad.hs b/compiler/GHC/Cmm/Monad.hs
new file mode 100644
index 0000000000..6b8d00a118
--- /dev/null
+++ b/compiler/GHC/Cmm/Monad.hs
@@ -0,0 +1,59 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+-- A Parser monad with access to the 'DynFlags'.
+--
+-- The 'P' monad only has access to the subset of 'DynFlags'
+-- required for parsing Haskell.
+
+-- The parser for C-- requires access to a lot more of the 'DynFlags',
+-- so 'PD' provides access to 'DynFlags' via a 'HasDynFlags' instance.
+-----------------------------------------------------------------------------
+module GHC.Cmm.Monad (
+    PD(..)
+  , liftP
+  ) where
+
+import GhcPrelude
+
+import Control.Monad
+import qualified Control.Monad.Fail as MonadFail
+
+import DynFlags
+import Lexer
+
+newtype PD a = PD { unPD :: DynFlags -> PState -> ParseResult a }
+
+instance Functor PD where
+  fmap = liftM
+
+instance Applicative PD where
+  pure = returnPD
+  (<*>) = ap
+
+instance Monad PD where
+  (>>=) = thenPD
+#if !MIN_VERSION_base(4,13,0)
+  fail = MonadFail.fail
+#endif
+
+instance MonadFail.MonadFail PD where
+  fail = failPD
+
+liftP :: P a -> PD a
+liftP (P f) = PD $ \_ s -> f s
+
+returnPD :: a -> PD a
+returnPD = liftP . return
+
+thenPD :: PD a -> (a -> PD b) -> PD b
+(PD m) `thenPD` k = PD $ \d s ->
+  case m d s of
+    POk s1 a   -> unPD (k a) d s1
+    PFailed s1 -> PFailed s1
+
+failPD :: String -> PD a
+failPD = liftP .
fail + +instance HasDynFlags PD where + getDynFlags = PD $ \d s -> POk s d diff --git a/compiler/GHC/Cmm/Node.hs b/compiler/GHC/Cmm/Node.hs new file mode 100644 index 0000000000..bb74647910 --- /dev/null +++ b/compiler/GHC/Cmm/Node.hs @@ -0,0 +1,724 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE ExplicitForAll #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE UndecidableInstances #-} +{-# LANGUAGE ScopedTypeVariables #-} + + +-- CmmNode type for representation using Hoopl graphs. + +module GHC.Cmm.Node ( + CmmNode(..), CmmFormal, CmmActual, CmmTickish, + UpdFrameOffset, Convention(..), + ForeignConvention(..), ForeignTarget(..), foreignTargetHints, + CmmReturnInfo(..), + mapExp, mapExpDeep, wrapRecExp, foldExp, foldExpDeep, wrapRecExpf, + mapExpM, mapExpDeepM, wrapRecExpM, mapSuccessors, mapCollectSuccessors, + + -- * Tick scopes + CmmTickScope(..), isTickSubScope, combineTickScopes, + ) where + +import GhcPrelude hiding (succ) + +import GHC.Platform.Regs +import GHC.Cmm.Expr +import GHC.Cmm.Switch +import DynFlags +import FastString +import ForeignCall +import Outputable +import GHC.Runtime.Layout +import CoreSyn (Tickish) +import qualified Unique as U + +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Label +import Data.Maybe +import Data.List (tails,sortBy) +import Unique (nonDetCmpUnique) +import Util + + +------------------------ +-- CmmNode + +#define ULabel {-# UNPACK #-} !Label + +data CmmNode e x where + CmmEntry :: ULabel -> CmmTickScope -> CmmNode C O + + CmmComment :: FastString -> CmmNode O O + + -- Tick annotation, covering Cmm code in our tick scope. We only + -- expect non-code @Tickish@ at this point (e.g. @SourceNote@). + -- See Note [CmmTick scoping details] + CmmTick :: !CmmTickish -> CmmNode O O + + -- Unwind pseudo-instruction, encoding stack unwinding + -- instructions for a debugger. This describes how to reconstruct + -- the "old" value of a register if we want to navigate the stack + -- up one frame. Having unwind information for @Sp@ will allow the + -- debugger to "walk" the stack. + -- + -- See Note [What is this unwinding business?] in Debug + CmmUnwind :: [(GlobalReg, Maybe CmmExpr)] -> CmmNode O O + + CmmAssign :: !CmmReg -> !CmmExpr -> CmmNode O O + -- Assign to register + + CmmStore :: !CmmExpr -> !CmmExpr -> CmmNode O O + -- Assign to memory location. Size is + -- given by cmmExprType of the rhs. + + CmmUnsafeForeignCall :: -- An unsafe foreign call; + -- see Note [Foreign calls] + -- Like a "fat machine instruction"; can occur + -- in the middle of a block + ForeignTarget -> -- call target + [CmmFormal] -> -- zero or more results + [CmmActual] -> -- zero or more arguments + CmmNode O O + -- Semantics: clobbers any GlobalRegs for which callerSaves r == True + -- See Note [Unsafe foreign calls clobber caller-save registers] + -- + -- Invariant: the arguments and the ForeignTarget must not + -- mention any registers for which GHC.Platform.callerSaves + -- is True. See Note [Register Parameter Passing]. 
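+  --
+  -- As an illustrative sketch (the names dst, src and n are
+  -- hypothetical): a memcpy emitted by the code generator appears
+  -- here as
+  --
+  --      CmmUnsafeForeignCall (PrimTarget (MO_Memcpy 8)) [] [dst, src, n]
+  --
+  -- a known machine operation with no results and three arguments.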
+ + CmmBranch :: ULabel -> CmmNode O C + -- Goto another block in the same procedure + + CmmCondBranch :: { -- conditional branch + cml_pred :: CmmExpr, + cml_true, cml_false :: ULabel, + cml_likely :: Maybe Bool -- likely result of the conditional, + -- if known + } -> CmmNode O C + + CmmSwitch + :: CmmExpr -- Scrutinee, of some integral type + -> SwitchTargets -- Cases. See [Note SwitchTargets] + -> CmmNode O C + + CmmCall :: { -- A native call or tail call + cml_target :: CmmExpr, -- never a CmmPrim to a CallishMachOp! + + cml_cont :: Maybe Label, + -- Label of continuation (Nothing for return or tail call) + -- + -- Note [Continuation BlockIds]: these BlockIds are called + -- Continuation BlockIds, and are the only BlockIds that can + -- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or + -- (CmmStackSlot (Young b) _). + + cml_args_regs :: [GlobalReg], + -- The argument GlobalRegs (Rx, Fx, Dx, Lx) that are passed + -- to the call. This is essential information for the + -- native code generator's register allocator; without + -- knowing which GlobalRegs are live it has to assume that + -- they are all live. This list should only include + -- GlobalRegs that are mapped to real machine registers on + -- the target platform. + + cml_args :: ByteOff, + -- Byte offset, from the *old* end of the Area associated with + -- the Label (if cml_cont = Nothing, then Old area), of + -- youngest outgoing arg. Set the stack pointer to this before + -- transferring control. + -- (NB: an update frame might also have been stored in the Old + -- area, but it'll be in an older part than the args.) + + cml_ret_args :: ByteOff, + -- For calls *only*, the byte offset for youngest returned value + -- This is really needed at the *return* point rather than here + -- at the call, but in practice it's convenient to record it here. + + cml_ret_off :: ByteOff + -- For calls *only*, the byte offset of the base of the frame that + -- must be described by the info table for the return point. + -- The older words are an update frames, which have their own + -- info-table and layout information + + -- From a liveness point of view, the stack words older than + -- cml_ret_off are treated as live, even if the sequel of + -- the call goes into a loop. + } -> CmmNode O C + + CmmForeignCall :: { -- A safe foreign call; see Note [Foreign calls] + -- Always the last node of a block + tgt :: ForeignTarget, -- call target and convention + res :: [CmmFormal], -- zero or more results + args :: [CmmActual], -- zero or more arguments; see Note [Register parameter passing] + succ :: ULabel, -- Label of continuation + ret_args :: ByteOff, -- same as cml_ret_args + ret_off :: ByteOff, -- same as cml_ret_off + intrbl:: Bool -- whether or not the call is interruptible + } -> CmmNode O C + +{- Note [Foreign calls] +~~~~~~~~~~~~~~~~~~~~~~~ +A CmmUnsafeForeignCall is used for *unsafe* foreign calls; +a CmmForeignCall call is used for *safe* foreign calls. + +Unsafe ones are mostly easy: think of them as a "fat machine +instruction". In particular, they do *not* kill all live registers, +just the registers they return to (there was a bit of code in GHC that +conservatively assumed otherwise.) However, see [Register parameter passing]. + +Safe ones are trickier. 
A safe foreign call
+     r = f(x)
+ultimately expands to
+     push "return address"          -- Never used to return to;
+                                    -- just points to an info table
+     save registers into TSO
+     call suspendThread
+     r = f(x)                       -- Make the call
+     call resumeThread
+     restore registers
+     pop "return address"
+We cannot "lower" a safe foreign call to this sequence of Cmms, because
+after we've saved Sp all the Cmm optimiser's assumptions are broken.
+
+Note that a safe foreign call needs an info table.
+
+So Safe Foreign Calls must remain as last nodes until the stack is
+made manifest in GHC.Cmm.LayoutStack, where they are lowered into the above
+sequence.
+-}
+
+{- Note [Unsafe foreign calls clobber caller-save registers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A foreign call is defined to clobber any GlobalRegs that are mapped to
+caller-saves machine registers (according to the prevailing C ABI).
+GHC.StgToCmm.Utils.callerSaves tells you which GlobalRegs are caller-saves.
+
+This is a design choice that makes it easier to generate code later.
+We could instead choose to say that foreign calls do *not* clobber
+caller-saves regs, but then we would have to figure out which regs
+were live across the call later and insert some saves/restores.
+
+Furthermore when we generate code we never have any GlobalRegs live
+across a call, because they are always copied-in to LocalRegs and
+copied-out again before making a call/jump.  So all we have to do is
+avoid any code motion that would make a caller-saves GlobalReg live
+across a foreign call during subsequent optimisations.
+-}
+
+{- Note [Register parameter passing]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+On certain architectures, some registers are utilized for parameter
+passing in the C calling convention.  For example, in the x86-64 Linux
+convention, rdi, rsi, rdx and rcx (as well as r8 and r9) may be used for
+argument passing.  These are registers R3-R6, which our generated
+code may also be using; as a result, it's necessary to save these
+values before doing a foreign call.  This is done during initial
+code generation in callerSaveVolatileRegs in GHC.StgToCmm.Utils.  However,
+one result of doing this is that the contents of these registers
+may mysteriously change if referenced inside the arguments.  This
+is dangerous, so you'll need to disable inlining much in the same
+way as is done in GHC.Cmm.Opt currently.  We should fix this!
+-}
+
+---------------------------------------------
+-- Eq instance of CmmNode
+
+deriving instance Eq (CmmNode e x)
+
+----------------------------------------------
+-- Hoopl instances of CmmNode
+
+instance NonLocal CmmNode where
+  entryLabel (CmmEntry l _) = l
+
+  successors (CmmBranch l) = [l]
+  successors (CmmCondBranch {cml_true=t, cml_false=f}) = [f, t] -- meets layout constraint
+  successors (CmmSwitch _ ids) = switchTargetsToList ids
+  successors (CmmCall {cml_cont=l}) = maybeToList l
+  successors (CmmForeignCall {succ=l}) = [l]
+
+
+--------------------------------------------------
+-- Various helper types
+
+type CmmActual = CmmExpr
+type CmmFormal = LocalReg
+
+type UpdFrameOffset = ByteOff
+
+-- | A convention maps a list of values (function arguments or return
+-- values) to registers or stack locations.
+data Convention
+  = NativeDirectCall
+       -- ^ top-level Haskell functions use @NativeDirectCall@, which
+       -- maps arguments to registers starting with R2, according to
+       -- how many registers are available on the platform.
This + -- convention ignores R1, because for a top-level function call + -- the function closure is implicit, and doesn't need to be passed. + | NativeNodeCall + -- ^ non-top-level Haskell functions, which pass the address of + -- the function closure in R1 (regardless of whether R1 is a + -- real register or not), and the rest of the arguments in + -- registers or on the stack. + | NativeReturn + -- ^ a native return. The convention for returns depends on + -- how many values are returned: for just one value returned, + -- the appropriate register is used (R1, F1, etc.). regardless + -- of whether it is a real register or not. For multiple + -- values returned, they are mapped to registers or the stack. + | Slow + -- ^ Slow entry points: all args pushed on the stack + | GC + -- ^ Entry to the garbage collector: uses the node reg! + -- (TODO: I don't think we need this --SDM) + deriving( Eq ) + +data ForeignConvention + = ForeignConvention + CCallConv -- Which foreign-call convention + [ForeignHint] -- Extra info about the args + [ForeignHint] -- Extra info about the result + CmmReturnInfo + deriving Eq + +data CmmReturnInfo + = CmmMayReturn + | CmmNeverReturns + deriving ( Eq ) + +data ForeignTarget -- The target of a foreign call + = ForeignTarget -- A foreign procedure + CmmExpr -- Its address + ForeignConvention -- Its calling convention + | PrimTarget -- A possibly-side-effecting machine operation + CallishMachOp -- Which one + deriving Eq + +foreignTargetHints :: ForeignTarget -> ([ForeignHint], [ForeignHint]) +foreignTargetHints target + = ( res_hints ++ repeat NoHint + , arg_hints ++ repeat NoHint ) + where + (res_hints, arg_hints) = + case target of + PrimTarget op -> callishMachOpHints op + ForeignTarget _ (ForeignConvention _ arg_hints res_hints _) -> + (res_hints, arg_hints) + +-------------------------------------------------- +-- Instances of register and slot users / definers + +instance UserOfRegs LocalReg (CmmNode e x) where + foldRegsUsed dflags f !z n = case n of + CmmAssign _ expr -> fold f z expr + CmmStore addr rval -> fold f (fold f z addr) rval + CmmUnsafeForeignCall t _ args -> fold f (fold f z t) args + CmmCondBranch expr _ _ _ -> fold f z expr + CmmSwitch expr _ -> fold f z expr + CmmCall {cml_target=tgt} -> fold f z tgt + CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args + _ -> z + where fold :: forall a b. UserOfRegs LocalReg a + => (b -> LocalReg -> b) -> b -> a -> b + fold f z n = foldRegsUsed dflags f z n + +instance UserOfRegs GlobalReg (CmmNode e x) where + foldRegsUsed dflags f !z n = case n of + CmmAssign _ expr -> fold f z expr + CmmStore addr rval -> fold f (fold f z addr) rval + CmmUnsafeForeignCall t _ args -> fold f (fold f z t) args + CmmCondBranch expr _ _ _ -> fold f z expr + CmmSwitch expr _ -> fold f z expr + CmmCall {cml_target=tgt, cml_args_regs=args} -> fold f (fold f z args) tgt + CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args + _ -> z + where fold :: forall a b. 
UserOfRegs GlobalReg a + => (b -> GlobalReg -> b) -> b -> a -> b + fold f z n = foldRegsUsed dflags f z n + +instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r ForeignTarget where + -- The (Ord r) in the context is necessary here + -- See Note [Recursive superclasses] in TcInstDcls + foldRegsUsed _ _ !z (PrimTarget _) = z + foldRegsUsed dflags f !z (ForeignTarget e _) = foldRegsUsed dflags f z e + +instance DefinerOfRegs LocalReg (CmmNode e x) where + foldRegsDefd dflags f !z n = case n of + CmmAssign lhs _ -> fold f z lhs + CmmUnsafeForeignCall _ fs _ -> fold f z fs + CmmForeignCall {res=res} -> fold f z res + _ -> z + where fold :: forall a b. DefinerOfRegs LocalReg a + => (b -> LocalReg -> b) -> b -> a -> b + fold f z n = foldRegsDefd dflags f z n + +instance DefinerOfRegs GlobalReg (CmmNode e x) where + foldRegsDefd dflags f !z n = case n of + CmmAssign lhs _ -> fold f z lhs + CmmUnsafeForeignCall tgt _ _ -> fold f z (foreignTargetRegs tgt) + CmmCall {} -> fold f z activeRegs + CmmForeignCall {} -> fold f z activeRegs + -- See Note [Safe foreign calls clobber STG registers] + _ -> z + where fold :: forall a b. DefinerOfRegs GlobalReg a + => (b -> GlobalReg -> b) -> b -> a -> b + fold f z n = foldRegsDefd dflags f z n + + platform = targetPlatform dflags + activeRegs = activeStgRegs platform + activeCallerSavesRegs = filter (callerSaves platform) activeRegs + + foreignTargetRegs (ForeignTarget _ (ForeignConvention _ _ _ CmmNeverReturns)) = [] + foreignTargetRegs _ = activeCallerSavesRegs + +-- Note [Safe foreign calls clobber STG registers] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- During stack layout phase every safe foreign call is expanded into a block +-- that contains unsafe foreign call (instead of safe foreign call) and ends +-- with a normal call (See Note [Foreign calls]). This means that we must +-- treat safe foreign call as if it was a normal call (because eventually it +-- will be). This is important if we try to run sinking pass before stack +-- layout phase. Consider this example of what might go wrong (this is cmm +-- code from stablename001 test). Here is code after common block elimination +-- (before stack layout): +-- +-- c1q6: +-- _s1pf::P64 = R1; +-- _c1q8::I64 = performMajorGC; +-- I64[(young<c1q9> + 8)] = c1q9; +-- foreign call "ccall" arg hints: [] result hints: [] (_c1q8::I64)(...) +-- returns to c1q9 args: ([]) ress: ([])ret_args: 8ret_off: 8; +-- c1q9: +-- I64[(young<c1qb> + 8)] = c1qb; +-- R1 = _s1pc::P64; +-- call stg_makeStableName#(R1) returns to c1qb, args: 8, res: 8, upd: 8; +-- +-- If we run sinking pass now (still before stack layout) we will get this: +-- +-- c1q6: +-- I64[(young<c1q9> + 8)] = c1q9; +-- foreign call "ccall" arg hints: [] result hints: [] performMajorGC(...) +-- returns to c1q9 args: ([]) ress: ([])ret_args: 8ret_off: 8; +-- c1q9: +-- I64[(young<c1qb> + 8)] = c1qb; +-- _s1pf::P64 = R1; <------ _s1pf sunk past safe foreign call +-- R1 = _s1pc::P64; +-- call stg_makeStableName#(R1) returns to c1qb, args: 8, res: 8, upd: 8; +-- +-- Notice that _s1pf was sunk past a foreign call. 
When we run stack layout, the
+-- safe call to performMajorGC will be turned into:
+--
+-- c1q6:
+--    _s1pc::P64 = P64[Sp + 8];
+--    I64[Sp - 8] = c1q9;
+--    Sp = Sp - 8;
+--    I64[I64[CurrentTSO + 24] + 16] = Sp;
+--    P64[CurrentNursery + 8] = Hp + 8;
+--    (_u1qI::I64) = call "ccall" arg hints:  [PtrHint,]
+--                        result hints:  [PtrHint] suspendThread(BaseReg, 0);
+--    call "ccall" arg hints:  []  result hints:  [] performMajorGC();
+--    (_u1qJ::I64) = call "ccall" arg hints:  [PtrHint]
+--                        result hints:  [PtrHint] resumeThread(_u1qI::I64);
+--    BaseReg = _u1qJ::I64;
+--    _u1qK::P64 = CurrentTSO;
+--    _u1qL::P64 = I64[_u1qK::P64 + 24];
+--    Sp = I64[_u1qL::P64 + 16];
+--    SpLim = _u1qL::P64 + 192;
+--    HpAlloc = 0;
+--    Hp = I64[CurrentNursery + 8] - 8;
+--    HpLim = I64[CurrentNursery] + (%MO_SS_Conv_W32_W64(I32[CurrentNursery + 48]) * 4096 - 1);
+--    call (I64[Sp])() returns to c1q9, args: 8, res: 8, upd: 8;
+-- c1q9:
+--    I64[(young<c1qb> + 8)] = c1qb;
+--    _s1pf::P64 = R1;         <------ INCORRECT!
+--    R1 = _s1pc::P64;
+--    call stg_makeStableName#(R1) returns to c1qb, args: 8, res: 8, upd: 8;
+--
+-- Notice that c1q6 now ends with a call.  Sinking _s1pf::P64 = R1 past that
+-- call is clearly incorrect.  This is what would happen if we assumed that
+-- a safe foreign call has the same semantics as an unsafe foreign call.  To
+-- prevent this we need to treat a safe foreign call as if it was a normal call.
+
+-----------------------------------
+-- mapping Expr in GHC.Cmm.Node
+
+mapForeignTarget :: (CmmExpr -> CmmExpr) -> ForeignTarget -> ForeignTarget
+mapForeignTarget exp (ForeignTarget e c) = ForeignTarget (exp e) c
+mapForeignTarget _   m@(PrimTarget _)    = m
+
+wrapRecExp :: (CmmExpr -> CmmExpr) -> CmmExpr -> CmmExpr
+-- Take a transformer on expressions and apply it recursively.
+-- (wrapRecExp f e) first recursively applies itself to sub-expressions of e
+-- then uses f to rewrite the resulting expression
+wrapRecExp f (CmmMachOp op es) = f (CmmMachOp op $ map (wrapRecExp f) es)
+wrapRecExp f (CmmLoad addr ty) = f (CmmLoad (wrapRecExp f addr) ty)
+wrapRecExp f e                 = f e
+
+mapExp :: (CmmExpr -> CmmExpr) -> CmmNode e x -> CmmNode e x
+mapExp _ f@(CmmEntry{})                     = f
+mapExp _ m@(CmmComment _)                   = m
+mapExp _ m@(CmmTick _)                      = m
+mapExp f   (CmmUnwind regs)                 = CmmUnwind (map (fmap (fmap f)) regs)
+mapExp f   (CmmAssign r e)                  = CmmAssign r (f e)
+mapExp f   (CmmStore addr e)                = CmmStore (f addr) (f e)
+mapExp f   (CmmUnsafeForeignCall tgt fs as) = CmmUnsafeForeignCall (mapForeignTarget f tgt) fs (map f as)
+mapExp _ l@(CmmBranch _)                    = l
+mapExp f   (CmmCondBranch e ti fi l)        = CmmCondBranch (f e) ti fi l
+mapExp f   (CmmSwitch e ids)                = CmmSwitch (f e) ids
+mapExp f n@CmmCall {cml_target=tgt}         = n{cml_target = f tgt}
+mapExp f   (CmmForeignCall tgt fs as succ ret_args updfr intrbl) = CmmForeignCall (mapForeignTarget f tgt) fs (map f as) succ ret_args updfr intrbl
+
+mapExpDeep :: (CmmExpr -> CmmExpr) -> CmmNode e x -> CmmNode e x
+mapExpDeep f = mapExp $ wrapRecExp f
+
+------------------------------------------------------------------------
+-- mapping Expr in GHC.Cmm.Node, but not performing allocation if no changes
+
+mapForeignTargetM :: (CmmExpr -> Maybe CmmExpr) -> ForeignTarget -> Maybe ForeignTarget
+mapForeignTargetM f (ForeignTarget e c) = (\x -> ForeignTarget x c) `fmap` f e
+mapForeignTargetM _ (PrimTarget _)      = Nothing
+
+wrapRecExpM :: (CmmExpr -> Maybe CmmExpr) -> (CmmExpr -> Maybe CmmExpr)
+-- (wrapRecExpM f e) first recursively applies itself to sub-expressions of e
+-- then gives f a chance to rewrite the resulting expression
+wrapRecExpM f n@(CmmMachOp
op es) = maybe (f n) (f . CmmMachOp op) (mapListM (wrapRecExpM f) es) +wrapRecExpM f n@(CmmLoad addr ty) = maybe (f n) (f . flip CmmLoad ty) (wrapRecExpM f addr) +wrapRecExpM f e = f e + +mapExpM :: (CmmExpr -> Maybe CmmExpr) -> CmmNode e x -> Maybe (CmmNode e x) +mapExpM _ (CmmEntry{}) = Nothing +mapExpM _ (CmmComment _) = Nothing +mapExpM _ (CmmTick _) = Nothing +mapExpM f (CmmUnwind regs) = CmmUnwind `fmap` mapM (\(r,e) -> mapM f e >>= \e' -> pure (r,e')) regs +mapExpM f (CmmAssign r e) = CmmAssign r `fmap` f e +mapExpM f (CmmStore addr e) = (\[addr', e'] -> CmmStore addr' e') `fmap` mapListM f [addr, e] +mapExpM _ (CmmBranch _) = Nothing +mapExpM f (CmmCondBranch e ti fi l) = (\x -> CmmCondBranch x ti fi l) `fmap` f e +mapExpM f (CmmSwitch e tbl) = (\x -> CmmSwitch x tbl) `fmap` f e +mapExpM f (CmmCall tgt mb_id r o i s) = (\x -> CmmCall x mb_id r o i s) `fmap` f tgt +mapExpM f (CmmUnsafeForeignCall tgt fs as) + = case mapForeignTargetM f tgt of + Just tgt' -> Just (CmmUnsafeForeignCall tgt' fs (mapListJ f as)) + Nothing -> (\xs -> CmmUnsafeForeignCall tgt fs xs) `fmap` mapListM f as +mapExpM f (CmmForeignCall tgt fs as succ ret_args updfr intrbl) + = case mapForeignTargetM f tgt of + Just tgt' -> Just (CmmForeignCall tgt' fs (mapListJ f as) succ ret_args updfr intrbl) + Nothing -> (\xs -> CmmForeignCall tgt fs xs succ ret_args updfr intrbl) `fmap` mapListM f as + +-- share as much as possible +mapListM :: (a -> Maybe a) -> [a] -> Maybe [a] +mapListM f xs = let (b, r) = mapListT f xs + in if b then Just r else Nothing + +mapListJ :: (a -> Maybe a) -> [a] -> [a] +mapListJ f xs = snd (mapListT f xs) + +mapListT :: (a -> Maybe a) -> [a] -> (Bool, [a]) +mapListT f xs = foldr g (False, []) (zip3 (tails xs) xs (map f xs)) + where g (_, y, Nothing) (True, ys) = (True, y:ys) + g (_, _, Just y) (True, ys) = (True, y:ys) + g (ys', _, Nothing) (False, _) = (False, ys') + g (_, _, Just y) (False, ys) = (True, y:ys) + +mapExpDeepM :: (CmmExpr -> Maybe CmmExpr) -> CmmNode e x -> Maybe (CmmNode e x) +mapExpDeepM f = mapExpM $ wrapRecExpM f + +----------------------------------- +-- folding Expr in GHC.Cmm.Node + +foldExpForeignTarget :: (CmmExpr -> z -> z) -> ForeignTarget -> z -> z +foldExpForeignTarget exp (ForeignTarget e _) z = exp e z +foldExpForeignTarget _ (PrimTarget _) z = z + +-- Take a folder on expressions and apply it recursively. +-- Specifically (wrapRecExpf f e z) deals with CmmMachOp and CmmLoad +-- itself, delegating all the other CmmExpr forms to 'f'. 
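+--
+-- A hypothetical use (a sketch, not part of this patch): collecting every
+-- literal mentioned in a node, including literals nested inside loads and
+-- MachOp arguments:
+--
+--      nodeLits :: CmmNode e x -> [CmmLit]
+--      nodeLits node = foldExpDeep addLit node []
+--        where addLit (CmmLit l) acc = l : acc
+--              addLit _          acc = acc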
+wrapRecExpf :: (CmmExpr -> z -> z) -> CmmExpr -> z -> z +wrapRecExpf f e@(CmmMachOp _ es) z = foldr (wrapRecExpf f) (f e z) es +wrapRecExpf f e@(CmmLoad addr _) z = wrapRecExpf f addr (f e z) +wrapRecExpf f e z = f e z + +foldExp :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z +foldExp _ (CmmEntry {}) z = z +foldExp _ (CmmComment {}) z = z +foldExp _ (CmmTick {}) z = z +foldExp f (CmmUnwind xs) z = foldr (maybe id f) z (map snd xs) +foldExp f (CmmAssign _ e) z = f e z +foldExp f (CmmStore addr e) z = f addr $ f e z +foldExp f (CmmUnsafeForeignCall t _ as) z = foldr f (foldExpForeignTarget f t z) as +foldExp _ (CmmBranch _) z = z +foldExp f (CmmCondBranch e _ _ _) z = f e z +foldExp f (CmmSwitch e _) z = f e z +foldExp f (CmmCall {cml_target=tgt}) z = f tgt z +foldExp f (CmmForeignCall {tgt=tgt, args=args}) z = foldr f (foldExpForeignTarget f tgt z) args + +foldExpDeep :: (CmmExpr -> z -> z) -> CmmNode e x -> z -> z +foldExpDeep f = foldExp (wrapRecExpf f) + +-- ----------------------------------------------------------------------------- + +mapSuccessors :: (Label -> Label) -> CmmNode O C -> CmmNode O C +mapSuccessors f (CmmBranch bid) = CmmBranch (f bid) +mapSuccessors f (CmmCondBranch p y n l) = CmmCondBranch p (f y) (f n) l +mapSuccessors f (CmmSwitch e ids) = CmmSwitch e (mapSwitchTargets f ids) +mapSuccessors _ n = n + +mapCollectSuccessors :: forall a. (Label -> (Label,a)) -> CmmNode O C + -> (CmmNode O C, [a]) +mapCollectSuccessors f (CmmBranch bid) + = let (bid', acc) = f bid in (CmmBranch bid', [acc]) +mapCollectSuccessors f (CmmCondBranch p y n l) + = let (bidt, acct) = f y + (bidf, accf) = f n + in (CmmCondBranch p bidt bidf l, [accf, acct]) +mapCollectSuccessors f (CmmSwitch e ids) + = let lbls = switchTargetsToList ids :: [Label] + lblMap = mapFromList $ zip lbls (map f lbls) :: LabelMap (Label, a) + in ( CmmSwitch e + (mapSwitchTargets + (\l -> fst $ mapFindWithDefault (error "impossible") l lblMap) ids) + , map snd (mapElems lblMap) + ) +mapCollectSuccessors _ n = (n, []) + +-- ----------------------------------------------------------------------------- + +-- | Tickish in Cmm context (annotations only) +type CmmTickish = Tickish () + +-- | Tick scope identifier, allowing us to reason about what +-- annotations in a Cmm block should scope over. We especially take +-- care to allow optimisations to reorganise blocks without losing +-- tick association in the process. +data CmmTickScope + = GlobalScope + -- ^ The global scope is the "root" of the scope graph. Every + -- scope is a sub-scope of the global scope. It doesn't make sense + -- to add ticks to this scope. On the other hand, this means that + -- setting this scope on a block means no ticks apply to it. + + | SubScope !U.Unique CmmTickScope + -- ^ Constructs a new sub-scope to an existing scope. This allows + -- us to translate Core-style scoping rules (see @tickishScoped@) + -- into the Cmm world. Suppose the following code: + -- + -- tick<1> case ... of + -- A -> tick<2> ... + -- B -> tick<3> ... + -- + -- We want the top-level tick annotation to apply to blocks + -- generated for the A and B alternatives. We can achieve that by + -- generating tick<1> into a block with scope a, while the code + -- for alternatives A and B gets generated into sub-scopes a/b and + -- a/c respectively. + + | CombinedScope CmmTickScope CmmTickScope + -- ^ A combined scope scopes over everything that the two given + -- scopes cover. It is therefore a sub-scope of either scope. This + -- is required for optimisations. 
Consider common block elimination:
+ --
+ --     A -> tick<2> case ... of
+ --            C -> [common]
+ --     B -> tick<3> case ... of
+ --            D -> [common]
+ --
+ -- We will generate code for the C and D alternatives, and figure
+ -- out afterwards that it's actually common code. Scoping rules
+ -- dictate that the resulting common block needs to be covered by
+ -- both tick<2> and tick<3>, therefore we need to construct a
+ -- scope that is a child of *both* scopes. Combined scopes let us
+ -- do exactly that: if we assign the scopes a/c and b/d to the
+ -- common-ed up blocks, the new block can get the combined tick
+ -- scope a/c+b/d, which both tick<2> and tick<3> apply to.
+
+-- Note [CmmTick scoping details]:
+--
+-- The scope of a @CmmTick@ is given by the @CmmEntry@ node of the
+-- same block. Note that as a result of this, optimisations making
+-- tick scopes more specific can *reduce* the amount of code a tick
+-- scopes over. Fixing this would require a separate @CmmTickScope@
+-- field for @CmmTick@. Right now we do not do this simply because I
+-- couldn't find an example where it actually mattered -- multiple
+-- blocks within the same scope generally jump to each other, which
+-- prevents common block elimination from happening in the first
+-- place. But this is not a strong reason, so if Cmm optimisations
+-- become more involved in future this might have to be revisited.
+
+-- | Output all scope paths.
+scopeToPaths :: CmmTickScope -> [[U.Unique]]
+scopeToPaths GlobalScope = [[]]
+scopeToPaths (SubScope u s) = map (u:) (scopeToPaths s)
+scopeToPaths (CombinedScope s1 s2) = scopeToPaths s1 ++ scopeToPaths s2
+
+-- | Returns the head uniques of the scopes. This is based on the
+-- assumption that the @Unique@ of @SubScope@ identifies the
+-- underlying super-scope. Used for efficient equality and comparison,
+-- see below.
+scopeUniques :: CmmTickScope -> [U.Unique]
+scopeUniques GlobalScope = []
+scopeUniques (SubScope u _) = [u]
+scopeUniques (CombinedScope s1 s2) = scopeUniques s1 ++ scopeUniques s2
+
+-- Equality and ordering are based on the head uniques defined above.
+-- We take care to short-cut the (extremely) common cases.
+instance Eq CmmTickScope where
+  GlobalScope    == GlobalScope     = True
+  GlobalScope    == _               = False
+  _              == GlobalScope     = False
+  (SubScope u _) == (SubScope u' _) = u == u'
+  (SubScope _ _) == _               = False
+  _              == (SubScope _ _)  = False
+  scope          == scope'          =
+    sortBy nonDetCmpUnique (scopeUniques scope) ==
+    sortBy nonDetCmpUnique (scopeUniques scope')
+    -- This is still deterministic because
+    -- the order is the same for equal lists
+
+-- This is non-deterministic but we do not currently support deterministic
+-- code-generation. See Note [Unique Determinism and code generation]
+-- See Note [No Ord for Unique]
+instance Ord CmmTickScope where
+  compare GlobalScope GlobalScope = EQ
+  compare GlobalScope _           = LT
+  compare _ GlobalScope           = GT
+  compare (SubScope u _) (SubScope u' _) = nonDetCmpUnique u u'
+  compare scope scope' = cmpList nonDetCmpUnique
+    (sortBy nonDetCmpUnique $ scopeUniques scope)
+    (sortBy nonDetCmpUnique $ scopeUniques scope')
+
+instance Outputable CmmTickScope where
+  ppr GlobalScope     = text "global"
+  ppr (SubScope us GlobalScope)
+                      = ppr us
+  ppr (SubScope us s) = ppr s <> char '/' <> ppr us
+  ppr combined        = parens $ hcat $ punctuate (char '+') $
+                        map (hcat . punctuate (char '/') . map ppr . reverse) $
+                        scopeToPaths combined
+
+-- | Checks whether two tick scopes are sub-scopes of each other. True
+-- if the two scopes are equal.
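+--
+-- For example (with hypothetical uniques a, b, c, d, writing a/c for
+-- SubScope c (SubScope a GlobalScope) and similarly b/d):
+--
+--   isTickSubScope (a/c) (SubScope a GlobalScope)     == True
+--   isTickSubScope (SubScope a GlobalScope) (a/c)     == False
+--   isTickSubScope (a/c) (CombinedScope (a/c) (b/d))  == True
+--
+-- i.e. sub-scopes sit inside their parents, and each part of a
+-- combined scope sits inside the combination.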
+isTickSubScope :: CmmTickScope -> CmmTickScope -> Bool +isTickSubScope = cmp + where cmp _ GlobalScope = True + cmp GlobalScope _ = False + cmp (CombinedScope s1 s2) s' = cmp s1 s' && cmp s2 s' + cmp s (CombinedScope s1' s2') = cmp s s1' || cmp s s2' + cmp (SubScope u s) s'@(SubScope u' _) = u == u' || cmp s s' + +-- | Combine two tick scopes. The new scope should be sub-scope of +-- both parameters. We simplify automatically if one tick scope is a +-- sub-scope of the other already. +combineTickScopes :: CmmTickScope -> CmmTickScope -> CmmTickScope +combineTickScopes s1 s2 + | s1 `isTickSubScope` s2 = s1 + | s2 `isTickSubScope` s1 = s2 + | otherwise = CombinedScope s1 s2 diff --git a/compiler/GHC/Cmm/Opt.hs b/compiler/GHC/Cmm/Opt.hs new file mode 100644 index 0000000000..1db37ae58c --- /dev/null +++ b/compiler/GHC/Cmm/Opt.hs @@ -0,0 +1,423 @@ +----------------------------------------------------------------------------- +-- +-- Cmm optimisation +-- +-- (c) The University of Glasgow 2006 +-- +----------------------------------------------------------------------------- + +module GHC.Cmm.Opt ( + constantFoldNode, + constantFoldExpr, + cmmMachOpFold, + cmmMachOpFoldM + ) where + +import GhcPrelude + +import GHC.Cmm.Utils +import GHC.Cmm +import DynFlags +import Util + +import Outputable +import GHC.Platform + +import Data.Bits +import Data.Maybe + + +constantFoldNode :: DynFlags -> CmmNode e x -> CmmNode e x +constantFoldNode dflags = mapExp (constantFoldExpr dflags) + +constantFoldExpr :: DynFlags -> CmmExpr -> CmmExpr +constantFoldExpr dflags = wrapRecExp f + where f (CmmMachOp op args) = cmmMachOpFold dflags op args + f (CmmRegOff r 0) = CmmReg r + f e = e + +-- ----------------------------------------------------------------------------- +-- MachOp constant folder + +-- Now, try to constant-fold the MachOps. The arguments have already +-- been optimized and folded. + +cmmMachOpFold + :: DynFlags + -> MachOp -- The operation from an CmmMachOp + -> [CmmExpr] -- The optimized arguments + -> CmmExpr + +cmmMachOpFold dflags op args = fromMaybe (CmmMachOp op args) (cmmMachOpFoldM dflags op args) + +-- Returns Nothing if no changes, useful for Hoopl, also reduces +-- allocation! +cmmMachOpFoldM + :: DynFlags + -> MachOp + -> [CmmExpr] + -> Maybe CmmExpr + +cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)] + = Just $ case op of + MO_S_Neg _ -> CmmLit (CmmInt (-x) rep) + MO_Not _ -> CmmLit (CmmInt (complement x) rep) + + -- these are interesting: we must first narrow to the + -- "from" type, in order to truncate to the correct size. + -- The final narrow/widen to the destination type + -- is implicit in the CmmLit. 
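+  -- A worked instance (illustrative): folding MO_SS_Conv W8 W32 over
+  -- the literal 255 first narrows at the source width, and
+  -- narrowS W8 255 == -1, so the clause below produces
+  -- CmmLit (CmmInt (-1) W32): the sign-extension to the wider type
+  -- falls out of re-reading the narrowed literal at W32.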
+ MO_SF_Conv _from to -> CmmLit (CmmFloat (fromInteger x) to) + MO_SS_Conv from to -> CmmLit (CmmInt (narrowS from x) to) + MO_UU_Conv from to -> CmmLit (CmmInt (narrowU from x) to) + + _ -> panic $ "cmmMachOpFoldM: unknown unary op: " ++ show op + + +-- Eliminate conversion NOPs +cmmMachOpFoldM _ (MO_SS_Conv rep1 rep2) [x] | rep1 == rep2 = Just x +cmmMachOpFoldM _ (MO_UU_Conv rep1 rep2) [x] | rep1 == rep2 = Just x + +-- Eliminate nested conversions where possible +cmmMachOpFoldM dflags conv_outer [CmmMachOp conv_inner [x]] + | Just (rep1,rep2,signed1) <- isIntConversion conv_inner, + Just (_, rep3,signed2) <- isIntConversion conv_outer + = case () of + -- widen then narrow to the same size is a nop + _ | rep1 < rep2 && rep1 == rep3 -> Just x + -- Widen then narrow to different size: collapse to single conversion + -- but remember to use the signedness from the widening, just in case + -- the final conversion is a widen. + | rep1 < rep2 && rep2 > rep3 -> + Just $ cmmMachOpFold dflags (intconv signed1 rep1 rep3) [x] + -- Nested widenings: collapse if the signedness is the same + | rep1 < rep2 && rep2 < rep3 && signed1 == signed2 -> + Just $ cmmMachOpFold dflags (intconv signed1 rep1 rep3) [x] + -- Nested narrowings: collapse + | rep1 > rep2 && rep2 > rep3 -> + Just $ cmmMachOpFold dflags (MO_UU_Conv rep1 rep3) [x] + | otherwise -> + Nothing + where + isIntConversion (MO_UU_Conv rep1 rep2) + = Just (rep1,rep2,False) + isIntConversion (MO_SS_Conv rep1 rep2) + = Just (rep1,rep2,True) + isIntConversion _ = Nothing + + intconv True = MO_SS_Conv + intconv False = MO_UU_Conv + +-- ToDo: a narrow of a load can be collapsed into a narrow load, right? +-- but what if the architecture only supports word-sized loads, should +-- we do the transformation anyway? + +cmmMachOpFoldM dflags mop [CmmLit (CmmInt x xrep), CmmLit (CmmInt y _)] + = case mop of + -- for comparisons: don't forget to narrow the arguments before + -- comparing, since they might be out of range. + MO_Eq _ -> Just $ CmmLit (CmmInt (if x_u == y_u then 1 else 0) (wordWidth dflags)) + MO_Ne _ -> Just $ CmmLit (CmmInt (if x_u /= y_u then 1 else 0) (wordWidth dflags)) + + MO_U_Gt _ -> Just $ CmmLit (CmmInt (if x_u > y_u then 1 else 0) (wordWidth dflags)) + MO_U_Ge _ -> Just $ CmmLit (CmmInt (if x_u >= y_u then 1 else 0) (wordWidth dflags)) + MO_U_Lt _ -> Just $ CmmLit (CmmInt (if x_u < y_u then 1 else 0) (wordWidth dflags)) + MO_U_Le _ -> Just $ CmmLit (CmmInt (if x_u <= y_u then 1 else 0) (wordWidth dflags)) + + MO_S_Gt _ -> Just $ CmmLit (CmmInt (if x_s > y_s then 1 else 0) (wordWidth dflags)) + MO_S_Ge _ -> Just $ CmmLit (CmmInt (if x_s >= y_s then 1 else 0) (wordWidth dflags)) + MO_S_Lt _ -> Just $ CmmLit (CmmInt (if x_s < y_s then 1 else 0) (wordWidth dflags)) + MO_S_Le _ -> Just $ CmmLit (CmmInt (if x_s <= y_s then 1 else 0) (wordWidth dflags)) + + MO_Add r -> Just $ CmmLit (CmmInt (x + y) r) + MO_Sub r -> Just $ CmmLit (CmmInt (x - y) r) + MO_Mul r -> Just $ CmmLit (CmmInt (x * y) r) + MO_U_Quot r | y /= 0 -> Just $ CmmLit (CmmInt (x_u `quot` y_u) r) + MO_U_Rem r | y /= 0 -> Just $ CmmLit (CmmInt (x_u `rem` y_u) r) + MO_S_Quot r | y /= 0 -> Just $ CmmLit (CmmInt (x `quot` y) r) + MO_S_Rem r | y /= 0 -> Just $ CmmLit (CmmInt (x `rem` y) r) + + MO_And r -> Just $ CmmLit (CmmInt (x .&. y) r) + MO_Or r -> Just $ CmmLit (CmmInt (x .|. 
y) r) + MO_Xor r -> Just $ CmmLit (CmmInt (x `xor` y) r) + + MO_Shl r -> Just $ CmmLit (CmmInt (x `shiftL` fromIntegral y) r) + MO_U_Shr r -> Just $ CmmLit (CmmInt (x_u `shiftR` fromIntegral y) r) + MO_S_Shr r -> Just $ CmmLit (CmmInt (x `shiftR` fromIntegral y) r) + + _ -> Nothing + + where + x_u = narrowU xrep x + y_u = narrowU xrep y + x_s = narrowS xrep x + y_s = narrowS xrep y + + +-- When possible, shift the constants to the right-hand side, so that we +-- can match for strength reductions. Note that the code generator will +-- also assume that constants have been shifted to the right when +-- possible. + +cmmMachOpFoldM dflags op [x@(CmmLit _), y] + | not (isLit y) && isCommutableMachOp op + = Just (cmmMachOpFold dflags op [y, x]) + +-- Turn (a+b)+c into a+(b+c) where possible. Because literals are +-- moved to the right, it is more likely that we will find +-- opportunities for constant folding when the expression is +-- right-associated. +-- +-- ToDo: this appears to introduce a quadratic behaviour due to the +-- nested cmmMachOpFold. Can we fix this? +-- +-- Why do we check isLit arg1? If arg1 is a lit, it means that arg2 +-- is also a lit (otherwise arg1 would be on the right). If we +-- put arg1 on the left of the rearranged expression, we'll get into a +-- loop: (x1+x2)+x3 => x1+(x2+x3) => (x2+x3)+x1 => x2+(x3+x1) ... +-- +-- Also don't do it if arg1 is PicBaseReg, so that we don't separate the +-- PicBaseReg from the corresponding label (or label difference). +-- +cmmMachOpFoldM dflags mop1 [CmmMachOp mop2 [arg1,arg2], arg3] + | mop2 `associates_with` mop1 + && not (isLit arg1) && not (isPicReg arg1) + = Just (cmmMachOpFold dflags mop2 [arg1, cmmMachOpFold dflags mop1 [arg2,arg3]]) + where + MO_Add{} `associates_with` MO_Sub{} = True + mop1 `associates_with` mop2 = + mop1 == mop2 && isAssociativeMachOp mop1 + +-- special case: (a - b) + c ==> a + (c - b) +cmmMachOpFoldM dflags mop1@(MO_Add{}) [CmmMachOp mop2@(MO_Sub{}) [arg1,arg2], arg3] + | not (isLit arg1) && not (isPicReg arg1) + = Just (cmmMachOpFold dflags mop1 [arg1, cmmMachOpFold dflags mop2 [arg3,arg2]]) + +-- special case: (PicBaseReg + lit) + N ==> PicBaseReg + (lit+N) +-- +-- this is better because lit+N is a single link-time constant (e.g. a +-- CmmLabelOff), so the right-hand expression needs only one +-- instruction, whereas the left needs two. This happens when pointer +-- tagging gives us label+offset, and PIC turns the label into +-- PicBaseReg + label. 
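+-- A concrete shape of this (illustrative; 'lbl' is a placeholder label):
+-- pointer tagging may leave us with
+--      (PicBaseReg + lbl) + 2
+-- which the equation below reassociates to
+--      PicBaseReg + (lbl + 2)
+-- so the parenthesised part folds to a single CmmLabelOff.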
+-- +cmmMachOpFoldM _ MO_Add{} [ CmmMachOp op@MO_Add{} [pic, CmmLit lit] + , CmmLit (CmmInt n rep) ] + | isPicReg pic + = Just $ CmmMachOp op [pic, CmmLit $ cmmOffsetLit lit off ] + where off = fromIntegral (narrowS rep n) + +-- Make a RegOff if we can +cmmMachOpFoldM _ (MO_Add _) [CmmReg reg, CmmLit (CmmInt n rep)] + = Just $ cmmRegOff reg (fromIntegral (narrowS rep n)) +cmmMachOpFoldM _ (MO_Add _) [CmmRegOff reg off, CmmLit (CmmInt n rep)] + = Just $ cmmRegOff reg (off + fromIntegral (narrowS rep n)) +cmmMachOpFoldM _ (MO_Sub _) [CmmReg reg, CmmLit (CmmInt n rep)] + = Just $ cmmRegOff reg (- fromIntegral (narrowS rep n)) +cmmMachOpFoldM _ (MO_Sub _) [CmmRegOff reg off, CmmLit (CmmInt n rep)] + = Just $ cmmRegOff reg (off - fromIntegral (narrowS rep n)) + +-- Fold label(+/-)offset into a CmmLit where possible + +cmmMachOpFoldM _ (MO_Add _) [CmmLit lit, CmmLit (CmmInt i rep)] + = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i))) +cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmInt i rep), CmmLit lit] + = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i))) +cmmMachOpFoldM _ (MO_Sub _) [CmmLit lit, CmmLit (CmmInt i rep)] + = Just $ CmmLit (cmmOffsetLit lit (fromIntegral (negate (narrowU rep i)))) + + +-- Comparison of literal with widened operand: perform the comparison +-- at the smaller width, as long as the literal is within range. + +-- We can't do the reverse trick, when the operand is narrowed: +-- narrowing throws away bits from the operand, there's no way to do +-- the same comparison at the larger size. + +cmmMachOpFoldM dflags cmp [CmmMachOp conv [x], CmmLit (CmmInt i _)] + | -- powerPC NCG has a TODO for I8/I16 comparisons, so don't try + platformArch (targetPlatform dflags) `elem` [ArchX86, ArchX86_64], + -- if the operand is widened: + Just (rep, signed, narrow_fn) <- maybe_conversion conv, + -- and this is a comparison operation: + Just narrow_cmp <- maybe_comparison cmp rep signed, + -- and the literal fits in the smaller size: + i == narrow_fn rep i + -- then we can do the comparison at the smaller size + = Just (cmmMachOpFold dflags narrow_cmp [x, CmmLit (CmmInt i rep)]) + where + maybe_conversion (MO_UU_Conv from to) + | to > from + = Just (from, False, narrowU) + maybe_conversion (MO_SS_Conv from to) + | to > from + = Just (from, True, narrowS) + + -- don't attempt to apply this optimisation when the source + -- is a float; see #1916 + maybe_conversion _ = Nothing + + -- careful (#2080): if the original comparison was signed, but + -- we were doing an unsigned widen, then we must do an + -- unsigned comparison at the smaller size. + maybe_comparison (MO_U_Gt _) rep _ = Just (MO_U_Gt rep) + maybe_comparison (MO_U_Ge _) rep _ = Just (MO_U_Ge rep) + maybe_comparison (MO_U_Lt _) rep _ = Just (MO_U_Lt rep) + maybe_comparison (MO_U_Le _) rep _ = Just (MO_U_Le rep) + maybe_comparison (MO_Eq _) rep _ = Just (MO_Eq rep) + maybe_comparison (MO_S_Gt _) rep True = Just (MO_S_Gt rep) + maybe_comparison (MO_S_Ge _) rep True = Just (MO_S_Ge rep) + maybe_comparison (MO_S_Lt _) rep True = Just (MO_S_Lt rep) + maybe_comparison (MO_S_Le _) rep True = Just (MO_S_Le rep) + maybe_comparison (MO_S_Gt _) rep False = Just (MO_U_Gt rep) + maybe_comparison (MO_S_Ge _) rep False = Just (MO_U_Ge rep) + maybe_comparison (MO_S_Lt _) rep False = Just (MO_U_Lt rep) + maybe_comparison (MO_S_Le _) rep False = Just (MO_U_Le rep) + maybe_comparison _ _ _ = Nothing + +-- We can often do something with constants of 0 and 1 ... 
+-- See Note [Comparison operators] + +cmmMachOpFoldM dflags mop [x, y@(CmmLit (CmmInt 0 _))] + = case mop of + -- Arithmetic + MO_Add _ -> Just x -- x + 0 = x + MO_Sub _ -> Just x -- x - 0 = x + MO_Mul _ -> Just y -- x * 0 = 0 + + -- Logical operations + MO_And _ -> Just y -- x & 0 = 0 + MO_Or _ -> Just x -- x | 0 = x + MO_Xor _ -> Just x -- x `xor` 0 = x + + -- Shifts + MO_Shl _ -> Just x -- x << 0 = x + MO_S_Shr _ -> Just x -- ditto shift-right + MO_U_Shr _ -> Just x + + -- Comparisons; these ones are trickier + -- See Note [Comparison operators] + MO_Ne _ | isComparisonExpr x -> Just x -- (x > y) != 0 = x > y + MO_Eq _ | Just x' <- maybeInvertCmmExpr x -> Just x' -- (x > y) == 0 = x <= y + MO_U_Gt _ | isComparisonExpr x -> Just x -- (x > y) > 0 = x > y + MO_S_Gt _ | isComparisonExpr x -> Just x -- ditto + MO_U_Lt _ | isComparisonExpr x -> Just zero -- (x > y) < 0 = 0 + MO_S_Lt _ | isComparisonExpr x -> Just zero + MO_U_Ge _ | isComparisonExpr x -> Just one -- (x > y) >= 0 = 1 + MO_S_Ge _ | isComparisonExpr x -> Just one + + MO_U_Le _ | Just x' <- maybeInvertCmmExpr x -> Just x' -- (x > y) <= 0 = x <= y + MO_S_Le _ | Just x' <- maybeInvertCmmExpr x -> Just x' + _ -> Nothing + where + zero = CmmLit (CmmInt 0 (wordWidth dflags)) + one = CmmLit (CmmInt 1 (wordWidth dflags)) + +cmmMachOpFoldM dflags mop [x, (CmmLit (CmmInt 1 rep))] + = case mop of + -- Arithmetic: x*1 = x, etc + MO_Mul _ -> Just x + MO_S_Quot _ -> Just x + MO_U_Quot _ -> Just x + MO_S_Rem _ -> Just $ CmmLit (CmmInt 0 rep) + MO_U_Rem _ -> Just $ CmmLit (CmmInt 0 rep) + + -- Comparisons; trickier + -- See Note [Comparison operators] + MO_Ne _ | Just x' <- maybeInvertCmmExpr x -> Just x' -- (x>y) != 1 = x<=y + MO_Eq _ | isComparisonExpr x -> Just x -- (x>y) == 1 = x>y + MO_U_Lt _ | Just x' <- maybeInvertCmmExpr x -> Just x' -- (x>y) < 1 = x<=y + MO_S_Lt _ | Just x' <- maybeInvertCmmExpr x -> Just x' -- ditto + MO_U_Gt _ | isComparisonExpr x -> Just zero -- (x>y) > 1 = 0 + MO_S_Gt _ | isComparisonExpr x -> Just zero + MO_U_Le _ | isComparisonExpr x -> Just one -- (x>y) <= 1 = 1 + MO_S_Le _ | isComparisonExpr x -> Just one + MO_U_Ge _ | isComparisonExpr x -> Just x -- (x>y) >= 1 = x>y + MO_S_Ge _ | isComparisonExpr x -> Just x + _ -> Nothing + where + zero = CmmLit (CmmInt 0 (wordWidth dflags)) + one = CmmLit (CmmInt 1 (wordWidth dflags)) + +-- Now look for multiplication/division by powers of 2 (integers). + +cmmMachOpFoldM dflags mop [x, (CmmLit (CmmInt n _))] + = case mop of + MO_Mul rep + | Just p <- exactLog2 n -> + Just (cmmMachOpFold dflags (MO_Shl rep) [x, CmmLit (CmmInt p rep)]) + MO_U_Quot rep + | Just p <- exactLog2 n -> + Just (cmmMachOpFold dflags (MO_U_Shr rep) [x, CmmLit (CmmInt p rep)]) + MO_U_Rem rep + | Just _ <- exactLog2 n -> + Just (cmmMachOpFold dflags (MO_And rep) [x, CmmLit (CmmInt (n - 1) rep)]) + MO_S_Quot rep + | Just p <- exactLog2 n, + CmmReg _ <- x -> -- We duplicate x in signedQuotRemHelper, hence require + -- it is a reg. FIXME: remove this restriction. + Just (cmmMachOpFold dflags (MO_S_Shr rep) + [signedQuotRemHelper rep p, CmmLit (CmmInt p rep)]) + MO_S_Rem rep + | Just p <- exactLog2 n, + CmmReg _ <- x -> -- We duplicate x in signedQuotRemHelper, hence require + -- it is a reg. FIXME: remove this restriction. + -- We replace (x `rem` 2^p) by (x - (x `quot` 2^p) * 2^p). + -- Moreover, we fuse MO_S_Shr (last operation of MO_S_Quot) + -- and MO_S_Shl (multiplication by 2^p) into a single MO_And operation. 
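+  -- A worked instance of the fused form (illustrative): for x `rem` 8
+  -- (p = 3) the rewrite below produces, schematically,
+  --      x - ((x + bias) .&. (-8))
+  -- where bias (built by signedQuotRemHelper) is 7 for negative x and
+  -- 0 otherwise. E.g. x = -13: (-13 + 7) .&. (-8) == -8, and
+  -- -13 - (-8) == -5 == -13 `rem` 8.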
+        Just (cmmMachOpFold dflags (MO_Sub rep)
+                [x, cmmMachOpFold dflags (MO_And rep)
+                      [signedQuotRemHelper rep p, CmmLit (CmmInt (- n) rep)]])
+    _ -> Nothing
+  where
+    -- In contrast with unsigned integers, for signed ones
+    -- shift right is not the same as quot, because it rounds
+    -- to minus infinity, whereas quot rounds toward zero.
+    -- To fix this up, we add one less than the divisor to the
+    -- dividend if it is a negative number.
+    --
+    -- to avoid a test/jump, we use the following sequence:
+    --      x1 = x >> word_size-1  (all 1s if -ve, all 0s if +ve)
+    --      x2 = x1 & (divisor-1)
+    --      result = x + x2
+    -- this could be done a bit more simply using conditional moves,
+    -- but we're processor independent here.
+    --
+    -- we optimise the divide-by-2 case slightly, generating
+    --      x1 = x >> word_size-1  (unsigned)
+    --      return = x + x1
+    signedQuotRemHelper :: Width -> Integer -> CmmExpr
+    signedQuotRemHelper rep p = CmmMachOp (MO_Add rep) [x, x2]
+      where
+        bits = fromIntegral (widthInBits rep) - 1
+        shr = if p == 1 then MO_U_Shr rep else MO_S_Shr rep
+        x1 = CmmMachOp shr [x, CmmLit (CmmInt bits rep)]
+        x2 = if p == 1 then x1 else
+             CmmMachOp (MO_And rep) [x1, CmmLit (CmmInt (n-1) rep)]
+
+-- ToDo (#7116): optimise floating-point multiplication, e.g. x*2.0 -> x+x
+-- Unfortunately this needs a unique supply because x might not be a
+-- register. See #2253 (program 6) for an example.
+
+
+-- Anything else is just too hard.
+
+cmmMachOpFoldM _ _ _ = Nothing
+
+{- Note [Comparison operators]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If we have
+   CmmCondBranch ((x>#y) == 1) t f
+we really want to convert to
+   CmmCondBranch (x>#y) t f
+
+That's what the constant-folding operations on comparison operators do above.
+-}
+
+
+-- -----------------------------------------------------------------------------
+-- Utils
+
+isPicReg :: CmmExpr -> Bool
+isPicReg (CmmReg (CmmGlobal PicBaseReg)) = True
+isPicReg _ = False
diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y
new file mode 100644
index 0000000000..d7235d0167
--- /dev/null
+++ b/compiler/GHC/Cmm/Parser.y
@@ -0,0 +1,1442 @@
+-----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow, 2004-2012
+--
+-- Parser for concrete Cmm.
+--
+-----------------------------------------------------------------------------
+
+{- -----------------------------------------------------------------------------
+Note [Syntax of .cmm files]
+
+NOTE: You are very much on your own in .cmm. There is very little
+error checking at all:
+
+  * Type errors are detected by the (optional) -dcmm-lint pass; if you
+    don't turn this on then a type error will likely result in a panic
+    from the native code generator.
+
+  * Passing the wrong number of arguments, or arguments of the wrong
+    type, is not detected.
+
+There are two ways to write .cmm code:
+
+ (1) High-level Cmm code delegates the stack handling to GHC, and
+     never explicitly mentions Sp or registers.
+
+ (2) Low-level Cmm manages the stack itself, and must know about
+     calling conventions.
+
+Whether you want high-level or low-level Cmm is indicated by the
+presence of an argument list on a procedure.
For example: + +foo ( gcptr a, bits32 b ) +{ + // this is high-level cmm code + + if (b > 0) { + // we can make tail calls passing arguments: + jump stg_ap_0_fast(a); + } + + push (stg_upd_frame_info, a) { + // stack frames can be explicitly pushed + + (x,y) = call wibble(a,b,3,4); + // calls pass arguments and return results using the native + // Haskell calling convention. The code generator will automatically + // construct a stack frame and an info table for the continuation. + + return (x,y); + // we can return multiple values from the current proc + } +} + +bar +{ + // this is low-level cmm code, indicated by the fact that we did not + // put an argument list on bar. + + x = R1; // the calling convention is explicit: better be careful + // that this works on all platforms! + + jump %ENTRY_CODE(Sp(0)) +} + +Here is a list of rules for high-level and low-level code. If you +break the rules, you get a panic (for using a high-level construct in +a low-level proc), or wrong code (when using low-level code in a +high-level proc). This stuff isn't checked! (TODO!) + +High-level only: + + - tail-calls with arguments, e.g. + jump stg_fun (arg1, arg2); + + - function calls: + (ret1,ret2) = call stg_fun (arg1, arg2); + + This makes a call with the NativeNodeCall convention, and the + values are returned to the following code using the NativeReturn + convention. + + - returning: + return (ret1, ret2) + + These use the NativeReturn convention to return zero or more + results to the caller. + + - pushing stack frames: + push (info_ptr, field1, ..., fieldN) { ... statements ... } + + - reserving temporary stack space: + + reserve N = x { ... } + + this reserves an area of size N (words) on the top of the stack, + and binds its address to x (a local register). Typically this is + used for allocating temporary storage for passing to foreign + functions. + + Note that if you make any native calls or invoke the GC in the + scope of the reserve block, you are responsible for ensuring that + the stack you reserved is laid out correctly with an info table. + +Low-level only: + + - References to Sp, R1-R8, F1-F4 etc. + + NB. foreign calls may clobber the argument registers R1-R8, F1-F4 + etc., so ensure they are saved into variables around foreign + calls. + + - SAVE_THREAD_STATE() and LOAD_THREAD_STATE(), which modify Sp + directly. + +Both high-level and low-level code can use a raw tail-call: + + jump stg_fun [R1,R2] + +NB. you *must* specify the list of GlobalRegs that are passed via a +jump, otherwise the register allocator will assume that all the +GlobalRegs are dead at the jump. + + +Calling Conventions +------------------- + +High-level procedures use the NativeNode calling convention, or the +NativeReturn convention if the 'return' keyword is used (see Stack +Frames below). + +Low-level procedures implement their own calling convention, so it can +be anything at all. + +If a low-level procedure implements the NativeNode calling convention, +then it can be called by high-level code using an ordinary function +call. In general this is hard to arrange because the calling +convention depends on the number of physical registers available for +parameter passing, but there are two cases where the calling +convention is platform-independent: + + - Zero arguments. + + - One argument of pointer or non-pointer word type; this is always + passed in R1 according to the NativeNode convention. + + - Returning a single value; these conventions are fixed and platform + independent. 
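+
+For instance (an illustrative sketch; stg_example is not a real RTS
+symbol), a low-level procedure that takes its single word-sized
+argument in R1 matches the NativeNodeCall convention for the
+one-argument case, so high-level code can invoke it with an ordinary
+call on any platform:
+
+  foo ( gcptr a )
+  {
+      call stg_example (a);   // 'a' is passed in R1
+      ...
+  }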
+ + +Stack Frames +------------ + +A stack frame is written like this: + +INFO_TABLE_RET ( label, FRAME_TYPE, info_ptr, field1, ..., fieldN ) + return ( arg1, ..., argM ) +{ + ... code ... +} + +where field1 ... fieldN are the fields of the stack frame (with types) +arg1...argN are the values returned to the stack frame (with types). +The return values are assumed to be passed according to the +NativeReturn convention. + +On entry to the code, the stack frame looks like: + + |----------| + | fieldN | + | ... | + | field1 | + |----------| + | info_ptr | + |----------| + | argN | + | ... | <- Sp + +and some of the args may be in registers. + +We prepend the code by a copyIn of the args, and assign all the stack +frame fields to their formals. The initial "arg offset" for stack +layout purposes consists of the whole stack frame plus any args that +might be on the stack. + +A tail-call may pass a stack frame to the callee using the following +syntax: + +jump f (info_ptr, field1,..,fieldN) (arg1,..,argN) + +where info_ptr and field1..fieldN describe the stack frame, and +arg1..argN are the arguments passed to f using the NativeNodeCall +convention. Note if a field is longer than a word (e.g. a D_ on +a 32-bit machine) then the call will push as many words as +necessary to the stack to accommodate it (e.g. 2). + + +----------------------------------------------------------------------------- -} + +{ +{-# LANGUAGE TupleSections #-} + +module GHC.Cmm.Parser ( parseCmmFile ) where + +import GhcPrelude + +import GHC.StgToCmm.ExtCode +import GHC.Cmm.CallConv +import GHC.StgToCmm.Prof +import GHC.StgToCmm.Heap +import GHC.StgToCmm.Monad hiding ( getCode, getCodeR, getCodeScoped, emitLabel, emit + , emitStore, emitAssign, emitOutOfLine, withUpdFrameOff + , getUpdFrameOff ) +import qualified GHC.StgToCmm.Monad as F +import GHC.StgToCmm.Utils +import GHC.StgToCmm.Foreign +import GHC.StgToCmm.Expr +import GHC.StgToCmm.Closure +import GHC.StgToCmm.Layout hiding (ArgRep(..)) +import GHC.StgToCmm.Ticky +import GHC.StgToCmm.Bind ( emitBlackHoleCode, emitUpdateFrame ) +import CoreSyn ( Tickish(SourceNote) ) + +import GHC.Cmm.Opt +import GHC.Cmm.Graph +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Switch ( mkSwitchTargets ) +import GHC.Cmm.Info +import GHC.Cmm.BlockId +import GHC.Cmm.Lexer +import GHC.Cmm.CLabel +import GHC.Cmm.Monad +import GHC.Runtime.Layout +import Lexer + +import CostCentre +import ForeignCall +import Module +import GHC.Platform +import Literal +import Unique +import UniqFM +import SrcLoc +import DynFlags +import ErrUtils +import StringBuffer +import FastString +import Panic +import Constants +import Outputable +import BasicTypes +import Bag ( emptyBag, unitBag ) +import Var + +import Control.Monad +import Data.Array +import Data.Char ( ord ) +import System.Exit +import Data.Maybe +import qualified Data.Map as M +import qualified Data.ByteString.Char8 as BS8 + +#include "HsVersions.h" +} + +%expect 0 + +%token + ':' { L _ (CmmT_SpecChar ':') } + ';' { L _ (CmmT_SpecChar ';') } + '{' { L _ (CmmT_SpecChar '{') } + '}' { L _ (CmmT_SpecChar '}') } + '[' { L _ (CmmT_SpecChar '[') } + ']' { L _ (CmmT_SpecChar ']') } + '(' { L _ (CmmT_SpecChar '(') } + ')' { L _ (CmmT_SpecChar ')') } + '=' { L _ (CmmT_SpecChar '=') } + '`' { L _ (CmmT_SpecChar '`') } + '~' { L _ (CmmT_SpecChar '~') } + '/' { L _ (CmmT_SpecChar '/') } + '*' { L _ (CmmT_SpecChar '*') } + '%' { L _ (CmmT_SpecChar '%') } + '-' { L _ (CmmT_SpecChar '-') } + '+' { L _ (CmmT_SpecChar '+') } + '&' { L _ (CmmT_SpecChar '&') } + '^' 
{ L _ (CmmT_SpecChar '^') } + '|' { L _ (CmmT_SpecChar '|') } + '>' { L _ (CmmT_SpecChar '>') } + '<' { L _ (CmmT_SpecChar '<') } + ',' { L _ (CmmT_SpecChar ',') } + '!' { L _ (CmmT_SpecChar '!') } + + '..' { L _ (CmmT_DotDot) } + '::' { L _ (CmmT_DoubleColon) } + '>>' { L _ (CmmT_Shr) } + '<<' { L _ (CmmT_Shl) } + '>=' { L _ (CmmT_Ge) } + '<=' { L _ (CmmT_Le) } + '==' { L _ (CmmT_Eq) } + '!=' { L _ (CmmT_Ne) } + '&&' { L _ (CmmT_BoolAnd) } + '||' { L _ (CmmT_BoolOr) } + + 'True' { L _ (CmmT_True ) } + 'False' { L _ (CmmT_False) } + 'likely'{ L _ (CmmT_likely)} + + 'CLOSURE' { L _ (CmmT_CLOSURE) } + 'INFO_TABLE' { L _ (CmmT_INFO_TABLE) } + 'INFO_TABLE_RET'{ L _ (CmmT_INFO_TABLE_RET) } + 'INFO_TABLE_FUN'{ L _ (CmmT_INFO_TABLE_FUN) } + 'INFO_TABLE_CONSTR'{ L _ (CmmT_INFO_TABLE_CONSTR) } + 'INFO_TABLE_SELECTOR'{ L _ (CmmT_INFO_TABLE_SELECTOR) } + 'else' { L _ (CmmT_else) } + 'export' { L _ (CmmT_export) } + 'section' { L _ (CmmT_section) } + 'goto' { L _ (CmmT_goto) } + 'if' { L _ (CmmT_if) } + 'call' { L _ (CmmT_call) } + 'jump' { L _ (CmmT_jump) } + 'foreign' { L _ (CmmT_foreign) } + 'never' { L _ (CmmT_never) } + 'prim' { L _ (CmmT_prim) } + 'reserve' { L _ (CmmT_reserve) } + 'return' { L _ (CmmT_return) } + 'returns' { L _ (CmmT_returns) } + 'import' { L _ (CmmT_import) } + 'switch' { L _ (CmmT_switch) } + 'case' { L _ (CmmT_case) } + 'default' { L _ (CmmT_default) } + 'push' { L _ (CmmT_push) } + 'unwind' { L _ (CmmT_unwind) } + 'bits8' { L _ (CmmT_bits8) } + 'bits16' { L _ (CmmT_bits16) } + 'bits32' { L _ (CmmT_bits32) } + 'bits64' { L _ (CmmT_bits64) } + 'bits128' { L _ (CmmT_bits128) } + 'bits256' { L _ (CmmT_bits256) } + 'bits512' { L _ (CmmT_bits512) } + 'float32' { L _ (CmmT_float32) } + 'float64' { L _ (CmmT_float64) } + 'gcptr' { L _ (CmmT_gcptr) } + + GLOBALREG { L _ (CmmT_GlobalReg $$) } + NAME { L _ (CmmT_Name $$) } + STRING { L _ (CmmT_String $$) } + INT { L _ (CmmT_Int $$) } + FLOAT { L _ (CmmT_Float $$) } + +%monad { PD } { >>= } { return } +%lexer { cmmlex } { L _ CmmT_EOF } +%name cmmParse cmm +%tokentype { Located CmmToken } + +-- C-- operator precedences, taken from the C-- spec +%right '||' -- non-std extension, called %disjoin in C-- +%right '&&' -- non-std extension, called %conjoin in C-- +%right '!' +%nonassoc '>=' '>' '<=' '<' '!=' '==' +%left '|' +%left '^' +%left '&' +%left '>>' '<<' +%left '-' '+' +%left '/' '*' '%' +%right '~' + +%% + +cmm :: { CmmParse () } + : {- empty -} { return () } + | cmmtop cmm { do $1; $2 } + +cmmtop :: { CmmParse () } + : cmmproc { $1 } + | cmmdata { $1 } + | decl { $1 } + | 'CLOSURE' '(' NAME ',' NAME lits ')' ';' + {% liftP . withThisPackage $ \pkg -> + do lits <- sequence $6; + staticClosure pkg $3 $5 (map getLit lits) } + +-- The only static closures in the RTS are dummy closures like +-- stg_END_TSO_QUEUE_closure and stg_dummy_ret. We don't need +-- to provide the full generality of static closures here. +-- In particular: +-- * CCS can always be CCS_DONT_CARE +-- * closure is always extern +-- * payload is always empty +-- * we can derive closure and info table labels from a single NAME + +cmmdata :: { CmmParse () } + : 'section' STRING '{' data_label statics '}' + { do lbl <- $4; + ss <- sequence $5; + code (emitDecl (CmmData (Section (section $2) lbl) (Statics lbl $ concat ss))) } + +data_label :: { CmmParse CLabel } + : NAME ':' + {% liftP . 
withThisPackage $ \pkg -> + return (mkCmmDataLabel pkg $1) } + +statics :: { [CmmParse [CmmStatic]] } + : {- empty -} { [] } + | static statics { $1 : $2 } + +static :: { CmmParse [CmmStatic] } + : type expr ';' { do e <- $2; + return [CmmStaticLit (getLit e)] } + | type ';' { return [CmmUninitialised + (widthInBytes (typeWidth $1))] } + | 'bits8' '[' ']' STRING ';' { return [mkString $4] } + | 'bits8' '[' INT ']' ';' { return [CmmUninitialised + (fromIntegral $3)] } + | typenot8 '[' INT ']' ';' { return [CmmUninitialised + (widthInBytes (typeWidth $1) * + fromIntegral $3)] } + | 'CLOSURE' '(' NAME lits ')' + { do { lits <- sequence $4 + ; dflags <- getDynFlags + ; return $ map CmmStaticLit $ + mkStaticClosure dflags (mkForeignLabel $3 Nothing ForeignLabelInExternalPackage IsData) + -- mkForeignLabel because these are only used + -- for CHARLIKE and INTLIKE closures in the RTS. + dontCareCCS (map getLit lits) [] [] [] } } + -- arrays of closures required for the CHARLIKE & INTLIKE arrays + +lits :: { [CmmParse CmmExpr] } + : {- empty -} { [] } + | ',' expr lits { $2 : $3 } + +cmmproc :: { CmmParse () } + : info maybe_conv maybe_formals maybe_body + { do ((entry_ret_label, info, stk_formals, formals), agraph) <- + getCodeScoped $ loopDecls $ do { + (entry_ret_label, info, stk_formals) <- $1; + dflags <- getDynFlags; + formals <- sequence (fromMaybe [] $3); + withName (showSDoc dflags (ppr entry_ret_label)) + $4; + return (entry_ret_label, info, stk_formals, formals) } + let do_layout = isJust $3 + code (emitProcWithStackFrame $2 info + entry_ret_label stk_formals formals agraph + do_layout ) } + +maybe_conv :: { Convention } + : {- empty -} { NativeNodeCall } + | 'return' { NativeReturn } + +maybe_body :: { CmmParse () } + : ';' { return () } + | '{' body '}' { withSourceNote $1 $3 $2 } + +info :: { CmmParse (CLabel, Maybe CmmInfoTable, [LocalReg]) } + : NAME + {% liftP . withThisPackage $ \pkg -> + do newFunctionName $1 pkg + return (mkCmmCodeLabel pkg $1, Nothing, []) } + + + | 'INFO_TABLE' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ')' + -- ptrs, nptrs, closure type, description, type + {% liftP . withThisPackage $ \pkg -> + do dflags <- getDynFlags + let prof = profilingInfo dflags $11 $13 + rep = mkRTSRep (fromIntegral $9) $ + mkHeapRep dflags False (fromIntegral $5) + (fromIntegral $7) Thunk + -- not really Thunk, but that makes the info table + -- we want. + return (mkCmmEntryLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing, cit_clo = Nothing }, + []) } + + | 'INFO_TABLE_FUN' '(' NAME ',' INT ',' INT ',' INT ',' STRING ',' STRING ',' INT ')' + -- ptrs, nptrs, closure type, description, type, fun type + {% liftP . withThisPackage $ \pkg -> + do dflags <- getDynFlags + let prof = profilingInfo dflags $11 $13 + ty = Fun 0 (ArgSpec (fromIntegral $15)) + -- Arity zero, arg_type $15 + rep = mkRTSRep (fromIntegral $9) $ + mkHeapRep dflags False (fromIntegral $5) + (fromIntegral $7) ty + return (mkCmmEntryLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing, cit_clo = Nothing }, + []) } + -- we leave most of the fields zero here. This is only used + -- to generate the BCO info table in the RTS at the moment. + + | 'INFO_TABLE_CONSTR' '(' NAME ',' INT ',' INT ',' INT ',' INT ',' STRING ',' STRING ')' + -- ptrs, nptrs, tag, closure type, description, type + {% liftP . 
withThisPackage $ \pkg -> + do dflags <- getDynFlags + let prof = profilingInfo dflags $13 $15 + ty = Constr (fromIntegral $9) -- Tag + (BS8.pack $13) + rep = mkRTSRep (fromIntegral $11) $ + mkHeapRep dflags False (fromIntegral $5) + (fromIntegral $7) ty + return (mkCmmEntryLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing,cit_clo = Nothing }, + []) } + + -- If profiling is on, this string gets duplicated, + -- but that's the way the old code did it we can fix it some other time. + + | 'INFO_TABLE_SELECTOR' '(' NAME ',' INT ',' INT ',' STRING ',' STRING ')' + -- selector, closure type, description, type + {% liftP . withThisPackage $ \pkg -> + do dflags <- getDynFlags + let prof = profilingInfo dflags $9 $11 + ty = ThunkSelector (fromIntegral $5) + rep = mkRTSRep (fromIntegral $7) $ + mkHeapRep dflags False 0 0 ty + return (mkCmmEntryLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing, cit_clo = Nothing }, + []) } + + | 'INFO_TABLE_RET' '(' NAME ',' INT ')' + -- closure type (no live regs) + {% liftP . withThisPackage $ \pkg -> + do let prof = NoProfilingInfo + rep = mkRTSRep (fromIntegral $5) $ mkStackRep [] + return (mkCmmRetLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmRetInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing, cit_clo = Nothing }, + []) } + + | 'INFO_TABLE_RET' '(' NAME ',' INT ',' formals0 ')' + -- closure type, live regs + {% liftP . withThisPackage $ \pkg -> + do dflags <- getDynFlags + live <- sequence $7 + let prof = NoProfilingInfo + -- drop one for the info pointer + bitmap = mkLiveness dflags (drop 1 live) + rep = mkRTSRep (fromIntegral $5) $ mkStackRep bitmap + return (mkCmmRetLabel pkg $3, + Just $ CmmInfoTable { cit_lbl = mkCmmRetInfoLabel pkg $3 + , cit_rep = rep + , cit_prof = prof, cit_srt = Nothing, cit_clo = Nothing }, + live) } + +body :: { CmmParse () } + : {- empty -} { return () } + | decl body { do $1; $2 } + | stmt body { do $1; $2 } + +decl :: { CmmParse () } + : type names ';' { mapM_ (newLocal $1) $2 } + | 'import' importNames ';' { mapM_ newImport $2 } + | 'export' names ';' { return () } -- ignore exports + + +-- an imported function name, with optional packageId +importNames + :: { [(FastString, CLabel)] } + : importName { [$1] } + | importName ',' importNames { $1 : $3 } + +importName + :: { (FastString, CLabel) } + + -- A label imported without an explicit packageId. + -- These are taken to come from some foreign, unnamed package. + : NAME + { ($1, mkForeignLabel $1 Nothing ForeignLabelInExternalPackage IsFunction) } + + -- as previous 'NAME', but 'IsData' + | 'CLOSURE' NAME + { ($2, mkForeignLabel $2 Nothing ForeignLabelInExternalPackage IsData) } + + -- A label imported with an explicit packageId. + | STRING NAME + { ($2, mkCmmCodeLabel (fsToUnitId (mkFastString $1)) $2) } + + +names :: { [FastString] } + : NAME { [$1] } + | NAME ',' names { $1 : $3 } + +stmt :: { CmmParse () } + : ';' { return () } + + | NAME ':' + { do l <- newLabel $1; emitLabel l } + + + + | lreg '=' expr ';' + { do reg <- $1; e <- $3; withSourceNote $2 $4 (emitAssign reg e) } + | type '[' expr ']' '=' expr ';' + { withSourceNote $2 $7 (doStore $1 $3 $6) } + + -- Gah! We really want to say "foreign_results" but that causes + -- a shift/reduce conflict with assignment. We either + -- we expand out the no-result and single result cases or + -- we tweak the syntax to avoid the conflict. 
The later + -- option is taken here because the other way would require + -- multiple levels of expanding and get unwieldy. + | foreign_results 'foreign' STRING foreignLabel '(' cmm_hint_exprs0 ')' safety opt_never_returns ';' + {% foreignCall $3 $1 $4 $6 $8 $9 } + | foreign_results 'prim' '%' NAME '(' exprs0 ')' ';' + {% primCall $1 $4 $6 } + -- stmt-level macros, stealing syntax from ordinary C-- function calls. + -- Perhaps we ought to use the %%-form? + | NAME '(' exprs0 ')' ';' + {% stmtMacro $1 $3 } + | 'switch' maybe_range expr '{' arms default '}' + { do as <- sequence $5; doSwitch $2 $3 as $6 } + | 'goto' NAME ';' + { do l <- lookupLabel $2; emit (mkBranch l) } + | 'return' '(' exprs0 ')' ';' + { doReturn $3 } + | 'jump' expr vols ';' + { doRawJump $2 $3 } + | 'jump' expr '(' exprs0 ')' ';' + { doJumpWithStack $2 [] $4 } + | 'jump' expr '(' exprs0 ')' '(' exprs0 ')' ';' + { doJumpWithStack $2 $4 $7 } + | 'call' expr '(' exprs0 ')' ';' + { doCall $2 [] $4 } + | '(' formals ')' '=' 'call' expr '(' exprs0 ')' ';' + { doCall $6 $2 $8 } + | 'if' bool_expr cond_likely 'goto' NAME + { do l <- lookupLabel $5; cmmRawIf $2 l $3 } + | 'if' bool_expr cond_likely '{' body '}' else + { cmmIfThenElse $2 (withSourceNote $4 $6 $5) $7 $3 } + | 'push' '(' exprs0 ')' maybe_body + { pushStackFrame $3 $5 } + | 'reserve' expr '=' lreg maybe_body + { reserveStackFrame $2 $4 $5 } + | 'unwind' unwind_regs ';' + { $2 >>= code . emitUnwind } + +unwind_regs + :: { CmmParse [(GlobalReg, Maybe CmmExpr)] } + : GLOBALREG '=' expr_or_unknown ',' unwind_regs + { do e <- $3; rest <- $5; return (($1, e) : rest) } + | GLOBALREG '=' expr_or_unknown + { do e <- $3; return [($1, e)] } + +-- | Used by unwind to indicate unknown unwinding values. +expr_or_unknown + :: { CmmParse (Maybe CmmExpr) } + : 'return' + { do return Nothing } + | expr + { do e <- $1; return (Just e) } + +foreignLabel :: { CmmParse CmmExpr } + : NAME { return (CmmLit (CmmLabel (mkForeignLabel $1 Nothing ForeignLabelInThisPackage IsFunction))) } + +opt_never_returns :: { CmmReturnInfo } + : { CmmMayReturn } + | 'never' 'returns' { CmmNeverReturns } + +bool_expr :: { CmmParse BoolExpr } + : bool_op { $1 } + | expr { do e <- $1; return (BoolTest e) } + +bool_op :: { CmmParse BoolExpr } + : bool_expr '&&' bool_expr { do e1 <- $1; e2 <- $3; + return (BoolAnd e1 e2) } + | bool_expr '||' bool_expr { do e1 <- $1; e2 <- $3; + return (BoolOr e1 e2) } + | '!' bool_expr { do e <- $2; return (BoolNot e) } + | '(' bool_op ')' { $2 } + +safety :: { Safety } + : {- empty -} { PlayRisky } + | STRING {% parseSafety $1 } + +vols :: { [GlobalReg] } + : '[' ']' { [] } + | '[' '*' ']' {% do df <- getDynFlags + ; return (realArgRegsCover df) } + -- All of them. See comment attached + -- to realArgRegsCover + | '[' globals ']' { $2 } + +globals :: { [GlobalReg] } + : GLOBALREG { [$1] } + | GLOBALREG ',' globals { $1 : $3 } + +maybe_range :: { Maybe (Integer,Integer) } + : '[' INT '..' 
INT ']' { Just ($2, $4) } + | {- empty -} { Nothing } + +arms :: { [CmmParse ([Integer],Either BlockId (CmmParse ()))] } + : {- empty -} { [] } + | arm arms { $1 : $2 } + +arm :: { CmmParse ([Integer],Either BlockId (CmmParse ())) } + : 'case' ints ':' arm_body { do b <- $4; return ($2, b) } + +arm_body :: { CmmParse (Either BlockId (CmmParse ())) } + : '{' body '}' { return (Right (withSourceNote $1 $3 $2)) } + | 'goto' NAME ';' { do l <- lookupLabel $2; return (Left l) } + +ints :: { [Integer] } + : INT { [ $1 ] } + | INT ',' ints { $1 : $3 } + +default :: { Maybe (CmmParse ()) } + : 'default' ':' '{' body '}' { Just (withSourceNote $3 $5 $4) } + -- taking a few liberties with the C-- syntax here; C-- doesn't have + -- 'default' branches + | {- empty -} { Nothing } + +-- Note: OldCmm doesn't support a first class 'else' statement, though +-- CmmNode does. +else :: { CmmParse () } + : {- empty -} { return () } + | 'else' '{' body '}' { withSourceNote $2 $4 $3 } + +cond_likely :: { Maybe Bool } + : '(' 'likely' ':' 'True' ')' { Just True } + | '(' 'likely' ':' 'False' ')' { Just False } + | {- empty -} { Nothing } + + +-- we have to write this out longhand so that Happy's precedence rules +-- can kick in. +expr :: { CmmParse CmmExpr } + : expr '/' expr { mkMachOp MO_U_Quot [$1,$3] } + | expr '*' expr { mkMachOp MO_Mul [$1,$3] } + | expr '%' expr { mkMachOp MO_U_Rem [$1,$3] } + | expr '-' expr { mkMachOp MO_Sub [$1,$3] } + | expr '+' expr { mkMachOp MO_Add [$1,$3] } + | expr '>>' expr { mkMachOp MO_U_Shr [$1,$3] } + | expr '<<' expr { mkMachOp MO_Shl [$1,$3] } + | expr '&' expr { mkMachOp MO_And [$1,$3] } + | expr '^' expr { mkMachOp MO_Xor [$1,$3] } + | expr '|' expr { mkMachOp MO_Or [$1,$3] } + | expr '>=' expr { mkMachOp MO_U_Ge [$1,$3] } + | expr '>' expr { mkMachOp MO_U_Gt [$1,$3] } + | expr '<=' expr { mkMachOp MO_U_Le [$1,$3] } + | expr '<' expr { mkMachOp MO_U_Lt [$1,$3] } + | expr '!=' expr { mkMachOp MO_Ne [$1,$3] } + | expr '==' expr { mkMachOp MO_Eq [$1,$3] } + | '~' expr { mkMachOp MO_Not [$2] } + | '-' expr { mkMachOp MO_S_Neg [$2] } + | expr0 '`' NAME '`' expr0 {% do { mo <- nameToMachOp $3 ; + return (mkMachOp mo [$1,$5]) } } + | expr0 { $1 } + +expr0 :: { CmmParse CmmExpr } + : INT maybe_ty { return (CmmLit (CmmInt $1 (typeWidth $2))) } + | FLOAT maybe_ty { return (CmmLit (CmmFloat $1 (typeWidth $2))) } + | STRING { do s <- code (newStringCLit $1); + return (CmmLit s) } + | reg { $1 } + | type '[' expr ']' { do e <- $3; return (CmmLoad e $1) } + | '%' NAME '(' exprs0 ')' {% exprOp $2 $4 } + | '(' expr ')' { $2 } + + +-- leaving out the type of a literal gives you the native word size in C-- +maybe_ty :: { CmmType } + : {- empty -} {% do dflags <- getDynFlags; return $ bWord dflags } + | '::' type { $2 } + +cmm_hint_exprs0 :: { [CmmParse (CmmExpr, ForeignHint)] } + : {- empty -} { [] } + | cmm_hint_exprs { $1 } + +cmm_hint_exprs :: { [CmmParse (CmmExpr, ForeignHint)] } + : cmm_hint_expr { [$1] } + | cmm_hint_expr ',' cmm_hint_exprs { $1 : $3 } + +cmm_hint_expr :: { CmmParse (CmmExpr, ForeignHint) } + : expr { do e <- $1; + return (e, inferCmmHint e) } + | expr STRING {% do h <- parseCmmHint $2; + return $ do + e <- $1; return (e, h) } + +exprs0 :: { [CmmParse CmmExpr] } + : {- empty -} { [] } + | exprs { $1 } + +exprs :: { [CmmParse CmmExpr] } + : expr { [ $1 ] } + | expr ',' exprs { $1 : $3 } + +reg :: { CmmParse CmmExpr } + : NAME { lookupName $1 } + | GLOBALREG { return (CmmReg (CmmGlobal $1)) } + +foreign_results :: { [CmmParse (LocalReg, ForeignHint)] } + : {- empty -} 
{ [] } + | '(' foreign_formals ')' '=' { $2 } + +foreign_formals :: { [CmmParse (LocalReg, ForeignHint)] } + : foreign_formal { [$1] } + | foreign_formal ',' { [$1] } + | foreign_formal ',' foreign_formals { $1 : $3 } + +foreign_formal :: { CmmParse (LocalReg, ForeignHint) } + : local_lreg { do e <- $1; return (e, inferCmmHint (CmmReg (CmmLocal e))) } + | STRING local_lreg {% do h <- parseCmmHint $1; + return $ do + e <- $2; return (e,h) } + +local_lreg :: { CmmParse LocalReg } + : NAME { do e <- lookupName $1; + return $ + case e of + CmmReg (CmmLocal r) -> r + other -> pprPanic "CmmParse:" (ftext $1 <> text " not a local register") } + +lreg :: { CmmParse CmmReg } + : NAME { do e <- lookupName $1; + return $ + case e of + CmmReg r -> r + other -> pprPanic "CmmParse:" (ftext $1 <> text " not a register") } + | GLOBALREG { return (CmmGlobal $1) } + +maybe_formals :: { Maybe [CmmParse LocalReg] } + : {- empty -} { Nothing } + | '(' formals0 ')' { Just $2 } + +formals0 :: { [CmmParse LocalReg] } + : {- empty -} { [] } + | formals { $1 } + +formals :: { [CmmParse LocalReg] } + : formal ',' { [$1] } + | formal { [$1] } + | formal ',' formals { $1 : $3 } + +formal :: { CmmParse LocalReg } + : type NAME { newLocal $1 $2 } + +type :: { CmmType } + : 'bits8' { b8 } + | typenot8 { $1 } + +typenot8 :: { CmmType } + : 'bits16' { b16 } + | 'bits32' { b32 } + | 'bits64' { b64 } + | 'bits128' { b128 } + | 'bits256' { b256 } + | 'bits512' { b512 } + | 'float32' { f32 } + | 'float64' { f64 } + | 'gcptr' {% do dflags <- getDynFlags; return $ gcWord dflags } + +{ +section :: String -> SectionType +section "text" = Text +section "data" = Data +section "rodata" = ReadOnlyData +section "relrodata" = RelocatableReadOnlyData +section "bss" = UninitialisedData +section s = OtherSection s + +mkString :: String -> CmmStatic +mkString s = CmmString (BS8.pack s) + +-- | +-- Given an info table, decide what the entry convention for the proc +-- is. That is, for an INFO_TABLE_RET we want the return convention, +-- otherwise it is a NativeNodeCall. +-- +infoConv :: Maybe CmmInfoTable -> Convention +infoConv Nothing = NativeNodeCall +infoConv (Just info) + | isStackRep (cit_rep info) = NativeReturn + | otherwise = NativeNodeCall + +-- mkMachOp infers the type of the MachOp from the type of its first +-- argument. We assume that this is correct: for MachOps that don't have +-- symmetrical args (e.g. shift ops), the first arg determines the type of +-- the op. 
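+--
+-- For instance, the grammar rule  expr '+' expr  above elaborates to
+-- mkMachOp MO_Add [$1,$3]; if the left operand has type b64, the
+-- result is CmmMachOp (MO_Add W64) [e1,e2], whatever the width of
+-- the right operand.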
+mkMachOp :: (Width -> MachOp) -> [CmmParse CmmExpr] -> CmmParse CmmExpr +mkMachOp fn args = do + dflags <- getDynFlags + arg_exprs <- sequence args + return (CmmMachOp (fn (typeWidth (cmmExprType dflags (head arg_exprs)))) arg_exprs) + +getLit :: CmmExpr -> CmmLit +getLit (CmmLit l) = l +getLit (CmmMachOp (MO_S_Neg _) [CmmLit (CmmInt i r)]) = CmmInt (negate i) r +getLit _ = panic "invalid literal" -- TODO messy failure + +nameToMachOp :: FastString -> PD (Width -> MachOp) +nameToMachOp name = + case lookupUFM machOps name of + Nothing -> fail ("unknown primitive " ++ unpackFS name) + Just m -> return m + +exprOp :: FastString -> [CmmParse CmmExpr] -> PD (CmmParse CmmExpr) +exprOp name args_code = do + dflags <- getDynFlags + case lookupUFM (exprMacros dflags) name of + Just f -> return $ do + args <- sequence args_code + return (f args) + Nothing -> do + mo <- nameToMachOp name + return $ mkMachOp mo args_code + +exprMacros :: DynFlags -> UniqFM ([CmmExpr] -> CmmExpr) +exprMacros dflags = listToUFM [ + ( fsLit "ENTRY_CODE", \ [x] -> entryCode dflags x ), + ( fsLit "INFO_PTR", \ [x] -> closureInfoPtr dflags x ), + ( fsLit "STD_INFO", \ [x] -> infoTable dflags x ), + ( fsLit "FUN_INFO", \ [x] -> funInfoTable dflags x ), + ( fsLit "GET_ENTRY", \ [x] -> entryCode dflags (closureInfoPtr dflags x) ), + ( fsLit "GET_STD_INFO", \ [x] -> infoTable dflags (closureInfoPtr dflags x) ), + ( fsLit "GET_FUN_INFO", \ [x] -> funInfoTable dflags (closureInfoPtr dflags x) ), + ( fsLit "INFO_TYPE", \ [x] -> infoTableClosureType dflags x ), + ( fsLit "INFO_PTRS", \ [x] -> infoTablePtrs dflags x ), + ( fsLit "INFO_NPTRS", \ [x] -> infoTableNonPtrs dflags x ) + ] + +-- we understand a subset of C-- primitives: +machOps = listToUFM $ + map (\(x, y) -> (mkFastString x, y)) [ + ( "add", MO_Add ), + ( "sub", MO_Sub ), + ( "eq", MO_Eq ), + ( "ne", MO_Ne ), + ( "mul", MO_Mul ), + ( "neg", MO_S_Neg ), + ( "quot", MO_S_Quot ), + ( "rem", MO_S_Rem ), + ( "divu", MO_U_Quot ), + ( "modu", MO_U_Rem ), + + ( "ge", MO_S_Ge ), + ( "le", MO_S_Le ), + ( "gt", MO_S_Gt ), + ( "lt", MO_S_Lt ), + + ( "geu", MO_U_Ge ), + ( "leu", MO_U_Le ), + ( "gtu", MO_U_Gt ), + ( "ltu", MO_U_Lt ), + + ( "and", MO_And ), + ( "or", MO_Or ), + ( "xor", MO_Xor ), + ( "com", MO_Not ), + ( "shl", MO_Shl ), + ( "shrl", MO_U_Shr ), + ( "shra", MO_S_Shr ), + + ( "fadd", MO_F_Add ), + ( "fsub", MO_F_Sub ), + ( "fneg", MO_F_Neg ), + ( "fmul", MO_F_Mul ), + ( "fquot", MO_F_Quot ), + + ( "feq", MO_F_Eq ), + ( "fne", MO_F_Ne ), + ( "fge", MO_F_Ge ), + ( "fle", MO_F_Le ), + ( "fgt", MO_F_Gt ), + ( "flt", MO_F_Lt ), + + ( "lobits8", flip MO_UU_Conv W8 ), + ( "lobits16", flip MO_UU_Conv W16 ), + ( "lobits32", flip MO_UU_Conv W32 ), + ( "lobits64", flip MO_UU_Conv W64 ), + + ( "zx16", flip MO_UU_Conv W16 ), + ( "zx32", flip MO_UU_Conv W32 ), + ( "zx64", flip MO_UU_Conv W64 ), + + ( "sx16", flip MO_SS_Conv W16 ), + ( "sx32", flip MO_SS_Conv W32 ), + ( "sx64", flip MO_SS_Conv W64 ), + + ( "f2f32", flip MO_FF_Conv W32 ), -- TODO; rounding mode + ( "f2f64", flip MO_FF_Conv W64 ), -- TODO; rounding mode + ( "f2i8", flip MO_FS_Conv W8 ), + ( "f2i16", flip MO_FS_Conv W16 ), + ( "f2i32", flip MO_FS_Conv W32 ), + ( "f2i64", flip MO_FS_Conv W64 ), + ( "i2f32", flip MO_SF_Conv W32 ), + ( "i2f64", flip MO_SF_Conv W64 ) + ] + +callishMachOps :: UniqFM ([CmmExpr] -> (CallishMachOp, [CmmExpr])) +callishMachOps = listToUFM $ + map (\(x, y) -> (mkFastString x, y)) [ + ( "read_barrier", (MO_ReadBarrier,)), + ( "write_barrier", (MO_WriteBarrier,)), + ( "memcpy", memcpyLikeTweakArgs 
MO_Memcpy ), + ( "memset", memcpyLikeTweakArgs MO_Memset ), + ( "memmove", memcpyLikeTweakArgs MO_Memmove ), + ( "memcmp", memcpyLikeTweakArgs MO_Memcmp ), + + ("prefetch0", (MO_Prefetch_Data 0,)), + ("prefetch1", (MO_Prefetch_Data 1,)), + ("prefetch2", (MO_Prefetch_Data 2,)), + ("prefetch3", (MO_Prefetch_Data 3,)), + + ( "popcnt8", (MO_PopCnt W8,)), + ( "popcnt16", (MO_PopCnt W16,)), + ( "popcnt32", (MO_PopCnt W32,)), + ( "popcnt64", (MO_PopCnt W64,)), + + ( "pdep8", (MO_Pdep W8,)), + ( "pdep16", (MO_Pdep W16,)), + ( "pdep32", (MO_Pdep W32,)), + ( "pdep64", (MO_Pdep W64,)), + + ( "pext8", (MO_Pext W8,)), + ( "pext16", (MO_Pext W16,)), + ( "pext32", (MO_Pext W32,)), + ( "pext64", (MO_Pext W64,)), + + ( "cmpxchg8", (MO_Cmpxchg W8,)), + ( "cmpxchg16", (MO_Cmpxchg W16,)), + ( "cmpxchg32", (MO_Cmpxchg W32,)), + ( "cmpxchg64", (MO_Cmpxchg W64,)) + + -- ToDo: the rest, maybe + -- edit: which rest? + -- also: how do we tell CMM Lint how to type check callish macops? + ] + where + memcpyLikeTweakArgs :: (Int -> CallishMachOp) -> [CmmExpr] -> (CallishMachOp, [CmmExpr]) + memcpyLikeTweakArgs op [] = pgmError "memcpy-like function requires at least one argument" + memcpyLikeTweakArgs op args@(_:_) = + (op align, args') + where + args' = init args + align = case last args of + CmmLit (CmmInt alignInteger _) -> fromInteger alignInteger + e -> pprPgmError "Non-constant alignment in memcpy-like function:" (ppr e) + -- The alignment of memcpy-ish operations must be a + -- compile-time constant. We verify this here, passing it around + -- in the MO_* constructor. In order to do this, however, we + -- must intercept the arguments in primCall. + +parseSafety :: String -> PD Safety +parseSafety "safe" = return PlaySafe +parseSafety "unsafe" = return PlayRisky +parseSafety "interruptible" = return PlayInterruptible +parseSafety str = fail ("unrecognised safety: " ++ str) + +parseCmmHint :: String -> PD ForeignHint +parseCmmHint "ptr" = return AddrHint +parseCmmHint "signed" = return SignedHint +parseCmmHint str = fail ("unrecognised hint: " ++ str) + +-- labels are always pointers, so we might as well infer the hint +inferCmmHint :: CmmExpr -> ForeignHint +inferCmmHint (CmmLit (CmmLabel _)) = AddrHint +inferCmmHint (CmmReg (CmmGlobal g)) | isPtrGlobalReg g = AddrHint +inferCmmHint _ = NoHint + +isPtrGlobalReg Sp = True +isPtrGlobalReg SpLim = True +isPtrGlobalReg Hp = True +isPtrGlobalReg HpLim = True +isPtrGlobalReg CCCS = True +isPtrGlobalReg CurrentTSO = True +isPtrGlobalReg CurrentNursery = True +isPtrGlobalReg (VanillaReg _ VGcPtr) = True +isPtrGlobalReg _ = False + +happyError :: PD a +happyError = PD $ \_ s -> unP srcParseFail s + +-- ----------------------------------------------------------------------------- +-- Statement-level macros + +stmtMacro :: FastString -> [CmmParse CmmExpr] -> PD (CmmParse ()) +stmtMacro fun args_code = do + case lookupUFM stmtMacros fun of + Nothing -> fail ("unknown macro: " ++ unpackFS fun) + Just fcode -> return $ do + args <- sequence args_code + code (fcode args) + +stmtMacros :: UniqFM ([CmmExpr] -> FCode ()) +stmtMacros = listToUFM [ + ( fsLit "CCS_ALLOC", \[words,ccs] -> profAlloc words ccs ), + ( fsLit "ENTER_CCS_THUNK", \[e] -> enterCostCentreThunk e ), + + ( fsLit "CLOSE_NURSERY", \[] -> emitCloseNursery ), + ( fsLit "OPEN_NURSERY", \[] -> emitOpenNursery ), + + -- completely generic heap and stack checks, for use in high-level cmm. 
+ ( fsLit "HP_CHK_GEN", \[bytes] -> + heapStackCheckGen Nothing (Just bytes) ), + ( fsLit "STK_CHK_GEN", \[] -> + heapStackCheckGen (Just (CmmLit CmmHighStackMark)) Nothing ), + + -- A stack check for a fixed amount of stack. Sounds a bit strange, but + -- we use the stack for a bit of temporary storage in a couple of primops + ( fsLit "STK_CHK_GEN_N", \[bytes] -> + heapStackCheckGen (Just bytes) Nothing ), + + -- A stack check on entry to a thunk, where the argument is the thunk pointer. + ( fsLit "STK_CHK_NP" , \[node] -> entryHeapCheck' False node 0 [] (return ())), + + ( fsLit "LOAD_THREAD_STATE", \[] -> emitLoadThreadState ), + ( fsLit "SAVE_THREAD_STATE", \[] -> emitSaveThreadState ), + + ( fsLit "LDV_ENTER", \[e] -> ldvEnter e ), + ( fsLit "LDV_RECORD_CREATE", \[e] -> ldvRecordCreate e ), + + ( fsLit "PUSH_UPD_FRAME", \[sp,e] -> emitPushUpdateFrame sp e ), + ( fsLit "SET_HDR", \[ptr,info,ccs] -> + emitSetDynHdr ptr info ccs ), + ( fsLit "TICK_ALLOC_PRIM", \[hdr,goods,slop] -> + tickyAllocPrim hdr goods slop ), + ( fsLit "TICK_ALLOC_PAP", \[goods,slop] -> + tickyAllocPAP goods slop ), + ( fsLit "TICK_ALLOC_UP_THK", \[goods,slop] -> + tickyAllocThunk goods slop ), + ( fsLit "UPD_BH_UPDATABLE", \[reg] -> emitBlackHoleCode reg ) + ] + +emitPushUpdateFrame :: CmmExpr -> CmmExpr -> FCode () +emitPushUpdateFrame sp e = do + dflags <- getDynFlags + emitUpdateFrame dflags sp mkUpdInfoLabel e + +pushStackFrame :: [CmmParse CmmExpr] -> CmmParse () -> CmmParse () +pushStackFrame fields body = do + dflags <- getDynFlags + exprs <- sequence fields + updfr_off <- getUpdFrameOff + let (new_updfr_off, _, g) = copyOutOflow dflags NativeReturn Ret Old + [] updfr_off exprs + emit g + withUpdFrameOff new_updfr_off body + +reserveStackFrame + :: CmmParse CmmExpr + -> CmmParse CmmReg + -> CmmParse () + -> CmmParse () +reserveStackFrame psize preg body = do + dflags <- getDynFlags + old_updfr_off <- getUpdFrameOff + reg <- preg + esize <- psize + let size = case constantFoldExpr dflags esize of + CmmLit (CmmInt n _) -> n + _other -> pprPanic "CmmParse: not a compile-time integer: " + (ppr esize) + let frame = old_updfr_off + wORD_SIZE dflags * fromIntegral size + emitAssign reg (CmmStackSlot Old frame) + withUpdFrameOff frame body + +profilingInfo dflags desc_str ty_str + = if not (gopt Opt_SccProfilingOn dflags) + then NoProfilingInfo + else ProfilingInfo (BS8.pack desc_str) (BS8.pack ty_str) + +staticClosure :: UnitId -> FastString -> FastString -> [CmmLit] -> CmmParse () +staticClosure pkg cl_label info payload + = do dflags <- getDynFlags + let lits = mkStaticClosure dflags (mkCmmInfoLabel pkg info) dontCareCCS payload [] [] [] + code $ emitDataLits (mkCmmDataLabel pkg cl_label) lits + +foreignCall + :: String + -> [CmmParse (LocalReg, ForeignHint)] + -> CmmParse CmmExpr + -> [CmmParse (CmmExpr, ForeignHint)] + -> Safety + -> CmmReturnInfo + -> PD (CmmParse ()) +foreignCall conv_string results_code expr_code args_code safety ret + = do conv <- case conv_string of + "C" -> return CCallConv + "stdcall" -> return StdCallConv + _ -> fail ("unknown calling convention: " ++ conv_string) + return $ do + dflags <- getDynFlags + results <- sequence results_code + expr <- expr_code + args <- sequence args_code + let + expr' = adjCallTarget dflags conv expr args + (arg_exprs, arg_hints) = unzip args + (res_regs, res_hints) = unzip results + fc = ForeignConvention conv arg_hints res_hints ret + target = ForeignTarget expr' fc + _ <- code $ emitForeignCall safety res_regs target arg_exprs + return () + + +doReturn 
:: [CmmParse CmmExpr] -> CmmParse () +doReturn exprs_code = do + dflags <- getDynFlags + exprs <- sequence exprs_code + updfr_off <- getUpdFrameOff + emit (mkReturnSimple dflags exprs updfr_off) + +mkReturnSimple :: DynFlags -> [CmmActual] -> UpdFrameOffset -> CmmAGraph +mkReturnSimple dflags actuals updfr_off = + mkReturn dflags e actuals updfr_off + where e = entryCode dflags (CmmLoad (CmmStackSlot Old updfr_off) + (gcWord dflags)) + +doRawJump :: CmmParse CmmExpr -> [GlobalReg] -> CmmParse () +doRawJump expr_code vols = do + dflags <- getDynFlags + expr <- expr_code + updfr_off <- getUpdFrameOff + emit (mkRawJump dflags expr updfr_off vols) + +doJumpWithStack :: CmmParse CmmExpr -> [CmmParse CmmExpr] + -> [CmmParse CmmExpr] -> CmmParse () +doJumpWithStack expr_code stk_code args_code = do + dflags <- getDynFlags + expr <- expr_code + stk_args <- sequence stk_code + args <- sequence args_code + updfr_off <- getUpdFrameOff + emit (mkJumpExtra dflags NativeNodeCall expr args updfr_off stk_args) + +doCall :: CmmParse CmmExpr -> [CmmParse LocalReg] -> [CmmParse CmmExpr] + -> CmmParse () +doCall expr_code res_code args_code = do + dflags <- getDynFlags + expr <- expr_code + args <- sequence args_code + ress <- sequence res_code + updfr_off <- getUpdFrameOff + c <- code $ mkCall expr (NativeNodeCall,NativeReturn) ress args updfr_off [] + emit c + +adjCallTarget :: DynFlags -> CCallConv -> CmmExpr -> [(CmmExpr, ForeignHint) ] + -> CmmExpr +-- On Windows, we have to add the '@N' suffix to the label when making +-- a call with the stdcall calling convention. +adjCallTarget dflags StdCallConv (CmmLit (CmmLabel lbl)) args + | platformOS (targetPlatform dflags) == OSMinGW32 + = CmmLit (CmmLabel (addLabelSize lbl (sum (map size args)))) + where size (e, _) = max (wORD_SIZE dflags) (widthInBytes (typeWidth (cmmExprType dflags e))) + -- c.f. CgForeignCall.emitForeignCall +adjCallTarget _ _ expr _ + = expr + +primCall + :: [CmmParse (CmmFormal, ForeignHint)] + -> FastString + -> [CmmParse CmmExpr] + -> PD (CmmParse ()) +primCall results_code name args_code + = case lookupUFM callishMachOps name of + Nothing -> fail ("unknown primitive " ++ unpackFS name) + Just f -> return $ do + results <- sequence results_code + args <- sequence args_code + let (p, args') = f args + code (emitPrimCall (map fst results) p args') + +doStore :: CmmType -> CmmParse CmmExpr -> CmmParse CmmExpr -> CmmParse () +doStore rep addr_code val_code + = do dflags <- getDynFlags + addr <- addr_code + val <- val_code + -- if the specified store type does not match the type of the expr + -- on the rhs, then we insert a coercion that will cause the type + -- mismatch to be flagged by cmm-lint. If we don't do this, then + -- the store will happen at the wrong type, and the error will not + -- be noticed. + let val_width = typeWidth (cmmExprType dflags val) + rep_width = typeWidth rep + let coerce_val + | val_width /= rep_width = CmmMachOp (MO_UU_Conv val_width rep_width) [val] + | otherwise = val + emitStore addr coerce_val + +-- ----------------------------------------------------------------------------- +-- If-then-else and boolean expressions + +data BoolExpr + = BoolExpr `BoolAnd` BoolExpr + | BoolExpr `BoolOr` BoolExpr + | BoolNot BoolExpr + | BoolTest CmmExpr + +-- ToDo: smart constructors which simplify the boolean expression. 
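The ToDo above asks for smart constructors. A minimal sketch of what such simplifying constructors could look like, assuming only the BoolExpr type defined above; the names bNot, bAnd and bOr are hypothetical and not part of this patch:

    -- Hypothetical simplifying constructors for BoolExpr (illustration
    -- only): they cancel double negation and apply De Morgan's laws so
    -- that emitCond sees fewer BoolNot wrappers. BoolTest leaves are
    -- left untouched.
    bNot :: BoolExpr -> BoolExpr
    bNot (BoolNot e) = e                 -- not (not e) ==> e
    bNot e           = BoolNot e

    bAnd :: BoolExpr -> BoolExpr -> BoolExpr
    bAnd (BoolNot x) (BoolNot y) = bNot (bOr x y)    -- De Morgan
    bAnd x y                     = x `BoolAnd` y

    bOr :: BoolExpr -> BoolExpr -> BoolExpr
    bOr (BoolNot x) (BoolNot y) = bNot (bAnd x y)    -- De Morgan
    bOr x y                     = x `BoolOr` y

Callers would build conditions with bAnd/bOr/bNot instead of the raw constructors; since the result type is unchanged, emitCond would need no changes.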
+ +cmmIfThenElse cond then_part else_part likely = do + then_id <- newBlockId + join_id <- newBlockId + c <- cond + emitCond c then_id likely + else_part + emit (mkBranch join_id) + emitLabel then_id + then_part + -- fall through to join + emitLabel join_id + +cmmRawIf cond then_id likely = do + c <- cond + emitCond c then_id likely + +-- 'emitCond cond true_id' emits code to test whether the cond is true, +-- branching to true_id if so, and falling through otherwise. +emitCond (BoolTest e) then_id likely = do + else_id <- newBlockId + emit (mkCbranch e then_id else_id likely) + emitLabel else_id +emitCond (BoolNot (BoolTest (CmmMachOp op args))) then_id likely + | Just op' <- maybeInvertComparison op + = emitCond (BoolTest (CmmMachOp op' args)) then_id (not <$> likely) +emitCond (BoolNot e) then_id likely = do + else_id <- newBlockId + emitCond e else_id likely + emit (mkBranch then_id) + emitLabel else_id +emitCond (e1 `BoolOr` e2) then_id likely = do + emitCond e1 then_id likely + emitCond e2 then_id likely +emitCond (e1 `BoolAnd` e2) then_id likely = do + -- we'd like to invert one of the conditionals here to avoid an + -- extra branch instruction, but we can't use maybeInvertComparison + -- here because we can't look too closely at the expression since + -- we're in a loop. + and_id <- newBlockId + else_id <- newBlockId + emitCond e1 and_id likely + emit (mkBranch else_id) + emitLabel and_id + emitCond e2 then_id likely + emitLabel else_id + +-- ----------------------------------------------------------------------------- +-- Source code notes + +-- | Generate a source note spanning from "a" to "b" (inclusive), then +-- proceed with parsing. This allows debugging tools to reason about +-- locations in Cmm code. +withSourceNote :: Located a -> Located b -> CmmParse c -> CmmParse c +withSourceNote a b parse = do + name <- getName + case combineSrcSpans (getLoc a) (getLoc b) of + RealSrcSpan span -> code (emitTick (SourceNote span name)) >> parse + _other -> parse + +-- ----------------------------------------------------------------------------- +-- Table jumps + +-- We use a simplified form of C-- switch statements for now. A +-- switch statement always compiles to a table jump. Each arm can +-- specify a list of values (not ranges), and there can be a single +-- default branch. The range of the table is given either by the +-- optional range on the switch (eg. switch [0..7] {...}), or by +-- the minimum/maximum values from the branches. 
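The restricted switch form described above maps directly onto mkSwitchTargets, which doSwitch below calls. As a sketch, here are the targets built for a hypothetical source-level switch [0..7] { case 0, 1: goto L1; case 7: goto L2; default: goto Ldef; }, assuming the qualified Data.Map import (M) and the GHC.Cmm.Switch API that this file already uses; exampleTargets is an illustrative name, not part of this patch:

    -- Hypothetical example: each arm's literal values map to its block,
    -- the declared [0..7] range bounds the jump table, and values 2..6
    -- fall through to the default block.
    exampleTargets :: BlockId -> BlockId -> BlockId -> SwitchTargets
    exampleTargets l1 l2 ldef =
      mkSwitchTargets False        -- scrutinee treated as unsigned
                      (0, 7)       -- range from 'switch [0..7]'
                      (Just ldef)  -- the default branch
                      (M.fromList [(0, l1), (1, l1), (7, l2)])

With no default and no explicit range, the table bounds would instead come from the smallest and largest case values.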
+ +doSwitch :: Maybe (Integer,Integer) + -> CmmParse CmmExpr + -> [([Integer],Either BlockId (CmmParse ()))] + -> Maybe (CmmParse ()) -> CmmParse () +doSwitch mb_range scrut arms deflt + = do + -- Compile code for the default branch + dflt_entry <- + case deflt of + Nothing -> return Nothing + Just e -> do b <- forkLabelledCode e; return (Just b) + + -- Compile each case branch + table_entries <- mapM emitArm arms + let table = M.fromList (concat table_entries) + + dflags <- getDynFlags + let range = fromMaybe (0, tARGET_MAX_WORD dflags) mb_range + + expr <- scrut + -- ToDo: check for out of range and jump to default if necessary + emit $ mkSwitch expr (mkSwitchTargets False range dflt_entry table) + where + emitArm :: ([Integer],Either BlockId (CmmParse ())) -> CmmParse [(Integer,BlockId)] + emitArm (ints,Left blockid) = return [ (i,blockid) | i <- ints ] + emitArm (ints,Right code) = do + blockid <- forkLabelledCode code + return [ (i,blockid) | i <- ints ] + +forkLabelledCode :: CmmParse () -> CmmParse BlockId +forkLabelledCode p = do + (_,ag) <- getCodeScoped p + l <- newBlockId + emitOutOfLine l ag + return l + +-- ----------------------------------------------------------------------------- +-- Putting it all together + +-- The initial environment: we define some constants that the compiler +-- knows about here. +initEnv :: DynFlags -> Env +initEnv dflags = listToUFM [ + ( fsLit "SIZEOF_StgHeader", + VarN (CmmLit (CmmInt (fromIntegral (fixedHdrSize dflags)) (wordWidth dflags)) )), + ( fsLit "SIZEOF_StgInfoTable", + VarN (CmmLit (CmmInt (fromIntegral (stdInfoTableSizeB dflags)) (wordWidth dflags)) )) + ] + +parseCmmFile :: DynFlags -> FilePath -> IO (Messages, Maybe CmmGroup) +parseCmmFile dflags filename = withTiming dflags (text "ParseCmm"<+>brackets (text filename)) (\_ -> ()) $ do + buf <- hGetStringBuffer filename + let + init_loc = mkRealSrcLoc (mkFastString filename) 1 1 + init_state = (mkPState dflags buf init_loc) { lex_state = [0] } + -- reset the lex_state: the Lexer monad leaves some stuff + -- in there we don't want. + case unPD cmmParse dflags init_state of + PFailed pst -> + return (getMessages pst dflags, Nothing) + POk pst code -> do + st <- initC + let fcode = getCmm $ unEC code "global" (initEnv dflags) [] >> return () + (cmm,_) = runC dflags no_module st fcode + let ms = getMessages pst dflags + if (errorsFound dflags ms) + then return (ms, Nothing) + else return (ms, Just cmm) + where + no_module = panic "parseCmmFile: no module" +} diff --git a/compiler/GHC/Cmm/Pipeline.hs b/compiler/GHC/Cmm/Pipeline.hs new file mode 100644 index 0000000000..6db9e23ee1 --- /dev/null +++ b/compiler/GHC/Cmm/Pipeline.hs @@ -0,0 +1,367 @@ +{-# LANGUAGE BangPatterns #-} + +module GHC.Cmm.Pipeline ( + -- | Converts C-- with an implicit stack and native C-- calls into + -- optimized, CPS converted and native-call-less C--. The latter + -- C-- can be used to generate assembly. 
+ cmmPipeline +) where + +import GhcPrelude + +import GHC.Cmm +import GHC.Cmm.Lint +import GHC.Cmm.Info.Build +import GHC.Cmm.CommonBlockElim +import GHC.Cmm.Switch.Implement +import GHC.Cmm.ProcPoint +import GHC.Cmm.ContFlowOpt +import GHC.Cmm.LayoutStack +import GHC.Cmm.Sink +import GHC.Cmm.Dataflow.Collections + +import UniqSupply +import DynFlags +import ErrUtils +import HscTypes +import Control.Monad +import Outputable +import GHC.Platform + +----------------------------------------------------------------------------- +-- | Top level driver for C-- pipeline +----------------------------------------------------------------------------- + +cmmPipeline + :: HscEnv -- Compilation env including + -- dynamic flags: -dcmm-lint -ddump-cmm-cps + -> ModuleSRTInfo -- Info about SRTs generated so far + -> CmmGroup -- Input C-- with Procedures + -> IO (ModuleSRTInfo, CmmGroup) -- Output CPS transformed C-- + +cmmPipeline hsc_env srtInfo prog = withTimingSilent dflags (text "Cmm pipeline") forceRes $ + do let dflags = hsc_dflags hsc_env + + tops <- {-# SCC "tops" #-} mapM (cpsTop hsc_env) prog + + (srtInfo, cmms) <- {-# SCC "doSRTs" #-} doSRTs dflags srtInfo tops + dumpWith dflags Opt_D_dump_cmm_cps "Post CPS Cmm" FormatCMM (ppr cmms) + + return (srtInfo, cmms) + + where forceRes (info, group) = + info `seq` foldr (\decl r -> decl `seq` r) () group + + dflags = hsc_dflags hsc_env + +cpsTop :: HscEnv -> CmmDecl -> IO (CAFEnv, [CmmDecl]) +cpsTop _ p@(CmmData {}) = return (mapEmpty, [p]) +cpsTop hsc_env proc = + do + ----------- Control-flow optimisations ---------------------------------- + + -- The first round of control-flow optimisation speeds up the + -- later passes by removing lots of empty blocks, so we do it + -- even when optimisation isn't turned on. 
+ -- + CmmProc h l v g <- {-# SCC "cmmCfgOpts(1)" #-} + return $ cmmCfgOptsProc splitting_proc_points proc + dump Opt_D_dump_cmm_cfg "Post control-flow optimisations" g + + let !TopInfo {stack_info=StackInfo { arg_space = entry_off + , do_layout = do_layout }} = h + + ----------- Eliminate common blocks ------------------------------------- + g <- {-# SCC "elimCommonBlocks" #-} + condPass Opt_CmmElimCommonBlocks elimCommonBlocks g + Opt_D_dump_cmm_cbe "Post common block elimination" + + -- Any work storing block Labels must be performed _after_ + -- elimCommonBlocks + + ----------- Implement switches ------------------------------------------ + g <- {-# SCC "createSwitchPlans" #-} + runUniqSM $ cmmImplementSwitchPlans dflags g + dump Opt_D_dump_cmm_switch "Post switch plan" g + + ----------- Proc points ------------------------------------------------- + let call_pps = {-# SCC "callProcPoints" #-} callProcPoints g + proc_points <- + if splitting_proc_points + then do + pp <- {-# SCC "minimalProcPointSet" #-} runUniqSM $ + minimalProcPointSet (targetPlatform dflags) call_pps g + dumpWith dflags Opt_D_dump_cmm_proc "Proc points" + FormatCMM (ppr l $$ ppr pp $$ ppr g) + return pp + else + return call_pps + + ----------- Layout the stack and manifest Sp ---------------------------- + (g, stackmaps) <- + {-# SCC "layoutStack" #-} + if do_layout + then runUniqSM $ cmmLayoutStack dflags proc_points entry_off g + else return (g, mapEmpty) + dump Opt_D_dump_cmm_sp "Layout Stack" g + + ----------- Sink and inline assignments -------------------------------- + g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout] + condPass Opt_CmmSink (cmmSink dflags) g + Opt_D_dump_cmm_sink "Sink assignments" + + ------------- CAF analysis ---------------------------------------------- + let cafEnv = {-# SCC "cafAnal" #-} cafAnal call_pps l g + dumpWith dflags Opt_D_dump_cmm_caf "CAFEnv" FormatText (ppr cafEnv) + + g <- if splitting_proc_points + then do + ------------- Split into separate procedures ----------------------- + let pp_map = {-# SCC "procPointAnalysis" #-} + procPointAnalysis proc_points g + dumpWith dflags Opt_D_dump_cmm_procmap "procpoint map" + FormatCMM (ppr pp_map) + g <- {-# SCC "splitAtProcPoints" #-} runUniqSM $ + splitAtProcPoints dflags l call_pps proc_points pp_map + (CmmProc h l v g) + dumps Opt_D_dump_cmm_split "Post splitting" g + return g + else do + -- attach info tables to return points + return $ [attachContInfoTables call_pps (CmmProc h l v g)] + + ------------- Populate info tables with stack info ----------------- + g <- {-# SCC "setInfoTableStackMap" #-} + return $ map (setInfoTableStackMap dflags stackmaps) g + dumps Opt_D_dump_cmm_info "after setInfoTableStackMap" g + + ----------- Control-flow optimisations ----------------------------- + g <- {-# SCC "cmmCfgOpts(2)" #-} + return $ if optLevel dflags >= 1 + then map (cmmCfgOptsProc splitting_proc_points) g + else g + g <- return (map removeUnreachableBlocksProc g) + -- See Note [unreachable blocks] + dumps Opt_D_dump_cmm_cfg "Post control-flow optimisations" g + + return (cafEnv, g) + + where dflags = hsc_dflags hsc_env + platform = targetPlatform dflags + dump = dumpGraph dflags + + dumps flag name + = mapM_ (dumpWith dflags flag name FormatCMM . ppr) + + condPass flag pass g dumpflag dumpname = + if gopt flag dflags + then do + g <- return $ pass g + dump dumpflag dumpname g + return g + else return g + + -- we don't need to split proc points for the NCG, unless + -- tablesNextToCode is off. 
The latter is because we have no
+        -- label to put on info tables for basic blocks that are not
+        -- the entry point.
+        splitting_proc_points = hscTarget dflags /= HscAsm
+                             || not (tablesNextToCode dflags)
+                             || -- Note [inconsistent-pic-reg]
+                                usingInconsistentPicReg
+        usingInconsistentPicReg
+           = case (platformArch platform, platformOS platform, positionIndependent dflags)
+             of   (ArchX86, OSDarwin, pic) -> pic
+                  _                        -> False
+
+-- Note [Sinking after stack layout]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- In the past we considered running the sinking pass also before stack
+-- layout, but after making some measurements we realized that:
+--
+--   a) running sinking only before stack layout produces slower
+--      code than running sinking only after stack layout
+--
+--   b) running sinking both before and after stack layout produces
+--      code that has the same performance as when running sinking
+--      only after stack layout.
+--
+-- In other words, sinking before stack layout doesn't buy us anything.
+--
+-- An interesting question is "why is it better to run sinking after
+-- stack layout"? It seems that the major reason is the stores and loads
+-- generated by stack layout. Consider this code before stack layout:
+--
+--   c1E:
+--       _c1C::P64 = R3;
+--       _c1B::P64 = R2;
+--       _c1A::P64 = R1;
+--       I64[(young<c1D> + 8)] = c1D;
+--       call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--   c1D:
+--       R3 = _c1C::P64;
+--       R2 = _c1B::P64;
+--       R1 = _c1A::P64;
+--       call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- The stack layout pass will save all local variables live across a call
+-- (_c1C, _c1B and _c1A in this example) on the stack just before
+-- making a call and reload them from the stack after returning from a
+-- call:
+--
+--   c1E:
+--       _c1C::P64 = R3;
+--       _c1B::P64 = R2;
+--       _c1A::P64 = R1;
+--       I64[Sp - 32] = c1D;
+--       P64[Sp - 24] = _c1A::P64;
+--       P64[Sp - 16] = _c1B::P64;
+--       P64[Sp - 8]  = _c1C::P64;
+--       Sp = Sp - 32;
+--       call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--   c1D:
+--       _c1A::P64 = P64[Sp + 8];
+--       _c1B::P64 = P64[Sp + 16];
+--       _c1C::P64 = P64[Sp + 24];
+--       R3 = _c1C::P64;
+--       R2 = _c1B::P64;
+--       R1 = _c1A::P64;
+--       Sp = Sp + 32;
+--       call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- If we don't run the sinking pass after stack layout we are basically
+-- left with such code. However, running sinking on this code can lead
+-- to significant improvements:
+--
+--   c1E:
+--       I64[Sp - 32] = c1D;
+--       P64[Sp - 24] = R1;
+--       P64[Sp - 16] = R2;
+--       P64[Sp - 8]  = R3;
+--       Sp = Sp - 32;
+--       call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--   c1D:
+--       R3 = P64[Sp + 24];
+--       R2 = P64[Sp + 16];
+--       R1 = P64[Sp + 8];
+--       Sp = Sp + 32;
+--       call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Now we only have 9 assignments instead of 15.
+--
+-- There is one case when running sinking before stack layout could
+-- be beneficial. Consider this:
+--
+--   L1:
+--      x = y
+--      call f() returns L2
+--   L2: ...x...y...
+--
+-- Since both x and y are live across a call to f, they will be stored
+-- on the stack during stack layout and restored after the call:
+--
+--   L1:
+--      x = y
+--      P64[Sp - 24] = L2
+--      P64[Sp - 16] = x
+--      P64[Sp - 8]  = y
+--      Sp = Sp - 24
+--      call f() returns L2
+--   L2:
+--      y = P64[Sp + 16]
+--      x = P64[Sp + 8]
+--      Sp = Sp + 24
+--      ...x...y...
+--
+-- However, if we run sinking before stack layout we would propagate x
+-- to its usage place (both x and y must be local registers for this to
+-- be possible - global registers cannot be floated past a call):
+--
+--   L1:
+--      x = y
+--      call f() returns L2
+--   L2: ...y...y...
+--
+-- This makes x dead at the call to f(). If we ran stack layout now
+-- we would generate fewer stores and loads:
+--
+--   L1:
+--      x = y
+--      P64[Sp - 16] = L2
+--      P64[Sp - 8]  = y
+--      Sp = Sp - 16
+--      call f() returns L2
+--   L2:
+--      y = P64[Sp + 8]
+--      Sp = Sp + 16
+--      ...y...y...
+--
+-- But since we don't see any benefits from running sinking before stack
+-- layout, this situation probably doesn't arise too often in practice.
+--
+
+{- Note [inconsistent-pic-reg]
+
+On x86/Darwin, PIC is implemented by inserting a sequence like
+
+    call 1f
+ 1: popl %reg
+
+at the proc entry point, and then referring to labels as offsets from
+%reg.  If we don't split proc points, then we could have many entry
+points in a proc that would need this sequence, and each entry point
+would then get a different value for %reg.  If there are any join
+points, then at the join point we don't have a consistent value for
+%reg, so we don't know how to refer to labels.
+
+Hence, on x86/Darwin, we have to split proc points, and then each proc
+point will get its own PIC initialisation sequence.
+
+This isn't an issue on x86/ELF, where the sequence is
+
+    call 1f
+ 1: popl %reg
+    addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %reg
+
+so %reg always has a consistent value: the address of
+_GLOBAL_OFFSET_TABLE_, regardless of which entry point we arrived via.
+
+-}
+
+{- Note [unreachable blocks]
+
+The control-flow optimiser sometimes leaves unreachable blocks behind
+containing junk code.  These aren't necessarily a problem, but
+removing them is good because it might save time in the native code
+generator later.
+
+-}
+
+runUniqSM :: UniqSM a -> IO a
+runUniqSM m = do
+  us <- mkSplitUniqSupply 'u'
+  return (initUs_ us m)
+
+
+dumpGraph :: DynFlags -> DumpFlag -> String -> CmmGraph -> IO ()
+dumpGraph dflags flag name g = do
+  when (gopt Opt_DoCmmLinting dflags) $ do_lint g
+  dumpWith dflags flag name FormatCMM (ppr g)
+ where
+  do_lint g = case cmmLintGraph dflags g of
+                 Just err -> do { fatalErrorMsg dflags err
+                                ; ghcExit dflags 1
+                                }
+                 Nothing  -> return ()
+
+dumpWith :: DynFlags -> DumpFlag -> String -> DumpFormat -> SDoc -> IO ()
+dumpWith dflags flag txt fmt sdoc = do
+  dumpIfSet_dyn dflags flag txt fmt sdoc
+  when (not (dopt flag dflags)) $
+    -- If `-ddump-cmm-verbose -ddump-to-file` is specified,
+    -- dump each Cmm pipeline stage output to a separate file. See #16930.
+    when (dopt Opt_D_dump_cmm_verbose dflags)
+      $ dumpAction dflags (mkDumpStyle dflags alwaysQualify)
+          (dumpOptionsFromFlag flag) txt fmt sdoc
+  dumpIfSet_dyn dflags Opt_D_dump_cmm_verbose_by_proc txt fmt sdoc
diff --git a/compiler/GHC/Cmm/Ppr.hs b/compiler/GHC/Cmm/Ppr.hs
new file mode 100644
index 0000000000..891cbd9c6d
--- /dev/null
+++ b/compiler/GHC/Cmm/Ppr.hs
@@ -0,0 +1,309 @@
+{-# LANGUAGE GADTs, TypeFamilies, FlexibleContexts, FlexibleInstances #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+----------------------------------------------------------------------------
+--
+-- Pretty-printing of Cmm as (a superset of) C--
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-----------------------------------------------------------------------------
+--
+-- This is where we walk over CmmNode emitting an external representation,
+-- suitable for parsing, in a syntax strongly reminiscent of C--. This
+-- is the "External Core" for the Cmm layer.
+--
+-- As such, this should be a well-defined syntax: we want it to look nice.
+-- Thus, we try wherever possible to use syntax defined in [1],
+-- "The C-- Reference Manual", http://www.cs.tufts.edu/~nr/c--/index.html. We
+-- differ slightly, in some cases. For one, we use I8 .. I64 for types, rather
+-- than C--'s bits8 .. bits64.
+--
+-- We try to ensure that all information available in the abstract
+-- syntax is reproduced, or reproducible, in the concrete syntax.
+-- Data that is not printed out can be reconstructed according to
+-- conventions used in the pretty printer. There are at least two such
+-- cases:
+--   1) if a value has wordRep type, the type is not appended in the
+--      output.
+--   2) MachOps that operate over wordRep type are printed in a
+--      C-style, rather than as their internal MachRep name.
+--
+-- These conventions produce much more readable Cmm output.
+-- +-- A useful example pass over Cmm is in nativeGen/MachCodeGen.hs + +module GHC.Cmm.Ppr + ( module GHC.Cmm.Ppr.Decl + , module GHC.Cmm.Ppr.Expr + ) +where + +import GhcPrelude hiding (succ) + +import GHC.Cmm.CLabel +import GHC.Cmm +import GHC.Cmm.Utils +import GHC.Cmm.Switch +import DynFlags +import FastString +import Outputable +import GHC.Cmm.Ppr.Decl +import GHC.Cmm.Ppr.Expr +import Util + +import BasicTypes +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Graph + +------------------------------------------------- +-- Outputable instances + +instance Outputable CmmStackInfo where + ppr = pprStackInfo + +instance Outputable CmmTopInfo where + ppr = pprTopInfo + + +instance Outputable (CmmNode e x) where + ppr = pprNode + +instance Outputable Convention where + ppr = pprConvention + +instance Outputable ForeignConvention where + ppr = pprForeignConvention + +instance Outputable ForeignTarget where + ppr = pprForeignTarget + +instance Outputable CmmReturnInfo where + ppr = pprReturnInfo + +instance Outputable (Block CmmNode C C) where + ppr = pprBlock +instance Outputable (Block CmmNode C O) where + ppr = pprBlock +instance Outputable (Block CmmNode O C) where + ppr = pprBlock +instance Outputable (Block CmmNode O O) where + ppr = pprBlock + +instance Outputable (Graph CmmNode e x) where + ppr = pprGraph + +instance Outputable CmmGraph where + ppr = pprCmmGraph + +---------------------------------------------------------- +-- Outputting types Cmm contains + +pprStackInfo :: CmmStackInfo -> SDoc +pprStackInfo (StackInfo {arg_space=arg_space, updfr_space=updfr_space}) = + text "arg_space: " <> ppr arg_space <+> + text "updfr_space: " <> ppr updfr_space + +pprTopInfo :: CmmTopInfo -> SDoc +pprTopInfo (TopInfo {info_tbls=info_tbl, stack_info=stack_info}) = + vcat [text "info_tbls: " <> ppr info_tbl, + text "stack_info: " <> ppr stack_info] + +---------------------------------------------------------- +-- Outputting blocks and graphs + +pprBlock :: IndexedCO x SDoc SDoc ~ SDoc + => Block CmmNode e x -> IndexedCO e SDoc SDoc +pprBlock block + = foldBlockNodesB3 ( ($$) . ppr + , ($$) . (nest 4) . ppr + , ($$) . (nest 4) . ppr + ) + block + empty + +pprGraph :: Graph CmmNode e x -> SDoc +pprGraph GNil = empty +pprGraph (GUnit block) = ppr block +pprGraph (GMany entry body exit) + = text "{" + $$ nest 2 (pprMaybeO entry $$ (vcat $ map ppr $ bodyToBlockList body) $$ pprMaybeO exit) + $$ text "}" + where pprMaybeO :: Outputable (Block CmmNode e x) + => MaybeO ex (Block CmmNode e x) -> SDoc + pprMaybeO NothingO = empty + pprMaybeO (JustO block) = ppr block + +pprCmmGraph :: CmmGraph -> SDoc +pprCmmGraph g + = text "{" <> text "offset" + $$ nest 2 (vcat $ map ppr blocks) + $$ text "}" + where blocks = revPostorder g + -- revPostorder has the side-effect of discarding unreachable code, + -- so pretty-printed Cmm will omit any unreachable blocks. This can + -- sometimes be confusing. 
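As a small usage sketch (hypothetical, not part of this patch): with the orphan instances above in scope, a whole graph renders like any other Outputable value, producing the "{ offset ... }" layout emitted by pprCmmGraph; cmmGraphToString is an illustrative name, and showSDoc comes from Outputable:

    -- Hypothetical helper: render a CmmGraph to a String through the
    -- Outputable instance defined above.
    cmmGraphToString :: DynFlags -> CmmGraph -> String
    cmmGraphToString dflags g = showSDoc dflags (ppr g)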
+ +--------------------------------------------- +-- Outputting CmmNode and types which it contains + +pprConvention :: Convention -> SDoc +pprConvention (NativeNodeCall {}) = text "<native-node-call-convention>" +pprConvention (NativeDirectCall {}) = text "<native-direct-call-convention>" +pprConvention (NativeReturn {}) = text "<native-ret-convention>" +pprConvention Slow = text "<slow-convention>" +pprConvention GC = text "<gc-convention>" + +pprForeignConvention :: ForeignConvention -> SDoc +pprForeignConvention (ForeignConvention c args res ret) = + doubleQuotes (ppr c) <+> text "arg hints: " <+> ppr args <+> text " result hints: " <+> ppr res <+> ppr ret + +pprReturnInfo :: CmmReturnInfo -> SDoc +pprReturnInfo CmmMayReturn = empty +pprReturnInfo CmmNeverReturns = text "never returns" + +pprForeignTarget :: ForeignTarget -> SDoc +pprForeignTarget (ForeignTarget fn c) = ppr c <+> ppr_target fn + where + ppr_target :: CmmExpr -> SDoc + ppr_target t@(CmmLit _) = ppr t + ppr_target fn' = parens (ppr fn') + +pprForeignTarget (PrimTarget op) + -- HACK: We're just using a ForeignLabel to get this printed, the label + -- might not really be foreign. + = ppr + (CmmLabel (mkForeignLabel + (mkFastString (show op)) + Nothing ForeignLabelInThisPackage IsFunction)) + +pprNode :: CmmNode e x -> SDoc +pprNode node = pp_node <+> pp_debug + where + pp_node :: SDoc + pp_node = sdocWithDynFlags $ \dflags -> case node of + -- label: + CmmEntry id tscope -> lbl <> colon <+> + (sdocWithDynFlags $ \dflags -> + ppUnless (gopt Opt_SuppressTicks dflags) (text "//" <+> ppr tscope)) + where + lbl = if gopt Opt_SuppressUniques dflags + then text "_lbl_" + else ppr id + + -- // text + CmmComment s -> text "//" <+> ftext s + + -- //tick bla<...> + CmmTick t -> ppUnless (gopt Opt_SuppressTicks dflags) $ + text "//tick" <+> ppr t + + -- unwind reg = expr; + CmmUnwind regs -> + text "unwind " + <> commafy (map (\(r,e) -> ppr r <+> char '=' <+> ppr e) regs) <> semi + + -- reg = expr; + CmmAssign reg expr -> ppr reg <+> equals <+> ppr expr <> semi + + -- rep[lv] = expr; + CmmStore lv expr -> rep <> brackets(ppr lv) <+> equals <+> ppr expr <> semi + where + rep = sdocWithDynFlags $ \dflags -> + ppr ( cmmExprType dflags expr ) + + -- call "ccall" foo(x, y)[r1, r2]; + -- ToDo ppr volatile + CmmUnsafeForeignCall target results args -> + hsep [ ppUnless (null results) $ + parens (commafy $ map ppr results) <+> equals, + text "call", + ppr target <> parens (commafy $ map ppr args) <> semi] + + -- goto label; + CmmBranch ident -> text "goto" <+> ppr ident <> semi + + -- if (expr) goto t; else goto f; + CmmCondBranch expr t f l -> + hsep [ text "if" + , parens(ppr expr) + , case l of + Nothing -> empty + Just b -> parens (text "likely:" <+> ppr b) + , text "goto" + , ppr t <> semi + , text "else goto" + , ppr f <> semi + ] + + CmmSwitch expr ids -> + hang (hsep [ text "switch" + , range + , if isTrivialCmmExpr expr + then ppr expr + else parens (ppr expr) + , text "{" + ]) + 4 (vcat (map ppCase cases) $$ def) $$ rbrace + where + (cases, mbdef) = switchTargetsFallThrough ids + ppCase (is,l) = hsep + [ text "case" + , commafy $ map integer is + , text ": goto" + , ppr l <> semi + ] + def | Just l <- mbdef = hsep + [ text "default:" + , braces (text "goto" <+> ppr l <> semi) + ] + | otherwise = empty + + range = brackets $ hsep [integer lo, text "..", integer hi] + where (lo,hi) = switchTargetsRange ids + + CmmCall tgt k regs out res updfr_off -> + hcat [ text "call", space + , pprFun tgt, parens (interpp'SP regs), space + , 
returns <+>
+                 text "args: " <> ppr out <> comma <+>
+                 text "res: " <> ppr res <> comma <+>
+                 text "upd: " <> ppr updfr_off
+             , semi ]
+          where pprFun f@(CmmLit _) = ppr f
+                pprFun f = parens (ppr f)
+
+                returns
+                  | Just r <- k = text "returns to" <+> ppr r <> comma
+                  | otherwise   = empty
+
+      CmmForeignCall {tgt=t, res=rs, args=as, succ=s, ret_args=a, ret_off=u, intrbl=i} ->
+          hcat $ (if i then [text "interruptible", space] else []) ++
+               [ text "foreign call", space
+               , ppr t, text "(...)", space
+               , text "returns to" <+> ppr s
+                    <+> text "args:" <+> parens (ppr as)
+                    <+> text "ress:" <+> parens (ppr rs)
+               , text "ret_args:" <+> ppr a
+               , text "ret_off:" <+> ppr u
+               , semi ]
+
+    pp_debug :: SDoc
+    pp_debug =
+      if not debugIsOn then empty
+      else case node of
+             CmmEntry {}             -> empty -- Looks terrible with text "  // CmmEntry"
+             CmmComment {}           -> empty -- Looks also terrible with text "  // CmmComment"
+             CmmTick {}              -> empty
+             CmmUnwind {}            -> text "  // CmmUnwind"
+             CmmAssign {}            -> text "  // CmmAssign"
+             CmmStore {}             -> text "  // CmmStore"
+             CmmUnsafeForeignCall {} -> text "  // CmmUnsafeForeignCall"
+             CmmBranch {}            -> text "  // CmmBranch"
+             CmmCondBranch {}        -> text "  // CmmCondBranch"
+             CmmSwitch {}            -> text "  // CmmSwitch"
+             CmmCall {}              -> text "  // CmmCall"
+             CmmForeignCall {}       -> text "  // CmmForeignCall"
+
+    commafy :: [SDoc] -> SDoc
+    commafy xs = hsep $ punctuate comma xs
diff --git a/compiler/GHC/Cmm/Ppr/Decl.hs b/compiler/GHC/Cmm/Ppr/Decl.hs
new file mode 100644
index 0000000000..2544e6a0d3
--- /dev/null
+++ b/compiler/GHC/Cmm/Ppr/Decl.hs
@@ -0,0 +1,169 @@
+----------------------------------------------------------------------------
+--
+-- Pretty-printing of common Cmm types
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-----------------------------------------------------------------------------
+
+--
+-- This is where we walk over Cmm emitting an external representation,
+-- suitable for parsing, in a syntax strongly reminiscent of C--. This
+-- is the "External Core" for the Cmm layer.
+--
+-- As such, this should be a well-defined syntax: we want it to look nice.
+-- Thus, we try wherever possible to use syntax defined in [1],
+-- "The C-- Reference Manual", http://www.cs.tufts.edu/~nr/c--/index.html. We
+-- differ slightly, in some cases. For one, we use I8 .. I64 for types, rather
+-- than C--'s bits8 .. bits64.
+--
+-- We try to ensure that all information available in the abstract
+-- syntax is reproduced, or reproducible, in the concrete syntax.
+-- Data that is not printed out can be reconstructed according to
+-- conventions used in the pretty printer. There are at least two such
+-- cases:
+--   1) if a value has wordRep type, the type is not appended in the
+--      output.
+--   2) MachOps that operate over wordRep type are printed in a
+--      C-style, rather than as their internal MachRep name.
+--
+-- These conventions produce much more readable Cmm output.
+-- +-- A useful example pass over Cmm is in nativeGen/MachCodeGen.hs +-- + +{-# OPTIONS_GHC -fno-warn-orphans #-} +module GHC.Cmm.Ppr.Decl + ( writeCmms, pprCmms, pprCmmGroup, pprSection, pprStatic + ) +where + +import GhcPrelude + +import GHC.Cmm.Ppr.Expr +import GHC.Cmm + +import DynFlags +import Outputable +import FastString + +import Data.List +import System.IO + +import qualified Data.ByteString as BS + + +pprCmms :: (Outputable info, Outputable g) + => [GenCmmGroup CmmStatics info g] -> SDoc +pprCmms cmms = pprCode CStyle (vcat (intersperse separator $ map ppr cmms)) + where + separator = space $$ text "-------------------" $$ space + +writeCmms :: (Outputable info, Outputable g) + => DynFlags -> Handle -> [GenCmmGroup CmmStatics info g] -> IO () +writeCmms dflags handle cmms = printForC dflags handle (pprCmms cmms) + +----------------------------------------------------------------------------- + +instance (Outputable d, Outputable info, Outputable i) + => Outputable (GenCmmDecl d info i) where + ppr t = pprTop t + +instance Outputable CmmStatics where + ppr = pprStatics + +instance Outputable CmmStatic where + ppr = pprStatic + +instance Outputable CmmInfoTable where + ppr = pprInfoTable + + +----------------------------------------------------------------------------- + +pprCmmGroup :: (Outputable d, Outputable info, Outputable g) + => GenCmmGroup d info g -> SDoc +pprCmmGroup tops + = vcat $ intersperse blankLine $ map pprTop tops + +-- -------------------------------------------------------------------------- +-- Top level `procedure' blocks. +-- +pprTop :: (Outputable d, Outputable info, Outputable i) + => GenCmmDecl d info i -> SDoc + +pprTop (CmmProc info lbl live graph) + + = vcat [ ppr lbl <> lparen <> rparen <+> lbrace <+> text "// " <+> ppr live + , nest 8 $ lbrace <+> ppr info $$ rbrace + , nest 4 $ ppr graph + , rbrace ] + +-- -------------------------------------------------------------------------- +-- We follow [1], 4.5 +-- +-- section "data" { ... } +-- +pprTop (CmmData section ds) = + (hang (pprSection section <+> lbrace) 4 (ppr ds)) + $$ rbrace + +-- -------------------------------------------------------------------------- +-- Info tables. + +pprInfoTable :: CmmInfoTable -> SDoc +pprInfoTable (CmmInfoTable { cit_lbl = lbl, cit_rep = rep + , cit_prof = prof_info + , cit_srt = srt }) + = vcat [ text "label: " <> ppr lbl + , text "rep: " <> ppr rep + , case prof_info of + NoProfilingInfo -> empty + ProfilingInfo ct cd -> + vcat [ text "type: " <> text (show (BS.unpack ct)) + , text "desc: " <> text (show (BS.unpack cd)) ] + , text "srt: " <> ppr srt ] + +instance Outputable ForeignHint where + ppr NoHint = empty + ppr SignedHint = quotes(text "signed") +-- ppr AddrHint = quotes(text "address") +-- Temp Jan08 + ppr AddrHint = (text "PtrHint") + +-- -------------------------------------------------------------------------- +-- Static data. 
+-- Strings are printed as C strings, and we print them as I8[],
+-- following C--
+--
+pprStatics :: CmmStatics -> SDoc
+pprStatics (Statics lbl ds) = vcat ((ppr lbl <> colon) : map ppr ds)
+
+pprStatic :: CmmStatic -> SDoc
+pprStatic s = case s of
+    CmmStaticLit lit   -> nest 4 $ text "const" <+> pprLit lit <> semi
+    CmmUninitialised i -> nest 4 $ text "I8" <> brackets (int i)
+    CmmString s'       -> nest 4 $ text "I8[]" <+> text (show s')
+
+-- --------------------------------------------------------------------------
+-- data sections
+--
+pprSection :: Section -> SDoc
+pprSection (Section t suffix) =
+  section <+> doubleQuotes (pprSectionType t <+> char '.' <+> ppr suffix)
+  where
+    section = text "section"
+
+pprSectionType :: SectionType -> SDoc
+pprSectionType s = doubleQuotes (ptext t)
+ where
+  t = case s of
+    Text                    -> sLit "text"
+    Data                    -> sLit "data"
+    ReadOnlyData            -> sLit "readonly"
+    ReadOnlyData16          -> sLit "readonly16"
+    RelocatableReadOnlyData -> sLit "relreadonly"
+    UninitialisedData       -> sLit "uninitialised"
+    CString                 -> sLit "cstring"
+    OtherSection s'         -> sLit s' -- Not actually a literal though.
diff --git a/compiler/GHC/Cmm/Ppr/Expr.hs b/compiler/GHC/Cmm/Ppr/Expr.hs
new file mode 100644
index 0000000000..53a335e561
--- /dev/null
+++ b/compiler/GHC/Cmm/Ppr/Expr.hs
@@ -0,0 +1,286 @@
+----------------------------------------------------------------------------
+--
+-- Pretty-printing of common Cmm types
+--
+-- (c) The University of Glasgow 2004-2006
+--
+-----------------------------------------------------------------------------
+
+--
+-- This is where we walk over Cmm emitting an external representation,
+-- suitable for parsing, in a syntax strongly reminiscent of C--. This
+-- is the "External Core" for the Cmm layer.
+--
+-- As such, this should be a well-defined syntax: we want it to look nice.
+-- Thus, we try wherever possible to use syntax defined in [1],
+-- "The C-- Reference Manual", http://www.cs.tufts.edu/~nr/c--/index.html. We
+-- differ slightly, in some cases. For one, we use I8 .. I64 for types, rather
+-- than C--'s bits8 .. bits64.
+--
+-- We try to ensure that all information available in the abstract
+-- syntax is reproduced, or reproducible, in the concrete syntax.
+-- Data that is not printed out can be reconstructed according to
+-- conventions used in the pretty printer. There are at least two such
+-- cases:
+--   1) if a value has wordRep type, the type is not appended in the
+--      output.
+--   2) MachOps that operate over wordRep type are printed in a
+--      C-style, rather than as their internal MachRep name.
+--
+-- These conventions produce much more readable Cmm output.
+-- +-- A useful example pass over Cmm is in nativeGen/MachCodeGen.hs +-- + +{-# OPTIONS_GHC -fno-warn-orphans #-} +module GHC.Cmm.Ppr.Expr + ( pprExpr, pprLit + ) +where + +import GhcPrelude + +import GHC.Cmm.Expr + +import Outputable +import DynFlags + +import Data.Maybe +import Numeric ( fromRat ) + +----------------------------------------------------------------------------- + +instance Outputable CmmExpr where + ppr e = pprExpr e + +instance Outputable CmmReg where + ppr e = pprReg e + +instance Outputable CmmLit where + ppr l = pprLit l + +instance Outputable LocalReg where + ppr e = pprLocalReg e + +instance Outputable Area where + ppr e = pprArea e + +instance Outputable GlobalReg where + ppr e = pprGlobalReg e + +-- -------------------------------------------------------------------------- +-- Expressions +-- + +pprExpr :: CmmExpr -> SDoc +pprExpr e + = sdocWithDynFlags $ \dflags -> + case e of + CmmRegOff reg i -> + pprExpr (CmmMachOp (MO_Add rep) + [CmmReg reg, CmmLit (CmmInt (fromIntegral i) rep)]) + where rep = typeWidth (cmmRegType dflags reg) + CmmLit lit -> pprLit lit + _other -> pprExpr1 e + +-- Here's the precedence table from GHC.Cmm.Parser: +-- %nonassoc '>=' '>' '<=' '<' '!=' '==' +-- %left '|' +-- %left '^' +-- %left '&' +-- %left '>>' '<<' +-- %left '-' '+' +-- %left '/' '*' '%' +-- %right '~' + +-- We just cope with the common operators for now, the rest will get +-- a default conservative behaviour. + +-- %nonassoc '>=' '>' '<=' '<' '!=' '==' +pprExpr1, pprExpr7, pprExpr8 :: CmmExpr -> SDoc +pprExpr1 (CmmMachOp op [x,y]) | Just doc <- infixMachOp1 op + = pprExpr7 x <+> doc <+> pprExpr7 y +pprExpr1 e = pprExpr7 e + +infixMachOp1, infixMachOp7, infixMachOp8 :: MachOp -> Maybe SDoc + +infixMachOp1 (MO_Eq _) = Just (text "==") +infixMachOp1 (MO_Ne _) = Just (text "!=") +infixMachOp1 (MO_Shl _) = Just (text "<<") +infixMachOp1 (MO_U_Shr _) = Just (text ">>") +infixMachOp1 (MO_U_Ge _) = Just (text ">=") +infixMachOp1 (MO_U_Le _) = Just (text "<=") +infixMachOp1 (MO_U_Gt _) = Just (char '>') +infixMachOp1 (MO_U_Lt _) = Just (char '<') +infixMachOp1 _ = Nothing + +-- %left '-' '+' +pprExpr7 (CmmMachOp (MO_Add rep1) [x, CmmLit (CmmInt i rep2)]) | i < 0 + = pprExpr7 (CmmMachOp (MO_Sub rep1) [x, CmmLit (CmmInt (negate i) rep2)]) +pprExpr7 (CmmMachOp op [x,y]) | Just doc <- infixMachOp7 op + = pprExpr7 x <+> doc <+> pprExpr8 y +pprExpr7 e = pprExpr8 e + +infixMachOp7 (MO_Add _) = Just (char '+') +infixMachOp7 (MO_Sub _) = Just (char '-') +infixMachOp7 _ = Nothing + +-- %left '/' '*' '%' +pprExpr8 (CmmMachOp op [x,y]) | Just doc <- infixMachOp8 op + = pprExpr8 x <+> doc <+> pprExpr9 y +pprExpr8 e = pprExpr9 e + +infixMachOp8 (MO_U_Quot _) = Just (char '/') +infixMachOp8 (MO_Mul _) = Just (char '*') +infixMachOp8 (MO_U_Rem _) = Just (char '%') +infixMachOp8 _ = Nothing + +pprExpr9 :: CmmExpr -> SDoc +pprExpr9 e = + case e of + CmmLit lit -> pprLit1 lit + CmmLoad expr rep -> ppr rep <> brackets (ppr expr) + CmmReg reg -> ppr reg + CmmRegOff reg off -> parens (ppr reg <+> char '+' <+> int off) + CmmStackSlot a off -> parens (ppr a <+> char '+' <+> int off) + CmmMachOp mop args -> genMachOp mop args + +genMachOp :: MachOp -> [CmmExpr] -> SDoc +genMachOp mop args + | Just doc <- infixMachOp mop = case args of + -- dyadic + [x,y] -> pprExpr9 x <+> doc <+> pprExpr9 y + + -- unary + [x] -> doc <> pprExpr9 x + + _ -> pprTrace "GHC.Cmm.Ppr.Expr.genMachOp: machop with strange number of args" + (pprMachOp mop <+> + parens (hcat $ punctuate comma (map pprExpr args))) + empty + + | isJust 
(infixMachOp1 mop) + || isJust (infixMachOp7 mop) + || isJust (infixMachOp8 mop) = parens (pprExpr (CmmMachOp mop args)) + + | otherwise = char '%' <> ppr_op <> parens (commafy (map pprExpr args)) + where ppr_op = text (map (\c -> if c == ' ' then '_' else c) + (show mop)) + -- replace spaces in (show mop) with underscores, + +-- +-- Unsigned ops on the word size of the machine get nice symbols. +-- All else get dumped in their ugly format. +-- +infixMachOp :: MachOp -> Maybe SDoc +infixMachOp mop + = case mop of + MO_And _ -> Just $ char '&' + MO_Or _ -> Just $ char '|' + MO_Xor _ -> Just $ char '^' + MO_Not _ -> Just $ char '~' + MO_S_Neg _ -> Just $ char '-' -- there is no unsigned neg :) + _ -> Nothing + +-- -------------------------------------------------------------------------- +-- Literals. +-- To minimise line noise we adopt the convention that if the literal +-- has the natural machine word size, we do not append the type +-- +pprLit :: CmmLit -> SDoc +pprLit lit = sdocWithDynFlags $ \dflags -> + case lit of + CmmInt i rep -> + hcat [ (if i < 0 then parens else id)(integer i) + , ppUnless (rep == wordWidth dflags) $ + space <> dcolon <+> ppr rep ] + + CmmFloat f rep -> hsep [ double (fromRat f), dcolon, ppr rep ] + CmmVec lits -> char '<' <> commafy (map pprLit lits) <> char '>' + CmmLabel clbl -> ppr clbl + CmmLabelOff clbl i -> ppr clbl <> ppr_offset i + CmmLabelDiffOff clbl1 clbl2 i _ -> ppr clbl1 <> char '-' + <> ppr clbl2 <> ppr_offset i + CmmBlock id -> ppr id + CmmHighStackMark -> text "<highSp>" + +pprLit1 :: CmmLit -> SDoc +pprLit1 lit@(CmmLabelOff {}) = parens (pprLit lit) +pprLit1 lit = pprLit lit + +ppr_offset :: Int -> SDoc +ppr_offset i + | i==0 = empty + | i>=0 = char '+' <> int i + | otherwise = char '-' <> int (-i) + +-- -------------------------------------------------------------------------- +-- Registers, whether local (temps) or global +-- +pprReg :: CmmReg -> SDoc +pprReg r + = case r of + CmmLocal local -> pprLocalReg local + CmmGlobal global -> pprGlobalReg global + +-- +-- We only print the type of the local reg if it isn't wordRep +-- +pprLocalReg :: LocalReg -> SDoc +pprLocalReg (LocalReg uniq rep) = sdocWithDynFlags $ \dflags -> +-- = ppr rep <> char '_' <> ppr uniq +-- Temp Jan08 + char '_' <> pprUnique dflags uniq <> + (if isWord32 rep -- && not (isGcPtrType rep) -- Temp Jan08 -- sigh + then dcolon <> ptr <> ppr rep + else dcolon <> ptr <> ppr rep) + where + pprUnique dflags unique = + if gopt Opt_SuppressUniques dflags + then text "_locVar_" + else ppr unique + ptr = empty + --if isGcPtrType rep + -- then doubleQuotes (text "ptr") + -- else empty + +-- Stack areas +pprArea :: Area -> SDoc +pprArea Old = text "old" +pprArea (Young id) = hcat [ text "young<", ppr id, text ">" ] + +-- needs to be kept in syn with CmmExpr.hs.GlobalReg +-- +pprGlobalReg :: GlobalReg -> SDoc +pprGlobalReg gr + = case gr of + VanillaReg n _ -> char 'R' <> int n +-- Temp Jan08 +-- VanillaReg n VNonGcPtr -> char 'R' <> int n +-- VanillaReg n VGcPtr -> char 'P' <> int n + FloatReg n -> char 'F' <> int n + DoubleReg n -> char 'D' <> int n + LongReg n -> char 'L' <> int n + XmmReg n -> text "XMM" <> int n + YmmReg n -> text "YMM" <> int n + ZmmReg n -> text "ZMM" <> int n + Sp -> text "Sp" + SpLim -> text "SpLim" + Hp -> text "Hp" + HpLim -> text "HpLim" + MachSp -> text "MachSp" + UnwindReturnReg-> text "UnwindReturnReg" + CCCS -> text "CCCS" + CurrentTSO -> text "CurrentTSO" + CurrentNursery -> text "CurrentNursery" + HpAlloc -> text "HpAlloc" + EagerBlackholeInfo -> text 
"stg_EAGER_BLACKHOLE_info" + GCEnter1 -> text "stg_gc_enter_1" + GCFun -> text "stg_gc_fun" + BaseReg -> text "BaseReg" + PicBaseReg -> text "PicBaseReg" + +----------------------------------------------------------------------------- + +commafy :: [SDoc] -> SDoc +commafy xs = fsep $ punctuate comma xs diff --git a/compiler/GHC/Cmm/ProcPoint.hs b/compiler/GHC/Cmm/ProcPoint.hs new file mode 100644 index 0000000000..00a7a73d89 --- /dev/null +++ b/compiler/GHC/Cmm/ProcPoint.hs @@ -0,0 +1,496 @@ +{-# LANGUAGE GADTs, DisambiguateRecordFields, BangPatterns #-} + +module GHC.Cmm.ProcPoint + ( ProcPointSet, Status(..) + , callProcPoints, minimalProcPointSet + , splitAtProcPoints, procPointAnalysis + , attachContInfoTables + ) +where + +import GhcPrelude hiding (last, unzip, succ, zip) + +import DynFlags +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import GHC.Cmm +import GHC.Cmm.Ppr () -- For Outputable instances +import GHC.Cmm.Utils +import GHC.Cmm.Info +import GHC.Cmm.Liveness +import GHC.Cmm.Switch +import Data.List (sortBy) +import Maybes +import Control.Monad +import Outputable +import GHC.Platform +import UniqSupply +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label + +-- Compute a minimal set of proc points for a control-flow graph. + +-- Determine a protocol for each proc point (which live variables will +-- be passed as arguments and which will be on the stack). + +{- +A proc point is a basic block that, after CPS transformation, will +start a new function. The entry block of the original function is a +proc point, as is the continuation of each function call. +A third kind of proc point arises if we want to avoid copying code. +Suppose we have code like the following: + + f() { + if (...) { ..1..; call foo(); ..2..} + else { ..3..; call bar(); ..4..} + x = y + z; + return x; + } + +The statement 'x = y + z' can be reached from two different proc +points: the continuations of foo() and bar(). We would prefer not to +put a copy in each continuation; instead we would like 'x = y + z' to +be the start of a new procedure to which the continuations can jump: + + f_cps () { + if (...) { ..1..; push k_foo; jump foo_cps(); } + else { ..3..; push k_bar; jump bar_cps(); } + } + k_foo() { ..2..; jump k_join(y, z); } + k_bar() { ..4..; jump k_join(y, z); } + k_join(y, z) { x = y + z; return x; } + +You might think then that a criterion to make a node a proc point is +that it is directly reached by two distinct proc points. (Note +[Direct reachability].) But this criterion is a bit too simple; for +example, 'return x' is also reached by two proc points, yet there is +no point in pulling it out of k_join. A good criterion would be to +say that a node should be made a proc point if it is reached by a set +of proc points that is different than its immediate dominator. NR +believes this criterion can be shown to produce a minimum set of proc +points, and given a dominator tree, the proc points can be chosen in +time linear in the number of blocks. Lacking a dominator analysis, +however, we turn instead to an iterative solution, starting with no +proc points and adding them according to these rules: + + 1. The entry block is a proc point. + 2. The continuation of a call is a proc point. + 3. A node is a proc point if it is directly reached by more proc + points than one of its predecessors. 
+ +Because we don't understand the problem very well, we apply rule 3 at +most once per iteration, then recompute the reachability information. +(See Note [No simple dataflow].) The choice of the new proc point is +arbitrary, and I don't know if the choice affects the final solution, +so I don't know if the number of proc points chosen is the +minimum---but the set will be minimal. + + + +Note [Proc-point analysis] +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Given a specified set of proc-points (a set of block-ids), "proc-point +analysis" figures out, for every block, which proc-point it belongs to. +All the blocks belonging to proc-point P will constitute a single +top-level C procedure. + +A non-proc-point block B "belongs to" a proc-point P iff B is +reachable from P without going through another proc-point. + +Invariant: a block B should belong to at most one proc-point; if it +belongs to two, that's a bug. + +Note [Non-existing proc-points] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On some architectures it might happen that the list of proc-points +computed before stack layout pass will be invalidated by the stack +layout. This will happen if stack layout removes from the graph +blocks that were determined to be proc-points. Later on in the pipeline +we use list of proc-points to perform [Proc-point analysis], but +if a proc-point does not exist anymore then we will get compiler panic. +See #8205. +-} + +type ProcPointSet = LabelSet + +data Status + = ReachedBy ProcPointSet -- set of proc points that directly reach the block + | ProcPoint -- this block is itself a proc point + +instance Outputable Status where + ppr (ReachedBy ps) + | setNull ps = text "<not-reached>" + | otherwise = text "reached by" <+> + (hsep $ punctuate comma $ map ppr $ setElems ps) + ppr ProcPoint = text "<procpt>" + +-------------------------------------------------- +-- Proc point analysis + +-- Once you know what the proc-points are, figure out +-- what proc-points each block is reachable from +-- See Note [Proc-point analysis] +procPointAnalysis :: ProcPointSet -> CmmGraph -> LabelMap Status +procPointAnalysis procPoints cmmGraph@(CmmGraph {g_graph = graph}) = + analyzeCmmFwd procPointLattice procPointTransfer cmmGraph initProcPoints + where + initProcPoints = + mkFactBase + procPointLattice + [ (id, ProcPoint) + | id <- setElems procPoints + -- See Note [Non-existing proc-points] + , id `setMember` labelsInGraph + ] + labelsInGraph = labelsDefined graph + +procPointTransfer :: TransferFun Status +procPointTransfer block facts = + let label = entryLabel block + !fact = case getFact procPointLattice label facts of + ProcPoint -> ReachedBy $! setSingleton label + f -> f + result = map (\id -> (id, fact)) (successors block) + in mkFactBase procPointLattice result + +procPointLattice :: DataflowLattice Status +procPointLattice = DataflowLattice unreached add_to + where + unreached = ReachedBy setEmpty + add_to (OldFact ProcPoint) _ = NotChanged ProcPoint + add_to _ (NewFact ProcPoint) = Changed ProcPoint -- because of previous case + add_to (OldFact (ReachedBy p)) (NewFact (ReachedBy p')) + | setSize union > setSize p = Changed (ReachedBy union) + | otherwise = NotChanged (ReachedBy p) + where + union = setUnion p' p + +---------------------------------------------------------------------- + +-- It is worth distinguishing two sets of proc points: those that are +-- induced by calls in the original graph and those that are +-- introduced because they're reachable from multiple proc points. 
+-- +-- Extract the set of Continuation BlockIds, see Note [Continuation BlockIds]. +callProcPoints :: CmmGraph -> ProcPointSet +callProcPoints g = foldlGraphBlocks add (setSingleton (g_entry g)) g + where add :: LabelSet -> CmmBlock -> LabelSet + add set b = case lastNode b of + CmmCall {cml_cont = Just k} -> setInsert k set + CmmForeignCall {succ=k} -> setInsert k set + _ -> set + +minimalProcPointSet :: Platform -> ProcPointSet -> CmmGraph + -> UniqSM ProcPointSet +-- Given the set of successors of calls (which must be proc-points) +-- figure out the minimal set of necessary proc-points +minimalProcPointSet platform callProcPoints g + = extendPPSet platform g (revPostorder g) callProcPoints + +extendPPSet + :: Platform -> CmmGraph -> [CmmBlock] -> ProcPointSet -> UniqSM ProcPointSet +extendPPSet platform g blocks procPoints = + let env = procPointAnalysis procPoints g + add pps block = let id = entryLabel block + in case mapLookup id env of + Just ProcPoint -> setInsert id pps + _ -> pps + procPoints' = foldlGraphBlocks add setEmpty g + newPoints = mapMaybe ppSuccessor blocks + newPoint = listToMaybe newPoints + ppSuccessor b = + let nreached id = case mapLookup id env `orElse` + pprPanic "no ppt" (ppr id <+> ppr b) of + ProcPoint -> 1 + ReachedBy ps -> setSize ps + block_procpoints = nreached (entryLabel b) + -- | Looking for a successor of b that is reached by + -- more proc points than b and is not already a proc + -- point. If found, it can become a proc point. + newId succ_id = not (setMember succ_id procPoints') && + nreached succ_id > block_procpoints + in listToMaybe $ filter newId $ successors b + + in case newPoint of + Just id -> + if setMember id procPoints' + then panic "added old proc pt" + else extendPPSet platform g blocks (setInsert id procPoints') + Nothing -> return procPoints' + + +-- At this point, we have found a set of procpoints, each of which should be +-- the entry point of a procedure. +-- Now, we create the procedure for each proc point, +-- which requires that we: +-- 1. build a map from proc points to the blocks reachable from the proc point +-- 2. turn each branch to a proc point into a jump +-- 3. turn calls and returns into jumps +-- 4. build info tables for the procedures -- and update the info table for +-- the SRTs in the entry procedure as well. +-- Input invariant: A block should only be reachable from a single ProcPoint. +-- ToDo: use the _ret naming convention that the old code generator +-- used. 
-- EZY +splitAtProcPoints :: DynFlags -> CLabel -> ProcPointSet-> ProcPointSet -> LabelMap Status -> + CmmDecl -> UniqSM [CmmDecl] +splitAtProcPoints dflags entry_label callPPs procPoints procMap + (CmmProc (TopInfo {info_tbls = info_tbls}) + top_l _ g@(CmmGraph {g_entry=entry})) = + do -- Build a map from procpoints to the blocks they reach + let add_block + :: LabelMap (LabelMap CmmBlock) + -> CmmBlock + -> LabelMap (LabelMap CmmBlock) + add_block graphEnv b = + case mapLookup bid procMap of + Just ProcPoint -> add graphEnv bid bid b + Just (ReachedBy set) -> + case setElems set of + [] -> graphEnv + [id] -> add graphEnv id bid b + _ -> panic "Each block should be reachable from only one ProcPoint" + Nothing -> graphEnv + where bid = entryLabel b + add graphEnv procId bid b = mapInsert procId graph' graphEnv + where graph = mapLookup procId graphEnv `orElse` mapEmpty + graph' = mapInsert bid b graph + + let liveness = cmmGlobalLiveness dflags g + let ppLiveness pp = filter isArgReg $ + regSetToList $ + expectJust "ppLiveness" $ mapLookup pp liveness + + graphEnv <- return $ foldlGraphBlocks add_block mapEmpty g + + -- Build a map from proc point BlockId to pairs of: + -- * Labels for their new procedures + -- * Labels for the info tables of their new procedures (only if + -- the proc point is a callPP) + -- Due to common blockification, we may overestimate the set of procpoints. + let add_label map pp = mapInsert pp lbls map + where lbls | pp == entry = (entry_label, fmap cit_lbl (mapLookup entry info_tbls)) + | otherwise = (block_lbl, guard (setMember pp callPPs) >> + Just info_table_lbl) + where block_lbl = blockLbl pp + info_table_lbl = infoTblLbl pp + + procLabels :: LabelMap (CLabel, Maybe CLabel) + procLabels = foldl' add_label mapEmpty + (filter (flip mapMember (toBlockMap g)) (setElems procPoints)) + + -- In each new graph, add blocks jumping off to the new procedures, + -- and replace branches to procpoints with branches to the jump-off blocks + let add_jump_block + :: (LabelMap Label, [CmmBlock]) + -> (Label, CLabel) + -> UniqSM (LabelMap Label, [CmmBlock]) + add_jump_block (env, bs) (pp, l) = + do bid <- liftM mkBlockId getUniqueM + let b = blockJoin (CmmEntry bid GlobalScope) emptyBlock jump + live = ppLiveness pp + jump = CmmCall (CmmLit (CmmLabel l)) Nothing live 0 0 0 + return (mapInsert pp bid env, b : bs) + + add_jumps + :: LabelMap CmmGraph + -> (Label, LabelMap CmmBlock) + -> UniqSM (LabelMap CmmGraph) + add_jumps newGraphEnv (ppId, blockEnv) = + do let needed_jumps = -- find which procpoints we currently branch to + mapFoldr add_if_branch_to_pp [] blockEnv + add_if_branch_to_pp :: CmmBlock -> [(BlockId, CLabel)] -> [(BlockId, CLabel)] + add_if_branch_to_pp block rst = + case lastNode block of + CmmBranch id -> add_if_pp id rst + CmmCondBranch _ ti fi _ -> add_if_pp ti (add_if_pp fi rst) + CmmSwitch _ ids -> foldr add_if_pp rst $ switchTargetsToList ids + _ -> rst + + -- when jumping to a PP that has an info table, if + -- tablesNextToCode is off we must jump to the entry + -- label instead. 
+ jump_label (Just info_lbl) _ + | tablesNextToCode dflags = info_lbl + | otherwise = toEntryLbl info_lbl + jump_label Nothing block_lbl = block_lbl + + add_if_pp id rst = case mapLookup id procLabels of + Just (lbl, mb_info_lbl) -> (id, jump_label mb_info_lbl lbl) : rst + Nothing -> rst + (jumpEnv, jumpBlocks) <- + foldM add_jump_block (mapEmpty, []) needed_jumps + -- update the entry block + let b = expectJust "block in env" $ mapLookup ppId blockEnv + blockEnv' = mapInsert ppId b blockEnv + -- replace branches to procpoints with branches to jumps + blockEnv'' = toBlockMap $ replaceBranches jumpEnv $ ofBlockMap ppId blockEnv' + -- add the jump blocks to the graph + blockEnv''' = foldl' (flip addBlock) blockEnv'' jumpBlocks + let g' = ofBlockMap ppId blockEnv''' + -- pprTrace "g' pre jumps" (ppr g') $ do + return (mapInsert ppId g' newGraphEnv) + + graphEnv <- foldM add_jumps mapEmpty $ mapToList graphEnv + + let to_proc (bid, g) + | bid == entry + = CmmProc (TopInfo {info_tbls = info_tbls, + stack_info = stack_info}) + top_l live g' + | otherwise + = case expectJust "pp label" $ mapLookup bid procLabels of + (lbl, Just info_lbl) + -> CmmProc (TopInfo { info_tbls = mapSingleton (g_entry g) (mkEmptyContInfoTable info_lbl) + , stack_info=stack_info}) + lbl live g' + (lbl, Nothing) + -> CmmProc (TopInfo {info_tbls = mapEmpty, stack_info=stack_info}) + lbl live g' + where + g' = replacePPIds g + live = ppLiveness (g_entry g') + stack_info = StackInfo { arg_space = 0 + , updfr_space = Nothing + , do_layout = True } + -- cannot use panic, this is printed by -ddump-cmm + + -- References to procpoint IDs can now be replaced with the + -- infotable's label + replacePPIds g = {-# SCC "replacePPIds" #-} + mapGraphNodes (id, mapExp repl, mapExp repl) g + where repl e@(CmmLit (CmmBlock bid)) = + case mapLookup bid procLabels of + Just (_, Just info_lbl) -> CmmLit (CmmLabel info_lbl) + _ -> e + repl e = e + + -- The C back end expects to see return continuations before the + -- call sites. Here, we sort them in reverse order -- it gets + -- reversed later. + let (_, block_order) = + foldl' add_block_num (0::Int, mapEmpty :: LabelMap Int) + (revPostorder g) + add_block_num (i, map) block = + (i + 1, mapInsert (entryLabel block) i map) + sort_fn (bid, _) (bid', _) = + compare (expectJust "block_order" $ mapLookup bid block_order) + (expectJust "block_order" $ mapLookup bid' block_order) + procs <- return $ map to_proc $ sortBy sort_fn $ mapToList graphEnv + return -- pprTrace "procLabels" (ppr procLabels) + -- pprTrace "splitting graphs" (ppr procs) + procs +splitAtProcPoints _ _ _ _ _ t@(CmmData _ _) = return [t] + +-- Only called from GHC.Cmm.ProcPoint.splitAtProcPoints. NB. does a +-- recursive lookup, see comment below. +replaceBranches :: LabelMap BlockId -> CmmGraph -> CmmGraph +replaceBranches env cmmg + = {-# SCC "replaceBranches" #-} + ofBlockMap (g_entry cmmg) $ mapMap f $ toBlockMap cmmg + where + f block = replaceLastNode block $ last (lastNode block) + + last :: CmmNode O C -> CmmNode O C + last (CmmBranch id) = CmmBranch (lookup id) + last (CmmCondBranch e ti fi l) = CmmCondBranch e (lookup ti) (lookup fi) l + last (CmmSwitch e ids) = CmmSwitch e (mapSwitchTargets lookup ids) + last l@(CmmCall {}) = l { cml_cont = Nothing } + -- NB. remove the continuation of a CmmCall, since this + -- label will now be in a different CmmProc. Not only + -- is this tidier, it stops CmmLint from complaining. 
+ last l@(CmmForeignCall {}) = l + lookup id = fmap lookup (mapLookup id env) `orElse` id + -- XXX: this is a recursive lookup, it follows chains + -- until the lookup returns Nothing, at which point we + -- return the last BlockId + +-- -------------------------------------------------------------- +-- Not splitting proc points: add info tables for continuations + +attachContInfoTables :: ProcPointSet -> CmmDecl -> CmmDecl +attachContInfoTables call_proc_points (CmmProc top_info top_l live g) + = CmmProc top_info{info_tbls = info_tbls'} top_l live g + where + info_tbls' = mapUnion (info_tbls top_info) $ + mapFromList [ (l, mkEmptyContInfoTable (infoTblLbl l)) + | l <- setElems call_proc_points + , l /= g_entry g ] +attachContInfoTables _ other_decl + = other_decl + +---------------------------------------------------------------- + +{- +Note [Direct reachability] + +Block B is directly reachable from proc point P iff control can flow +from P to B without passing through an intervening proc point. +-} + +---------------------------------------------------------------- + +{- +Note [No simple dataflow] + +Sadly, it seems impossible to compute the proc points using a single +dataflow pass. One might attempt to use this simple lattice: + + data Location = Unknown + | InProc BlockId -- node is in procedure headed by the named proc point + | ProcPoint -- node is itself a proc point + +At a join, a node in two different blocks becomes a proc point. +The difficulty is that the change of information during iterative +computation may promote a node prematurely. Here's a program that +illustrates the difficulty: + + f () { + entry: + .... + L1: + if (...) { ... } + else { ... } + + L2: if (...) { g(); goto L1; } + return x + y; + } + +The only proc-point needed (besides the entry) is L1. But in an +iterative analysis, consider what happens to L2. On the first pass +through, it rises from Unknown to 'InProc entry', but when L1 is +promoted to a proc point (because it's the successor of g()), L1's +successors will be promoted to 'InProc L1'. The problem hits when the +new fact 'InProc L1' flows into L2 which is already bound to 'InProc entry'. +The join operation makes it a proc point when in fact it needn't be, +because its immediate dominator L1 is already a proc point and there +are no other proc points that directly reach L2. +-} + + + +{- Note [Separate Adams optimization] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +It may be worthwhile to attempt the Adams optimization by rewriting +the graph before the assignment of proc-point protocols. Here are a +couple of rules: + + g() returns to k; g() returns to L; + k: CopyIn c ress; goto L: + ... ==> ... + L: // no CopyIn node here L: CopyIn c ress; + + +And when c == c' and ress == ress', this also: + + g() returns to k; g() returns to L; + k: CopyIn c ress; goto L: + ... ==> ... + L: CopyIn c' ress' L: CopyIn c' ress' ; + +In both cases the goal is to eliminate k. 
+-} diff --git a/compiler/GHC/Cmm/Sink.hs b/compiler/GHC/Cmm/Sink.hs new file mode 100644 index 0000000000..8e231df300 --- /dev/null +++ b/compiler/GHC/Cmm/Sink.hs @@ -0,0 +1,854 @@ +{-# LANGUAGE GADTs #-} +module GHC.Cmm.Sink ( + cmmSink + ) where + +import GhcPrelude + +import GHC.Cmm +import GHC.Cmm.Opt +import GHC.Cmm.Liveness +import GHC.Cmm.Utils +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Collections +import GHC.Cmm.Dataflow.Graph +import GHC.Platform.Regs +import GHC.Platform (isARM, platformArch) + +import DynFlags +import Unique +import UniqFM + +import qualified Data.IntSet as IntSet +import Data.List (partition) +import qualified Data.Set as Set +import Data.Maybe + +-- Compact sets for membership tests of local variables. + +type LRegSet = IntSet.IntSet + +emptyLRegSet :: LRegSet +emptyLRegSet = IntSet.empty + +nullLRegSet :: LRegSet -> Bool +nullLRegSet = IntSet.null + +insertLRegSet :: LocalReg -> LRegSet -> LRegSet +insertLRegSet l = IntSet.insert (getKey (getUnique l)) + +elemLRegSet :: LocalReg -> LRegSet -> Bool +elemLRegSet l = IntSet.member (getKey (getUnique l)) + +-- ----------------------------------------------------------------------------- +-- Sinking and inlining + +-- This is an optimisation pass that +-- (a) moves assignments closer to their uses, to reduce register pressure +-- (b) pushes assignments into a single branch of a conditional if possible +-- (c) inlines assignments to registers that are mentioned only once +-- (d) discards dead assignments +-- +-- This tightens up lots of register-heavy code. It is particularly +-- helpful in the Cmm generated by the Stg->Cmm code generator, in +-- which every function starts with a copyIn sequence like: +-- +-- x1 = R1 +-- x2 = Sp[8] +-- x3 = Sp[16] +-- if (Sp - 32 < SpLim) then L1 else L2 +-- +-- we really want to push the x1..x3 assignments into the L2 branch. +-- +-- Algorithm: +-- +-- * Start by doing liveness analysis. +-- +-- * Keep a list of assignments A; earlier ones may refer to later ones. +-- Currently we only sink assignments to local registers, because we don't +-- have liveness information about global registers. +-- +-- * Walk forwards through the graph, look at each node N: +-- +-- * If it is a dead assignment, i.e. assignment to a register that is +-- not used after N, discard it. +-- +-- * Try to inline based on current list of assignments +-- * If any assignments in A (1) occur only once in N, and (2) are +-- not live after N, inline the assignment and remove it +-- from A. +-- +-- * If an assignment in A is cheap (RHS is local register), then +-- inline the assignment and keep it in A in case it is used afterwards. +-- +-- * Otherwise don't inline. +-- +-- * If N is assignment to a local register pick up the assignment +-- and add it to A. +-- +-- * If N is not an assignment to a local register: +-- * remove any assignments from A that conflict with N, and +-- place them before N in the current block. We call this +-- "dropping" the assignments. +-- +-- * An assignment conflicts with N if it: +-- - assigns to a register mentioned in N +-- - mentions a register assigned by N +-- - reads from memory written by N +-- * do this recursively, dropping dependent assignments +-- +-- * At an exit node: +-- * drop any assignments that are live on more than one successor +-- and are not trivial +-- * if any successor has more than one predecessor (a join-point), +-- drop everything live in that successor. 
Since we only propagate +-- assignments that are not dead at the successor, we will therefore +-- eliminate all assignments dead at this point. Thus analysis of a +-- join-point will always begin with an empty list of assignments. +-- +-- +-- As a result of above algorithm, sinking deletes some dead assignments +-- (transitively, even). This isn't as good as removeDeadAssignments, +-- but it's much cheaper. + +-- ----------------------------------------------------------------------------- +-- things that we aren't optimising very well yet. +-- +-- ----------- +-- (1) From GHC's FastString.hashStr: +-- +-- s2ay: +-- if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp; +-- c2gn: +-- R1 = _s2au::I64; +-- call (I64[Sp])(R1) args: 8, res: 0, upd: 8; +-- c2gp: +-- _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128, +-- 4091); +-- _s2an::I64 = _s2an::I64 + 1; +-- _s2au::I64 = _s2cO::I64; +-- goto s2ay; +-- +-- a nice loop, but we didn't eliminate the silly assignment at the end. +-- See Note [dependent assignments], which would probably fix this. +-- This is #8336. +-- +-- ----------- +-- (2) From stg_atomically_frame in PrimOps.cmm +-- +-- We have a diamond control flow: +-- +-- x = ... +-- | +-- / \ +-- A B +-- \ / +-- | +-- use of x +-- +-- Now x won't be sunk down to its use, because we won't push it into +-- both branches of the conditional. We certainly do have to check +-- that we can sink it past all the code in both A and B, but having +-- discovered that, we could sink it to its use. +-- + +-- ----------------------------------------------------------------------------- + +type Assignment = (LocalReg, CmmExpr, AbsMem) + -- Assignment caches AbsMem, an abstraction of the memory read by + -- the RHS of the assignment. + +type Assignments = [Assignment] + -- A sequence of assignments; kept in *reverse* order + -- So the list [ x=e1, y=e2 ] means the sequence of assignments + -- y = e2 + -- x = e1 + +cmmSink :: DynFlags -> CmmGraph -> CmmGraph +cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks + where + liveness = cmmLocalLiveness dflags graph + getLive l = mapFindWithDefault Set.empty l liveness + + blocks = revPostorder graph + + join_pts = findJoinPoints blocks + + sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock] + sink _ [] = [] + sink sunk (b:bs) = + -- pprTrace "sink" (ppr lbl) $ + blockJoin first final_middle final_last : sink sunk' bs + where + lbl = entryLabel b + (first, middle, last) = blockSplit b + + succs = successors last + + -- Annotate the middle nodes with the registers live *after* + -- the node. This will help us decide whether we can inline + -- an assignment in the current node or not. + live = Set.unions (map getLive succs) + live_middle = gen_kill dflags last live + ann_middles = annotate dflags live_middle (blockToList middle) + + -- Now sink and inline in this block + (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk) + fold_last = constantFoldNode dflags last + (final_last, assigs') = tryToInline dflags live fold_last assigs + + -- We cannot sink into join points (successors with more than + -- one predecessor), so identify the join points and the set + -- of registers live in them. + (joins, nonjoins) = partition (`mapMember` join_pts) succs + live_in_joins = Set.unions (map getLive joins) + + -- We do not want to sink an assignment into multiple branches, + -- so identify the set of registers live in multiple successors. 
+ -- This is made more complicated because when we sink an assignment + -- into one branch, this might change the set of registers that are + -- now live in multiple branches. + init_live_sets = map getLive nonjoins + live_in_multi live_sets r = + case filter (Set.member r) live_sets of + (_one:_two:_) -> True + _ -> False + + -- Now, drop any assignments that we will not sink any further. + (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs' + + drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets') + where + should_drop = conflicts dflags a final_last + || not (isTrivial dflags rhs) && live_in_multi live_sets r + || r `Set.member` live_in_joins + + live_sets' | should_drop = live_sets + | otherwise = map upd live_sets + + upd set | r `Set.member` set = set `Set.union` live_rhs + | otherwise = set + + live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs + + final_middle = foldl' blockSnoc middle' dropped_last + + sunk' = mapUnion sunk $ + mapFromList [ (l, filterAssignments dflags (getLive l) assigs'') + | l <- succs ] + +{- TODO: enable this later, when we have some good tests in place to + measure the effect and tune it. + +-- small: an expression we don't mind duplicating +isSmall :: CmmExpr -> Bool +isSmall (CmmReg (CmmLocal _)) = True -- +isSmall (CmmLit _) = True +isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y +isSmall (CmmRegOff (CmmLocal _) _) = True +isSmall _ = False +-} + +-- +-- We allow duplication of trivial expressions: registers (both local and +-- global) and literals. +-- +isTrivial :: DynFlags -> CmmExpr -> Bool +isTrivial _ (CmmReg (CmmLocal _)) = True +isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?] + if isARM (platformArch (targetPlatform dflags)) + then True -- CodeGen.Platform.ARM does not have globalRegMaybe + else isJust (globalRegMaybe (targetPlatform dflags) r) + -- GlobalRegs that are loads from BaseReg are not trivial +isTrivial _ (CmmLit _) = True +isTrivial _ _ = False + +-- +-- annotate each node with the set of registers live *after* the node +-- +annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)] +annotate dflags live nodes = snd $ foldr ann (live,[]) nodes + where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes) + +-- +-- Find the blocks that have multiple successors (join points) +-- +findJoinPoints :: [CmmBlock] -> LabelMap Int +findJoinPoints blocks = mapFilter (>1) succ_counts + where + all_succs = concatMap successors blocks + + succ_counts :: LabelMap Int + succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs + +-- +-- filter the list of assignments to remove any assignments that +-- are not live in a continuation. +-- +filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments +filterAssignments dflags live assigs = reverse (go assigs []) + where go [] kept = kept + go (a@(r,_,_):as) kept | needed = go as (a:kept) + | otherwise = go as kept + where + needed = r `Set.member` live + || any (conflicts dflags a) (map toNode kept) + -- Note that we must keep assignments that are + -- referred to by other assignments we have + -- already kept. + +-- ----------------------------------------------------------------------------- +-- Walk through the nodes of a block, sinking and inlining assignments +-- as we go. +-- +-- On input we pass in a: +-- * list of nodes in the block +-- * a list of assignments that appeared *before* this block and +-- that are being sunk. 
+-- +-- On output we get: +-- * a new block +-- * a list of assignments that will be placed *after* that block. +-- + +walk :: DynFlags + -> [(LocalRegSet, CmmNode O O)] -- nodes of the block, annotated with + -- the set of registers live *after* + -- this node. + + -> Assignments -- The current list of + -- assignments we are sinking. + -- Earlier assignments may refer + -- to later ones. + + -> ( Block CmmNode O O -- The new block + , Assignments -- Assignments to sink further + ) + +walk dflags nodes assigs = go nodes emptyBlock assigs + where + go [] block as = (block, as) + go ((live,node):ns) block as + | shouldDiscard node live = go ns block as + -- discard dead assignment + | Just a <- shouldSink dflags node2 = go ns block (a : as1) + | otherwise = go ns block' as' + where + node1 = constantFoldNode dflags node + + (node2, as1) = tryToInline dflags live node1 as + + (dropped, as') = dropAssignmentsSimple dflags + (\a -> conflicts dflags a node2) as1 + + block' = foldl' blockSnoc block dropped `blockSnoc` node2 + + +-- +-- Heuristic to decide whether to pick up and sink an assignment +-- Currently we pick up all assignments to local registers. It might +-- be profitable to sink assignments to global regs too, but the +-- liveness analysis doesn't track those (yet) so we can't. +-- +shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment +shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e) + where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e +shouldSink _ _other = Nothing + +-- +-- discard dead assignments. This doesn't do as good a job as +-- removeDeadAssignments, because it would need multiple passes +-- to get all the dead code, but it catches the common case of +-- superfluous reloads from the stack that the stack allocator +-- leaves behind. +-- +-- Also we catch "r = r" here. You might think it would fall +-- out of inlining, but the inliner will see that r is live +-- after the instruction and choose not to inline r in the rhs. +-- +shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool +shouldDiscard node live + = case node of + CmmAssign r (CmmReg r') | r == r' -> True + CmmAssign (CmmLocal r) _ -> not (r `Set.member` live) + _otherwise -> False + + +toNode :: Assignment -> CmmNode O O +toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs + +dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments + -> ([CmmNode O O], Assignments) +dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) () + +dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments + -> ([CmmNode O O], Assignments) +dropAssignments dflags should_drop state assigs + = (dropped, reverse kept) + where + (dropped,kept) = go state assigs [] [] + + go _ [] dropped kept = (dropped, kept) + go state (assig : rest) dropped kept + | conflict = go state' rest (toNode assig : dropped) kept + | otherwise = go state' rest dropped (assig:kept) + where + (dropit, state') = should_drop assig state + conflict = dropit || any (conflicts dflags assig) dropped + + +-- ----------------------------------------------------------------------------- +-- Try to inline assignments into a node. +-- This also does constant folding for primpops, since +-- inlining opens up opportunities for doing so. + +tryToInline + :: DynFlags + -> LocalRegSet -- set of registers live after this + -- node. We cannot inline anything + -- that is live after the node, unless + -- it is small enough to duplicate. 
+   -> CmmNode O x       -- The node to inline into
+   -> Assignments       -- Assignments to inline
+   -> (
+        CmmNode O x     -- New node
+      , Assignments     -- Remaining assignments
+      )
+
+tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
+ where
+  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
+  usages = foldLocalRegsUsed dflags addUsage emptyUFM node
+
+  go _usages node _skipped [] = (node, [])
+
+  go usages node skipped (a@(l,rhs,_) : rest)
+   | cannot_inline           = dont_inline
+   | occurs_none             = discard  -- Note [discard during inlining]
+   | occurs_once             = inline_and_discard
+   | isTrivial dflags rhs    = inline_and_keep
+   | otherwise               = dont_inline
+   where
+        inline_and_discard = go usages' inl_node skipped rest
+          where usages' = foldLocalRegsUsed dflags addUsage usages rhs
+
+        discard = go usages node skipped rest
+
+        dont_inline        = keep node     -- don't inline the assignment, keep it
+        inline_and_keep    = keep inl_node -- inline the assignment, keep it
+
+        keep node' = (final_node, a : rest')
+          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
+                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
+                                            usages rhs
+                -- we must not inline anything that is mentioned in the RHS
+                -- of a binding that we have already skipped, so we set the
+                -- usages of the regs on the RHS to 2.
+
+        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
+                        || l `elemLRegSet` skipped
+                        || not (okToInline dflags rhs node)
+
+        l_usages = lookupUFM usages l
+        l_live   = l `elemRegSet` live
+
+        occurs_once = not l_live && l_usages == Just 1
+        occurs_none = not l_live && l_usages == Nothing
+
+        inl_node = improveConditional (mapExpDeep inl_exp node)
+
+        inl_exp :: CmmExpr -> CmmExpr
+        -- inl_exp is where the inlining actually takes place!
+        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
+        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
+                    = cmmOffset dflags rhs off
+                    -- re-constant fold after inlining
+        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
+        inl_exp other = other
+
+
+{- Note [improveConditional]
+
+cmmMachOpFold tries to simplify conditionals to turn things like
+  (a == b) != 1
+into
+  (a != b)
+but there's one case it can't handle: when the comparison is over
+floating-point values, we can't invert it, because floating-point
+comparisons aren't invertible (because of NaNs).
+
+But we *can* optimise this conditional by swapping the true and false
+branches. Given
+  CmmCondBranch ((a >## b) != 1) t f
+we can turn it into
+  CmmCondBranch (a >## b) f t
+
+So here we catch conditionals that weren't optimised by cmmMachOpFold,
+and apply the above transformation to eliminate the comparison against 1.
+
+It's tempting to just turn every != into == and then let cmmMachOpFold
+do its thing, but that risks changing a nice fall-through conditional
+into one that requires two jumps (see swapcond_last in
+GHC.Cmm.ContFlowOpt), so instead we carefully look for just the cases where
+we can eliminate a comparison.
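+
+A clarifying aside: the result of a Cmm comparison is always 0 or 1, so
+"x != 1", "x <u 1" and "x <s 1" all amount to testing "x == 0". That is
+why neLike below accepts MO_U_Lt and MO_S_Lt in addition to MO_Ne.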
+-} +improveConditional :: CmmNode O x -> CmmNode O x +improveConditional + (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l) + | neLike mop, isComparisonExpr x + = CmmCondBranch x f t (fmap not l) + where + neLike (MO_Ne _) = True + neLike (MO_U_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1 + neLike (MO_S_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1 + neLike _ = False +improveConditional other = other + +-- Note [dependent assignments] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- If our assignment list looks like +-- +-- [ y = e, x = ... y ... ] +-- +-- We cannot inline x. Remember this list is really in reverse order, +-- so it means x = ... y ...; y = e +-- +-- Hence if we inline x, the outer assignment to y will capture the +-- reference in x's right hand side. +-- +-- In this case we should rename the y in x's right-hand side, +-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ] +-- Now we can go ahead and inline x. +-- +-- For now we do nothing, because this would require putting +-- everything inside UniqSM. +-- +-- One more variant of this (#7366): +-- +-- [ y = e, y = z ] +-- +-- If we don't want to inline y = e, because y is used many times, we +-- might still be tempted to inline y = z (because we always inline +-- trivial rhs's). But of course we can't, because y is equal to e, +-- not z. + +-- Note [discard during inlining] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- Opportunities to discard assignments sometimes appear after we've +-- done some inlining. Here's an example: +-- +-- x = R1; +-- y = P64[x + 7]; +-- z = P64[x + 15]; +-- /* z is dead */ +-- R1 = y & (-8); +-- +-- The x assignment is trivial, so we inline it in the RHS of y, and +-- keep both x and y. z gets dropped because it is dead, then we +-- inline y, and we have a dead assignment to x. If we don't notice +-- that x is dead in tryToInline, we end up retaining it. + +addUsage :: UniqFM Int -> LocalReg -> UniqFM Int +addUsage m r = addToUFM_C (+) m r 1 + +regsUsedIn :: LRegSet -> CmmExpr -> Bool +regsUsedIn ls _ | nullLRegSet ls = False +regsUsedIn ls e = wrapRecExpf f e False + where f (CmmReg (CmmLocal l)) _ | l `elemLRegSet` ls = True + f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True + f _ z = z + +-- we don't inline into CmmUnsafeForeignCall if the expression refers +-- to global registers. This is a HACK to avoid global registers +-- clashing with C argument-passing registers, really the back-end +-- ought to be able to handle it properly, but currently neither PprC +-- nor the NCG can do it. See Note [Register parameter passing] +-- See also GHC.StgToCmm.Foreign.load_args_into_temps. +okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool +okToInline dflags expr node@(CmmUnsafeForeignCall{}) = + not (globalRegistersConflict dflags expr node) +okToInline _ _ _ = True + +-- ----------------------------------------------------------------------------- + +-- | @conflicts (r,e) node@ is @False@ if and only if the assignment +-- @r = e@ can be safely commuted past statement @node@. +conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool +conflicts dflags (r, rhs, addr) node + + -- (1) node defines registers used by rhs of assignment. This catches + -- assignments and all three kinds of calls. 
See Note [Sinking and calls] + | globalRegistersConflict dflags rhs node = True + | localRegistersConflict dflags rhs node = True + + -- (2) node uses register defined by assignment + | foldRegsUsed dflags (\b r' -> r == r' || b) False node = True + + -- (3) a store to an address conflicts with a read of the same memory + | CmmStore addr' e <- node + , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True + + -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively + | HeapMem <- addr, CmmAssign (CmmGlobal Hp) _ <- node = True + | StackMem <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True + | SpMem{} <- addr, CmmAssign (CmmGlobal Sp) _ <- node = True + + -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap] + | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem = True + + -- (6) native calls clobber any memory + | CmmCall{} <- node, memConflicts addr AnyMem = True + + -- (7) otherwise, no conflict + | otherwise = False + +-- Returns True if node defines any global registers that are used in the +-- Cmm expression +globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool +globalRegistersConflict dflags expr node = + foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr) + False node + +-- Returns True if node defines any local registers that are used in the +-- Cmm expression +localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool +localRegistersConflict dflags expr node = + foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr) + False node + +-- Note [Sinking and calls] +-- ~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall) +-- and unsafe foreign (CmmUnsafeForeignCall). We perform sinking pass after +-- stack layout (see Note [Sinking after stack layout]) which leads to two +-- invariants related to calls: +-- +-- a) during stack layout phase all safe foreign calls are turned into +-- unsafe foreign calls (see Note [Lower safe foreign calls]). This +-- means that we will never encounter CmmForeignCall node when running +-- sinking after stack layout +-- +-- b) stack layout saves all variables live across a call on the stack +-- just before making a call (remember we are not sinking assignments to +-- stack): +-- +-- L1: +-- x = R1 +-- P64[Sp - 16] = L2 +-- P64[Sp - 8] = x +-- Sp = Sp - 16 +-- call f() returns L2 +-- L2: +-- +-- We will attempt to sink { x = R1 } but we will detect conflict with +-- { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even +-- checking whether it conflicts with { call f() }. In this way we will +-- never need to check any assignment conflicts with CmmCall. Remember +-- that we still need to check for potential memory conflicts. +-- +-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes +-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]). +-- This assumption holds only when we do sinking after stack layout. If we run +-- it before stack layout we need to check for possible conflicts with all three +-- kinds of calls. Our `conflicts` function does that by using a generic +-- foldRegsDefd and foldRegsUsed functions defined in DefinerOfRegs and +-- UserOfRegs typeclasses. +-- + +-- An abstraction of memory read or written. 
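+-- For example, an 8-byte load from Sp+8 is abstracted as SpMem 8 8,
+-- while a load from a statically unknown stack address is StackMem.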
+data AbsMem + = NoMem -- no memory accessed + | AnyMem -- arbitrary memory + | HeapMem -- definitely heap memory + | StackMem -- definitely stack memory + | SpMem -- <size>[Sp+n] + {-# UNPACK #-} !Int + {-# UNPACK #-} !Int + +-- Having SpMem is important because it lets us float loads from Sp +-- past stores to Sp as long as they don't overlap, and this helps to +-- unravel some long sequences of +-- x1 = [Sp + 8] +-- x2 = [Sp + 16] +-- ... +-- [Sp + 8] = xi +-- [Sp + 16] = xj +-- +-- Note that SpMem is invalidated if Sp is changed, but the definition +-- of 'conflicts' above handles that. + +-- ToDo: this won't currently fix the following commonly occurring code: +-- x1 = [R1 + 8] +-- x2 = [R1 + 16] +-- .. +-- [Hp - 8] = x1 +-- [Hp - 16] = x2 +-- .. + +-- because [R1 + 8] and [Hp - 8] are both HeapMem. We know that +-- assignments to [Hp + n] do not conflict with any other heap memory, +-- but this is tricky to nail down. What if we had +-- +-- x = Hp + n +-- [x] = ... +-- +-- the store to [x] should be "new heap", not "old heap". +-- Furthermore, you could imagine that if we started inlining +-- functions in Cmm then there might well be reads of heap memory +-- that was written in the same basic block. To take advantage of +-- non-aliasing of heap memory we will have to be more clever. + +-- Note [Foreign calls clobber heap] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- It is tempting to say that foreign calls clobber only +-- non-heap/stack memory, but unfortunately we break this invariant in +-- the RTS. For example, in stg_catch_retry_frame we call +-- stmCommitNestedTransaction() which modifies the contents of the +-- TRec it is passed (this actually caused incorrect code to be +-- generated). +-- +-- Since the invariant is true for the majority of foreign calls, +-- perhaps we ought to have a special annotation for calls that can +-- modify heap/stack memory. For now we just use the conservative +-- definition here. +-- +-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and +-- therefore we should never float any memory operations across one of +-- these calls. 
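+
+-- As an illustrative sketch of the definitions below (not an exhaustive
+-- specification):
+--
+--   memConflicts (SpMem 8 8) (SpMem 16 8) == False  -- disjoint stack slots
+--   memConflicts (SpMem 8 8) (SpMem 12 8) == True   -- overlapping stack slots
+--   memConflicts (SpMem 8 8) StackMem     == True   -- unknown stack address
+--   memConflicts HeapMem     StackMem     == False  -- heap and stack are disjoint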
+ + +bothMems :: AbsMem -> AbsMem -> AbsMem +bothMems NoMem x = x +bothMems x NoMem = x +bothMems HeapMem HeapMem = HeapMem +bothMems StackMem StackMem = StackMem +bothMems (SpMem o1 w1) (SpMem o2 w2) + | o1 == o2 = SpMem o1 (max w1 w2) + | otherwise = StackMem +bothMems SpMem{} StackMem = StackMem +bothMems StackMem SpMem{} = StackMem +bothMems _ _ = AnyMem + +memConflicts :: AbsMem -> AbsMem -> Bool +memConflicts NoMem _ = False +memConflicts _ NoMem = False +memConflicts HeapMem StackMem = False +memConflicts StackMem HeapMem = False +memConflicts SpMem{} HeapMem = False +memConflicts HeapMem SpMem{} = False +memConflicts (SpMem o1 w1) (SpMem o2 w2) + | o1 < o2 = o1 + w1 > o2 + | otherwise = o2 + w2 > o1 +memConflicts _ _ = True + +exprMem :: DynFlags -> CmmExpr -> AbsMem +exprMem dflags (CmmLoad addr w) = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr) +exprMem dflags (CmmMachOp _ es) = foldr bothMems NoMem (map (exprMem dflags) es) +exprMem _ _ = NoMem + +loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem +loadAddr dflags e w = + case e of + CmmReg r -> regAddr dflags r 0 w + CmmRegOff r i -> regAddr dflags r i w + _other | regUsedIn dflags spReg e -> StackMem + | otherwise -> AnyMem + +regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem +regAddr _ (CmmGlobal Sp) i w = SpMem i (widthInBytes w) +regAddr _ (CmmGlobal Hp) _ _ = HeapMem +regAddr _ (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps +regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself +regAddr _ _ _ _ = AnyMem + +{- +Note [Inline GlobalRegs?] + +Should we freely inline GlobalRegs? + +Actually it doesn't make a huge amount of difference either way, so we +*do* currently treat GlobalRegs as "trivial" and inline them +everywhere, but for what it's worth, here is what I discovered when I +(SimonM) looked into this: + +Common sense says we should not inline GlobalRegs, because when we +have + + x = R1 + +the register allocator will coalesce this assignment, generating no +code, and simply record the fact that x is bound to $rbx (or +whatever). Furthermore, if we were to sink this assignment, then the +range of code over which R1 is live increases, and the range of code +over which x is live decreases. All things being equal, it is better +for x to be live than R1, because R1 is a fixed register whereas x can +live in any register. So we should neither sink nor inline 'x = R1'. + +However, not inlining GlobalRegs can have surprising +consequences. e.g. (cgrun020) + + c3EN: + _s3DB::P64 = R1; + _c3ES::P64 = _s3DB::P64 & 7; + if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV; + c3EU: + _s3DD::P64 = P64[_s3DB::P64 + 6]; + _s3DE::P64 = P64[_s3DB::P64 + 14]; + I64[Sp - 8] = c3F0; + R1 = _s3DE::P64; + P64[Sp] = _s3DD::P64; + +inlining the GlobalReg gives: + + c3EN: + if (R1 & 7 >= 2) goto c3EU; else goto c3EV; + c3EU: + I64[Sp - 8] = c3F0; + _s3DD::P64 = P64[R1 + 6]; + R1 = P64[R1 + 14]; + P64[Sp] = _s3DD::P64; + +but if we don't inline the GlobalReg, instead we get: + + _s3DB::P64 = R1; + if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV; + c3EU: + I64[Sp - 8] = c3F0; + R1 = P64[_s3DB::P64 + 14]; + P64[Sp] = P64[_s3DB::P64 + 6]; + +This looks better - we managed to inline _s3DD - but in fact it +generates an extra reg-reg move: + +.Lc3EU: + movq $c3F0_info,-8(%rbp) + movq %rbx,%rax + movq 14(%rbx),%rbx + movq 6(%rax),%rax + movq %rax,(%rbp) + +because _s3DB is now live across the R1 assignment, we lost the +benefit of coalescing. + +Who is at fault here? 
Perhaps if we knew that _s3DB was an alias for
+R1, then we would not sink a reference to _s3DB past the R1
+assignment. Or perhaps we *should* do that - we might gain by sinking
+it, despite losing the coalescing opportunity.
+
+Sometimes not inlining global registers wins by virtue of the rule
+about not inlining into arguments of a foreign call, e.g. (T7163) this
+is what happens when we inline F1:
+
+  _s3L2::F32 = F1;
+  _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
+  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);
+
+but if we don't inline F1:
+
+  (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
+                                                                                        10.0 :: W32));
+-}
diff --git a/compiler/GHC/Cmm/Switch.hs b/compiler/GHC/Cmm/Switch.hs
new file mode 100644
index 0000000000..e89fadfd2e
--- /dev/null
+++ b/compiler/GHC/Cmm/Switch.hs
@@ -0,0 +1,502 @@
+{-# LANGUAGE GADTs #-}
+module GHC.Cmm.Switch (
+     SwitchTargets,
+     mkSwitchTargets,
+     switchTargetsCases, switchTargetsDefault, switchTargetsRange, switchTargetsSigned,
+     mapSwitchTargets, switchTargetsToTable, switchTargetsFallThrough,
+     switchTargetsToList, eqSwitchTargetWith,
+
+     SwitchPlan(..),
+     targetSupportsSwitch,
+     createSwitchPlan,
+  ) where
+
+import GhcPrelude
+
+import Outputable
+import DynFlags
+import GHC.Cmm.Dataflow.Label (Label)
+
+import Data.Maybe
+import Data.List (groupBy)
+import Data.Function (on)
+import qualified Data.Map as M
+
+-- Note [Cmm Switches, the general plan]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Compiling a high-level switch statement, as it comes out of an STG case
+-- expression, for example, allows for a surprising number of design decisions.
+-- Therefore, we cleanly separated this from the Stg → Cmm transformation, as
+-- well as from the actual code generation.
+--
+-- The overall plan is:
+--  * The Stg → Cmm transformation creates a single `SwitchTargets` value in
+--    emitSwitch and emitCmmLitSwitch in GHC.StgToCmm.Utils.
+--    At this stage, it is unsuitable for code generation.
+--  * A dedicated Cmm transformation (GHC.Cmm.Switch.Implement) replaces these
+--    switch statements with code that is suitable for code generation, i.e.
+--    a nice balanced tree of decisions with dense jump tables in the leaves.
+--    The actual planning of this tree is performed in pure code in
+--    createSwitchPlan in this module. See Note [createSwitchPlan].
+--  * The actual code generation will not do any further processing and
+--    implements each CmmSwitch with a jump table.
+--
+-- When compiling to LLVM or C, GHC.Cmm.Switch.Implement leaves the switch
+-- statements alone, as we can turn a SwitchTargets value into a nice
+-- switch statement in LLVM or C, respectively, and leave the rest to the
+-- compiler.
+--
+-- See Note [GHC.Cmm.Switch vs. GHC.Cmm.Switch.Implement] for why the two
+-- modules are separated.
+
+-----------------------------------------------------------------------------
+-- Note [Magic Constants in GHC.Cmm.Switch]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- There are a lot of heuristics here that depend on magic values where it is
+-- hard to determine the "best" value (for whatever that means). These are the
+-- magic values:
+
+-- | Number of consecutive default values allowed in a jump table. If there are
+-- more of them, the jump tables are split.
+--
+-- Currently 7, as it costs 7 words of additional code when a jump table is
+-- split (at least on x64, determined experimentally).
+maxJumpTableHole :: Integer
+maxJumpTableHole = 7
+
+-- | Minimum size of a jump table. If the number of entries is smaller, the
+-- switch is implemented using conditionals.
+-- Currently 5, because an if-then-else tree of 4 values is nice and compact.
+minJumpTableSize :: Int
+minJumpTableSize = 5
+
+-- | Minimum non-zero offset for a jump table. See Note [Jump Table Offset].
+minJumpTableOffset :: Integer
+minJumpTableOffset = 2
+
+
+-----------------------------------------------------------------------------
+-- Switch Targets
+
+-- Note [SwitchTargets]
+-- ~~~~~~~~~~~~~~~~~~~~
+--
+-- The branches of a switch are stored in a SwitchTargets, which consists of an
+-- (optional) default jump target, and a map from values to jump targets.
+--
+-- If the default jump target is absent, the behaviour of the switch outside the
+-- values of the map is undefined.
+--
+-- We use an Integer for the keys of the map so that it can be used in switches
+-- on unsigned as well as signed integers.
+--
+-- The map may be empty (we prune out-of-range branches here, so the pruning
+-- itself may leave it empty).
+--
+-- Before code generation, the table needs to be brought into a form where all
+-- entries are non-negative, so that it can be compiled into a jump table.
+-- See switchTargetsToTable.
+
+
+-- | A value of type SwitchTargets contains the alternatives for a 'CmmSwitch'
+-- value, and knows whether the value is signed, the possible range, an
+-- optional default value and a map from values to jump labels.
+data SwitchTargets =
+    SwitchTargets
+        Bool                       -- Signed values
+        (Integer, Integer)         -- Range
+        (Maybe Label)              -- Default value
+        (M.Map Integer Label)      -- The branches
+    deriving (Show, Eq)
+
+-- | The smart constructor mkSwitchTargets normalises the map a bit:
+--  * No entries outside the range
+--  * No entries equal to the default
+--  * No default if all elements have explicit values
+mkSwitchTargets :: Bool -> (Integer, Integer) -> Maybe Label -> M.Map Integer Label -> SwitchTargets
+mkSwitchTargets signed range@(lo,hi) mbdef ids
+    = SwitchTargets signed range mbdef' ids'
+  where
+    ids' = dropDefault $ restrict ids
+    mbdef' | defaultNeeded = mbdef
+           | otherwise     = Nothing
+
+    -- Drop entries outside the range, if there is a range
+    restrict = restrictMap (lo,hi)
+
+    -- Drop entries that equal the default, if there is a default
+    dropDefault | Just l <- mbdef = M.filter (/= l)
+                | otherwise       = id
+
+    -- Check if the default is still needed
+    defaultNeeded = fromIntegral (M.size ids') /= hi-lo+1
+
+
+-- | Changes all labels mentioned in the SwitchTargets value
+mapSwitchTargets :: (Label -> Label) -> SwitchTargets -> SwitchTargets
+mapSwitchTargets f (SwitchTargets signed range mbdef branches)
+    = SwitchTargets signed range (fmap f mbdef) (fmap f branches)
+
+-- | Returns the list of non-default branches of the SwitchTargets value
+switchTargetsCases :: SwitchTargets -> [(Integer, Label)]
+switchTargetsCases (SwitchTargets _ _ _ branches) = M.toList branches
+
+-- | Return the default label of the SwitchTargets value
+switchTargetsDefault :: SwitchTargets -> Maybe Label
+switchTargetsDefault (SwitchTargets _ _ mbdef _) = mbdef
+
+-- | Return the range of the SwitchTargets value
+switchTargetsRange :: SwitchTargets -> (Integer, Integer)
+switchTargetsRange (SwitchTargets _ range _ _) = range
+
+-- | Return whether this is used for a signed value
+switchTargetsSigned :: SwitchTargets -> Bool
+switchTargetsSigned (SwitchTargets signed _ _ _) = signed
+
+-- | switchTargetsToTable creates a dense jump table, usable for code
generation. +-- +-- Also returns an offset to add to the value; the list is 0-based on the +-- result of that addition. +-- +-- The conversion from Integer to Int is a bit of a wart, as the actual +-- scrutinee might be an unsigned word, but it just works, due to wrap-around +-- arithmetic (as verified by the CmmSwitchTest test case). +switchTargetsToTable :: SwitchTargets -> (Int, [Maybe Label]) +switchTargetsToTable (SwitchTargets _ (lo,hi) mbdef branches) + = (fromIntegral (-start), [ labelFor i | i <- [start..hi] ]) + where + labelFor i = case M.lookup i branches of Just l -> Just l + Nothing -> mbdef + start | lo >= 0 && lo < minJumpTableOffset = 0 -- See Note [Jump Table Offset] + | otherwise = lo + +-- Note [Jump Table Offset] +-- ~~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- Usually, the code for a jump table starting at x will first subtract x from +-- the value, to avoid a large amount of empty entries. But if x is very small, +-- the extra entries are no worse than the subtraction in terms of code size, and +-- not having to do the subtraction is quicker. +-- +-- I.e. instead of +-- _u20N: +-- leaq -1(%r14),%rax +-- jmp *_n20R(,%rax,8) +-- _n20R: +-- .quad _c20p +-- .quad _c20q +-- do +-- _u20N: +-- jmp *_n20Q(,%r14,8) +-- +-- _n20Q: +-- .quad 0 +-- .quad _c20p +-- .quad _c20q +-- .quad _c20r + +-- | The list of all labels occurring in the SwitchTargets value. +switchTargetsToList :: SwitchTargets -> [Label] +switchTargetsToList (SwitchTargets _ _ mbdef branches) + = maybeToList mbdef ++ M.elems branches + +-- | Groups cases with equal targets, suitable for pretty-printing to a +-- c-like switch statement with fall-through semantics. +switchTargetsFallThrough :: SwitchTargets -> ([([Integer], Label)], Maybe Label) +switchTargetsFallThrough (SwitchTargets _ _ mbdef branches) = (groups, mbdef) + where + groups = map (\xs -> (map fst xs, snd (head xs))) $ + groupBy ((==) `on` snd) $ + M.toList branches + +-- | Custom equality helper, needed for "GHC.Cmm.CommonBlockElim" +eqSwitchTargetWith :: (Label -> Label -> Bool) -> SwitchTargets -> SwitchTargets -> Bool +eqSwitchTargetWith eq (SwitchTargets signed1 range1 mbdef1 ids1) (SwitchTargets signed2 range2 mbdef2 ids2) = + signed1 == signed2 && range1 == range2 && goMB mbdef1 mbdef2 && goList (M.toList ids1) (M.toList ids2) + where + goMB Nothing Nothing = True + goMB (Just l1) (Just l2) = l1 `eq` l2 + goMB _ _ = False + goList [] [] = True + goList ((i1,l1):ls1) ((i2,l2):ls2) = i1 == i2 && l1 `eq` l2 && goList ls1 ls2 + goList _ _ = False + +----------------------------------------------------------------------------- +-- Code generation for Switches + + +-- | A SwitchPlan abstractly describes how a Switch statement ought to be +-- implemented. See Note [createSwitchPlan] +data SwitchPlan + = Unconditionally Label + | IfEqual Integer Label SwitchPlan + | IfLT Bool Integer SwitchPlan SwitchPlan + | JumpTable SwitchTargets + deriving Show +-- +-- Note [createSwitchPlan] +-- ~~~~~~~~~~~~~~~~~~~~~~~ +-- +-- A SwitchPlan describes how a Switch statement is to be broken down into +-- smaller pieces suitable for code generation. +-- +-- createSwitchPlan creates such a switch plan, in these steps: +-- 1. It splits the switch statement at segments of non-default values that +-- are too large. See splitAtHoles and Note [Magic Constants in GHC.Cmm.Switch] +-- 2. Too small jump tables should be avoided, so we break up smaller pieces +-- in breakTooSmall. +-- 3. 
We fill in the segments between those pieces with a jump to the default
+--    label (if there is one), returning a SeparatedList in mkFlatSwitchPlan
+-- 4. We find and replace two less-than branches by a single equal-to test in
+--    findSingleValues
+-- 5. The pieces thus collected are assembled into a balanced binary tree.
+
+{-
+  Note [Two alts + default]
+  ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Discussion and a bit more info at #14644
+
+When dealing with a switch of the form:
+switch(e) {
+  case 1: goto l1;
+  case 3000: goto l2;
+  default: goto ldef;
+}
+
+If we treat it as a sparse jump table we would generate:
+
+if (e > 3000) //Check if value is outside of the jump table.
+    goto ldef;
+else {
+    if (e < 3000) { //Compare to upper value
+        if(e != 1) //Compare to remaining value
+            goto ldef;
+          else
+            goto l1;
+    }
+    else
+        goto l2;
+}
+
+Instead we special-case this to:
+
+if (e==1) goto l1;
+else if (e==3000) goto l2;
+else goto ldef;
+
+This means we have:
+* Fewer comparisons for 1 and <3000
+* Unchanged for 3000
+* One more for >3000
+
+This improves code in a few ways:
+* One comparison fewer means smaller code, which helps with cache.
+* It exchanges a taken jump for two jumps not taken in the >3000 case.
+  Jumps not taken are cheaper (see Agner's guides), making this about as fast.
+* For all other cases the first range check is removed, making it faster.
+
+The end result is that the change is not measurably slower for the case
+>3000 and faster for the other cases.
+
+This makes running this kind of match in an inner loop cheaper by 10-20%
+depending on the data.
+In nofib this improves wheel-sieve1 by 4-9% depending on problem
+size.
+
+We could also add a second conditional jump after the comparison to
+keep the range check like this:
+    cmp 3000, rArgument
+    jg <default>
+    je <branch 2>
+While this is fairly cheap, it made no big difference for the >3000 case
+and slowed down all other cases, making it not worthwhile.
+-}
+
+
+-- | Does the target support switch out of the box? Then leave this to the
+-- target!
+targetSupportsSwitch :: HscTarget -> Bool
+targetSupportsSwitch HscC = True
+targetSupportsSwitch HscLlvm = True
+targetSupportsSwitch _ = False
+
+-- | This function creates a SwitchPlan from a SwitchTargets value, breaking it
+-- down into smaller pieces suitable for code generation.
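+--
+-- For instance (an illustrative sketch, not compiler output): a dense
+-- switch over the six values 0..5 with no default becomes a single
+-- 'JumpTable' plan, whereas {1 -> l1, 3000 -> l2} with default ldef
+-- becomes IfEqual 1 l1 (IfEqual 3000 l2 (Unconditionally ldef)); see
+-- Note [Two alts + default].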
+createSwitchPlan :: SwitchTargets -> SwitchPlan
+-- Let's do the common case of a singleton map quickly and efficiently (#10677)
+createSwitchPlan (SwitchTargets _signed _range (Just defLabel) m)
+    | [(x, l)] <- M.toList m
+    = IfEqual x l (Unconditionally defLabel)
+-- And another common case, matching "booleans"
+createSwitchPlan (SwitchTargets _signed (lo,hi) Nothing m)
+    | [(x1, l1), (_x2,l2)] <- M.toAscList m
+    -- Checking that |range| = 2 is enough if we have two unique literals
+    , hi - lo == 1
+    = IfEqual x1 l1 (Unconditionally l2)
+-- See Note [Two alts + default]
+createSwitchPlan (SwitchTargets _signed _range (Just defLabel) m)
+    | [(x1, l1), (x2,l2)] <- M.toAscList m
+    = IfEqual x1 l1 (IfEqual x2 l2 (Unconditionally defLabel))
+createSwitchPlan (SwitchTargets signed range mbdef m) =
+    -- pprTrace "createSwitchPlan" (text (show ids) $$ text (show (range,m)) $$ text (show pieces) $$ text (show flatPlan) $$ text (show plan)) $
+    plan
+  where
+    pieces = concatMap breakTooSmall $ splitAtHoles maxJumpTableHole m
+    flatPlan = findSingleValues $ mkFlatSwitchPlan signed mbdef range pieces
+    plan = buildTree signed $ flatPlan
+
+
+---
+--- Step 1: Splitting at large holes
+---
+splitAtHoles :: Integer -> M.Map Integer a -> [M.Map Integer a]
+splitAtHoles _ m | M.null m = []
+splitAtHoles holeSize m = map (\range -> restrictMap range m) nonHoles
+  where
+    holes = filter (\(l,h) -> h - l > holeSize) $ zip (M.keys m) (tail (M.keys m))
+    nonHoles = reassocTuples lo holes hi
+
+    (lo,_) = M.findMin m
+    (hi,_) = M.findMax m
+
+---
+--- Step 2: Avoid small jump tables
+---
+-- We do not want jump tables below a certain size. This breaks them up
+-- (into singleton maps, for now).
+breakTooSmall :: M.Map Integer a -> [M.Map Integer a]
+breakTooSmall m
+  | M.size m > minJumpTableSize = [m]
+  | otherwise                   = [M.singleton k v | (k,v) <- M.toList m]
+
+---
+--- Step 3: Fill in the blanks
+---
+
+-- | A FlatSwitchPlan is a list of SwitchPlans, with an integer in between every
+-- two entries, dividing the range.
+-- So if we have (abusing list syntax) [plan1,n,plan2], then we use plan1 if
+-- the expression is < n, and plan2 otherwise.
+
+type FlatSwitchPlan = SeparatedList Integer SwitchPlan
+
+mkFlatSwitchPlan :: Bool -> Maybe Label -> (Integer, Integer) -> [M.Map Integer Label] -> FlatSwitchPlan
+
+-- If we have no default (i.e.
undefined where there is no entry), we can +-- branch at the minimum of each map +mkFlatSwitchPlan _ Nothing _ [] = pprPanic "mkFlatSwitchPlan with nothing left to do" empty +mkFlatSwitchPlan signed Nothing _ (m:ms) + = (mkLeafPlan signed Nothing m , [ (fst (M.findMin m'), mkLeafPlan signed Nothing m') | m' <- ms ]) + +-- If we have a default, we have to interleave segments that jump +-- to the default between the maps +mkFlatSwitchPlan signed (Just l) r ms = let ((_,p1):ps) = go r ms in (p1, ps) + where + go (lo,hi) [] + | lo > hi = [] + | otherwise = [(lo, Unconditionally l)] + go (lo,hi) (m:ms) + | lo < min + = (lo, Unconditionally l) : go (min,hi) (m:ms) + | lo == min + = (lo, mkLeafPlan signed (Just l) m) : go (max+1,hi) ms + | otherwise + = pprPanic "mkFlatSwitchPlan" (integer lo <+> integer min) + where + min = fst (M.findMin m) + max = fst (M.findMax m) + + +mkLeafPlan :: Bool -> Maybe Label -> M.Map Integer Label -> SwitchPlan +mkLeafPlan signed mbdef m + | [(_,l)] <- M.toList m -- singleton map + = Unconditionally l + | otherwise + = JumpTable $ mkSwitchTargets signed (min,max) mbdef m + where + min = fst (M.findMin m) + max = fst (M.findMax m) + +--- +--- Step 4: Reduce the number of branches using == +--- + +-- A sequence of three unconditional jumps, with the outer two pointing to the +-- same value and the bounds off by exactly one can be improved +findSingleValues :: FlatSwitchPlan -> FlatSwitchPlan +findSingleValues (Unconditionally l, (i, Unconditionally l2) : (i', Unconditionally l3) : xs) + | l == l3 && i + 1 == i' + = findSingleValues (IfEqual i l2 (Unconditionally l), xs) +findSingleValues (p, (i,p'):xs) + = (p,i) `consSL` findSingleValues (p', xs) +findSingleValues (p, []) + = (p, []) + +--- +--- Step 5: Actually build the tree +--- + +-- Build a balanced tree from a separated list +buildTree :: Bool -> FlatSwitchPlan -> SwitchPlan +buildTree _ (p,[]) = p +buildTree signed sl = IfLT signed m (buildTree signed sl1) (buildTree signed sl2) + where + (sl1, m, sl2) = divideSL sl + + + +-- +-- Utility data type: Non-empty lists with extra markers in between each +-- element: +-- + +type SeparatedList b a = (a, [(b,a)]) + +consSL :: (a, b) -> SeparatedList b a -> SeparatedList b a +consSL (a, b) (a', xs) = (a, (b,a'):xs) + +divideSL :: SeparatedList b a -> (SeparatedList b a, b, SeparatedList b a) +divideSL (_,[]) = error "divideSL: Singleton SeparatedList" +divideSL (p,xs) = ((p, xs1), m, (p', xs2)) + where + (xs1, (m,p'):xs2) = splitAt (length xs `div` 2) xs + +-- +-- Other Utilities +-- + +restrictMap :: (Integer,Integer) -> M.Map Integer b -> M.Map Integer b +restrictMap (lo,hi) m = mid + where (_, mid_hi) = M.split (lo-1) m + (mid, _) = M.split (hi+1) mid_hi + +-- for example: reassocTuples a [(b,c),(d,e)] f == [(a,b),(c,d),(e,f)] +reassocTuples :: a -> [(a,a)] -> a -> [(a,a)] +reassocTuples initial [] last + = [(initial,last)] +reassocTuples initial ((a,b):tuples) last + = (initial,a) : reassocTuples b tuples last + +-- Note [GHC.Cmm.Switch vs. GHC.Cmm.Switch.Implement] +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-- I (Joachim) separated the two somewhat closely related modules +-- +-- - GHC.Cmm.Switch, which provides the CmmSwitchTargets type and contains the strategy +-- for implementing a Cmm switch (createSwitchPlan), and +-- - GHC.Cmm.Switch.Implement, which contains the actual Cmm graph modification, +-- +-- for these reasons: +-- +-- * GHC.Cmm.Switch is very low in the dependency tree, i.e. 
+--    it does not depend on any GHC-specific modules at all (with the
+--    exception of Outputable and GHC.Cmm.Dataflow (Label)).
+--  * GHC.Cmm.Switch.Implement is the Cmm transformation and hence very
+--    high in the dependency tree.
+--  * GHC.Cmm.Switch provides the CmmSwitchTargets data type, which is
+--    abstract, but used in GHC.Cmm.Node.
+--  * Because GHC.Cmm.Switch is low in the dependency tree, the separation
+--    allows for more parallelism when building GHC.
+--  * The interaction between the modules is very explicit and easy to
+--    understand, due to the small and simple interface.
diff --git a/compiler/GHC/Cmm/Switch/Implement.hs b/compiler/GHC/Cmm/Switch/Implement.hs
new file mode 100644
index 0000000000..dfac116764
--- /dev/null
+++ b/compiler/GHC/Cmm/Switch/Implement.hs
@@ -0,0 +1,116 @@
+{-# LANGUAGE GADTs #-}
+module GHC.Cmm.Switch.Implement
+  ( cmmImplementSwitchPlans
+  )
+where
+
+import GhcPrelude
+
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import UniqSupply
+import DynFlags
+
+--
+-- This module replaces Switch statements as generated by the Stg -> Cmm
+-- transformation, which might be huge and sparse and hence unsuitable for
+-- assembly code, by proper constructs (if-then-else trees, dense jump
+-- tables).
+--
+-- The actual, abstract strategy is determined by createSwitchPlan in
+-- GHC.Cmm.Switch and returned as a SwitchPlan; this module contains just
+-- the implementation in terms of Cmm code. See Note [Cmm Switches, the
+-- general plan] in GHC.Cmm.Switch.
+--
+-- This division into different modules serves both to clearly separate
+-- concerns and because createSwitchPlan needs access to the constructors
+-- of SwitchTargets, a data type exported abstractly by GHC.Cmm.Switch.
+--
+
+-- | Traverses the 'CmmGraph', making sure that 'CmmSwitch' nodes are
+-- suitable for code generation.
+cmmImplementSwitchPlans :: DynFlags -> CmmGraph -> UniqSM CmmGraph
+cmmImplementSwitchPlans dflags g
+    -- Switch generation done by backend (LLVM/C)
+  | targetSupportsSwitch (hscTarget dflags) = return g
+  | otherwise = do
+    blocks' <- concat `fmap` mapM (visitSwitches dflags) (toBlockList g)
+    return $ ofBlockList (g_entry g) blocks'
+
+visitSwitches :: DynFlags -> CmmBlock -> UniqSM [CmmBlock]
+visitSwitches dflags block
+  | (entry@(CmmEntry _ scope), middle, CmmSwitch vanillaExpr ids) <- blockSplit block
+  = do
+    let plan = createSwitchPlan ids
+    -- See Note [Floating switch expressions]
+    (assignSimple, simpleExpr) <- floatSwitchExpr dflags vanillaExpr
+
+    (newTail, newBlocks) <- implementSwitchPlan dflags scope simpleExpr plan
+
+    let block' = entry `blockJoinHead` middle `blockAppend` assignSimple `blockAppend` newTail
+
+    return $ block' : newBlocks
+
+  | otherwise
+  = return [block]
+
+-- Note [Floating switch expressions]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-- When we translate a sparse switch into a search tree we would like
+-- to compute the value we compare against only once.
+
+-- For this purpose we assign the switch expression to a local register
+-- and then use this register when constructing the actual binary tree.
+
+-- This is important as the expression could contain expensive code like
+-- memory loads or divisions, which we REALLY don't want to duplicate.
+
+-- This happened in parts of the handwritten RTS Cmm code. See also #16933.
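+
+-- As a small illustration (the register name and rendering below are
+-- schematic, not taken from an actual dump): if the scrutinee is the
+-- load I64[Sp + 8], we first emit
+--     _u1::I64 = I64[Sp + 8];
+-- and then build the whole if-then-else tree over _u1, so the memory
+-- load happens exactly once.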
+
+-- See Note [Floating switch expressions]
+floatSwitchExpr :: DynFlags -> CmmExpr -> UniqSM (Block CmmNode O O, CmmExpr)
+floatSwitchExpr _ reg@(CmmReg {}) = return (emptyBlock, reg)
+floatSwitchExpr dflags expr = do
+  (assign, expr') <- cmmMkAssign dflags expr <$> getUniqueM
+  return (BMiddle assign, expr')
+
+
+-- Implementing a switch plan (returning a tail block)
+implementSwitchPlan :: DynFlags -> CmmTickScope -> CmmExpr -> SwitchPlan -> UniqSM (Block CmmNode O C, [CmmBlock])
+implementSwitchPlan dflags scope expr = go
+  where
+    go (Unconditionally l)
+      = return (emptyBlock `blockJoinTail` CmmBranch l, [])
+    go (JumpTable ids)
+      = return (emptyBlock `blockJoinTail` CmmSwitch expr ids, [])
+    go (IfLT signed i ids1 ids2)
+      = do
+        (bid1, newBlocks1) <- go' ids1
+        (bid2, newBlocks2) <- go' ids2
+
+        let lt | signed    = cmmSLtWord
+               | otherwise = cmmULtWord
+            scrut = lt dflags expr $ CmmLit $ mkWordCLit dflags i
+            lastNode = CmmCondBranch scrut bid1 bid2 Nothing
+            lastBlock = emptyBlock `blockJoinTail` lastNode
+        return (lastBlock, newBlocks1 ++ newBlocks2)
+    go (IfEqual i l ids2)
+      = do
+        (bid2, newBlocks2) <- go' ids2
+
+        let scrut = cmmNeWord dflags expr $ CmmLit $ mkWordCLit dflags i
+            lastNode = CmmCondBranch scrut bid2 l Nothing
+            lastBlock = emptyBlock `blockJoinTail` lastNode
+        return (lastBlock, newBlocks2)
+
+    -- Same, but returning a label to branch to
+    go' (Unconditionally l)
+      = return (l, [])
+    go' p
+      = do
+        bid <- mkBlockId `fmap` getUniqueM
+        (last, newBlocks) <- go p
+        let block = CmmEntry bid scope `blockJoinHead` last
+        return (bid, block : newBlocks)
diff --git a/compiler/GHC/Cmm/Type.hs b/compiler/GHC/Cmm/Type.hs
new file mode 100644
index 0000000000..867a260078
--- /dev/null
+++ b/compiler/GHC/Cmm/Type.hs
@@ -0,0 +1,432 @@
+module GHC.Cmm.Type
+  ( CmmType -- Abstract
+  , b8, b16, b32, b64, b128, b256, b512, f32, f64, bWord, bHalfWord, gcWord
+  , cInt
+  , cmmBits, cmmFloat
+  , typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
+  , isFloatType, isGcPtrType, isBitsType
+  , isWord32, isWord64, isFloat64, isFloat32
+
+  , Width(..)
+  , widthInBits, widthInBytes, widthInLog, widthFromBytes
+  , wordWidth, halfWordWidth, cIntWidth
+  , halfWordMask
+  , narrowU, narrowS
+  , rEP_CostCentreStack_mem_alloc
+  , rEP_CostCentreStack_scc_count
+  , rEP_StgEntCounter_allocs
+  , rEP_StgEntCounter_allocd
+
+  , ForeignHint(..)
+
+  , Length
+  , vec, vec2, vec4, vec8, vec16
+  , vec2f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8
+  , cmmVec
+  , vecLength, vecElemType
+  , isVecType
+  )
+where
+
+
+import GhcPrelude
+
+import DynFlags
+import FastString
+import Outputable
+
+import Data.Word
+import Data.Int
+
+-----------------------------------------------------------------------------
+--              CmmType
+-----------------------------------------------------------------------------
+
+  -- NOTE: CmmType is an abstract type, not exported from this
+  --       module, so you can easily change its representation
+  --
+  -- However, Width is exported in a concrete way,
+  -- and is used extensively in pattern-matching
+
+data CmmType    -- The important one!
+  = CmmType CmmCat Width
+
+data CmmCat                -- "Category" (not exported)
+   = GcPtrCat              -- GC pointer
+   | BitsCat               -- Non-pointer
+   | FloatCat              -- Float
+   | VecCat Length CmmCat  -- Vector
+   deriving( Eq )
+        -- See Note [Signed vs unsigned] at the end
+
+instance Outputable CmmType where
+  ppr (CmmType cat wid) = ppr cat <> ppr (widthInBits wid)
+
+instance Outputable CmmCat where
+  ppr FloatCat       = text "F"
+  ppr GcPtrCat       = text "P"
+  ppr BitsCat        = text "I"
+  ppr (VecCat n cat) = ppr cat <> text "x" <> ppr n <> text "V"
+
+-- Why is CmmType stratified? For native code generation,
+-- most of the time you just want to know what sort of register
+-- to put the thing in, and for this you need to know how
+-- many bits the thing has, and whether it goes in a floating-point
+-- register. By contrast, the distinction between GcPtr and
+-- GcNonPtr is of interest to only a few parts of the code generator.
+
+-------- Equality on CmmType --------------
+-- CmmType is *not* an instance of Eq; sometimes we care about the
+-- Gc/NonGc distinction, and sometimes we don't
+-- So we use an explicit function to force you to think about it
+cmmEqType :: CmmType -> CmmType -> Bool -- Exact equality
+cmmEqType (CmmType c1 w1) (CmmType c2 w2) = c1 == c2 && w1 == w2
+
+cmmEqType_ignoring_ptrhood :: CmmType -> CmmType -> Bool
+  -- This equality is temporary; used in CmmLint
+  -- but the RTS files are not yet well-typed wrt pointers
+cmmEqType_ignoring_ptrhood (CmmType c1 w1) (CmmType c2 w2)
+   = c1 `weak_eq` c2 && w1 == w2
+   where
+     weak_eq :: CmmCat -> CmmCat -> Bool
+     FloatCat         `weak_eq` FloatCat         = True
+     FloatCat         `weak_eq` _other           = False
+     _other           `weak_eq` FloatCat         = False
+     (VecCat l1 cat1) `weak_eq` (VecCat l2 cat2) = l1 == l2
+                                                   && cat1 `weak_eq` cat2
+     (VecCat {})      `weak_eq` _other           = False
+     _other           `weak_eq` (VecCat {})      = False
+     _word1           `weak_eq` _word2           = True -- Ignores GcPtr
+
+--- Simple operations on CmmType -----
+typeWidth :: CmmType -> Width
+typeWidth (CmmType _ w) = w
+
+cmmBits, cmmFloat :: Width -> CmmType
+cmmBits  = CmmType BitsCat
+cmmFloat = CmmType FloatCat
+
+-------- Common CmmTypes ------------
+-- Floats and words of specific widths
+b8, b16, b32, b64, b128, b256, b512, f32, f64 :: CmmType
+b8   = cmmBits W8
+b16  = cmmBits W16
+b32  = cmmBits W32
+b64  = cmmBits W64
+b128 = cmmBits W128
+b256 = cmmBits W256
+b512 = cmmBits W512
+f32  = cmmFloat W32
+f64  = cmmFloat W64
+
+-- CmmTypes of native word widths
+bWord :: DynFlags -> CmmType
+bWord dflags = cmmBits (wordWidth dflags)
+
+bHalfWord :: DynFlags -> CmmType
+bHalfWord dflags = cmmBits (halfWordWidth dflags)
+
+gcWord :: DynFlags -> CmmType
+gcWord dflags = CmmType GcPtrCat (wordWidth dflags)
+
+cInt :: DynFlags -> CmmType
+cInt dflags = cmmBits (cIntWidth dflags)
+
+------------ Predicates ----------------
+isFloatType, isGcPtrType, isBitsType :: CmmType -> Bool
+isFloatType (CmmType FloatCat _) = True
+isFloatType _other               = False
+
+isGcPtrType (CmmType GcPtrCat _) = True
+isGcPtrType _other               = False
+
+isBitsType (CmmType BitsCat _) = True
+isBitsType _                   = False
+
+isWord32, isWord64, isFloat32, isFloat64 :: CmmType -> Bool
+-- isWord64 is true of 64-bit non-floats (both gc-ptrs and otherwise)
+-- isFloat32 and 64 are obvious
+
+isWord64 (CmmType BitsCat  W64) = True
+isWord64 (CmmType GcPtrCat W64) = True
+isWord64 _other                 = False
+
+isWord32 (CmmType BitsCat  W32) = True
+isWord32 (CmmType GcPtrCat W32) = True
+isWord32 _other                 = False
+
+isFloat32 (CmmType FloatCat W32) = True
+isFloat32 _other                 = False
+
+isFloat64 (CmmType FloatCat W64) = True
+isFloat64 _other                 = False
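+
+-- A few concrete cases (assuming a 64-bit target; illustrative only):
+--   ppr b32              renders as I32
+--   ppr (gcWord dflags)  renders as P64
+--   ppr vec4f32          renders as Fx4V128
+-- and cmmEqType (gcWord dflags) (bWord dflags) is False, while
+-- cmmEqType_ignoring_ptrhood treats the two as equal, since both are
+-- word-sized and neither is a float.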
+
+-----------------------------------------------------------------------------
+--              Width
+-----------------------------------------------------------------------------
+
+data Width = W8 | W16 | W32 | W64
+           | W128
+           | W256
+           | W512
+           deriving (Eq, Ord, Show)
+
+instance Outputable Width where
+  ppr rep = ptext (mrStr rep)
+
+mrStr :: Width -> PtrString
+mrStr = sLit . show
+
+
+-------- Common Widths ------------
+wordWidth :: DynFlags -> Width
+wordWidth dflags
+  | wORD_SIZE dflags == 4 = W32
+  | wORD_SIZE dflags == 8 = W64
+  | otherwise             = panic "MachOp.wordRep: Unknown word size"
+
+halfWordWidth :: DynFlags -> Width
+halfWordWidth dflags
+  | wORD_SIZE dflags == 4 = W16
+  | wORD_SIZE dflags == 8 = W32
+  | otherwise             = panic "MachOp.halfWordRep: Unknown word size"
+
+halfWordMask :: DynFlags -> Integer
+halfWordMask dflags
+  | wORD_SIZE dflags == 4 = 0xFFFF
+  | wORD_SIZE dflags == 8 = 0xFFFFFFFF
+  | otherwise             = panic "MachOp.halfWordMask: Unknown word size"
+
+-- cIntWidth is the Width for a C-language 'int'
+cIntWidth :: DynFlags -> Width
+cIntWidth dflags = case cINT_SIZE dflags of
+                     4 -> W32
+                     8 -> W64
+                     s -> panic ("cIntWidth: Unknown cINT_SIZE: " ++ show s)
+
+widthInBits :: Width -> Int
+widthInBits W8   = 8
+widthInBits W16  = 16
+widthInBits W32  = 32
+widthInBits W64  = 64
+widthInBits W128 = 128
+widthInBits W256 = 256
+widthInBits W512 = 512
+
+
+widthInBytes :: Width -> Int
+widthInBytes W8   = 1
+widthInBytes W16  = 2
+widthInBytes W32  = 4
+widthInBytes W64  = 8
+widthInBytes W128 = 16
+widthInBytes W256 = 32
+widthInBytes W512 = 64
+
+
+widthFromBytes :: Int -> Width
+widthFromBytes 1  = W8
+widthFromBytes 2  = W16
+widthFromBytes 4  = W32
+widthFromBytes 8  = W64
+widthFromBytes 16 = W128
+widthFromBytes 32 = W256
+widthFromBytes 64 = W512
+
+widthFromBytes n = pprPanic "no width for given number of bytes" (ppr n)
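+
+-- For instance, widthInBytes W64 == 8 and widthFromBytes 8 == W64; the
+-- two functions are inverses on the widths listed above, and anything
+-- else (e.g. widthFromBytes 3) is a panic.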
+
+-- log_2 of the width in bytes, useful for generating shifts.
+widthInLog :: Width -> Int
+widthInLog W8   = 0
+widthInLog W16  = 1
+widthInLog W32  = 2
+widthInLog W64  = 3
+widthInLog W128 = 4
+widthInLog W256 = 5
+widthInLog W512 = 6
+
+
+-- widening / narrowing
+
+narrowU :: Width -> Integer -> Integer
+narrowU W8  x = fromIntegral (fromIntegral x :: Word8)
+narrowU W16 x = fromIntegral (fromIntegral x :: Word16)
+narrowU W32 x = fromIntegral (fromIntegral x :: Word32)
+narrowU W64 x = fromIntegral (fromIntegral x :: Word64)
+narrowU _ _ = panic "narrowU"
+
+narrowS :: Width -> Integer -> Integer
+narrowS W8  x = fromIntegral (fromIntegral x :: Int8)
+narrowS W16 x = fromIntegral (fromIntegral x :: Int16)
+narrowS W32 x = fromIntegral (fromIntegral x :: Int32)
+narrowS W64 x = fromIntegral (fromIntegral x :: Int64)
+narrowS _ _ = panic "narrowS"
+
+-----------------------------------------------------------------------------
+--              SIMD
+-----------------------------------------------------------------------------
+
+type Length = Int
+
+vec :: Length -> CmmType -> CmmType
+vec l (CmmType cat w) = CmmType (VecCat l cat) vecw
+  where
+    vecw :: Width
+    vecw = widthFromBytes (l * widthInBytes w)
+
+vec2, vec4, vec8, vec16 :: CmmType -> CmmType
+vec2  = vec 2
+vec4  = vec 4
+vec8  = vec 8
+vec16 = vec 16
+
+vec2f64, vec2b64, vec4f32, vec4b32, vec8b16, vec16b8 :: CmmType
+vec2f64 = vec 2 f64
+vec2b64 = vec 2 b64
+vec4f32 = vec 4 f32
+vec4b32 = vec 4 b32
+vec8b16 = vec 8 b16
+vec16b8 = vec 16 b8
+
+cmmVec :: Int -> CmmType -> CmmType
+cmmVec n (CmmType cat w) =
+  CmmType (VecCat n cat) (widthFromBytes (n * widthInBytes w))
+
+vecLength :: CmmType -> Length
+vecLength (CmmType (VecCat l _) _) = l
+vecLength _                        = panic "vecLength: not a vector"
+
+vecElemType :: CmmType -> CmmType
+vecElemType (CmmType (VecCat l cat) w) = CmmType cat scalw
+  where
+    scalw :: Width
+    scalw = widthFromBytes (widthInBytes w `div` l)
+vecElemType _ = panic "vecElemType: not a vector"
+
+isVecType :: CmmType -> Bool
+isVecType (CmmType (VecCat {}) _) = True
+isVecType _                       = False
+
+-------------------------------------------------------------------------
+-- Hints
+
+-- Hints are extra type information we attach to the arguments and
+-- results of a foreign call, where more type information is sometimes
+-- needed by the ABI to make the correct kind of call.
+
+data ForeignHint
+  = NoHint | AddrHint | SignedHint
+  deriving( Eq )
+        -- Used to give extra per-argument or per-result
+        -- information needed by foreign calling conventions
+
+-------------------------------------------------------------------------
+
+-- These don't really belong here, but I don't know where best to
+-- put them.
+
+rEP_CostCentreStack_mem_alloc :: DynFlags -> CmmType
+rEP_CostCentreStack_mem_alloc dflags
+  = cmmBits (widthFromBytes (pc_REP_CostCentreStack_mem_alloc pc))
+  where pc = platformConstants dflags
+
+rEP_CostCentreStack_scc_count :: DynFlags -> CmmType
+rEP_CostCentreStack_scc_count dflags
+  = cmmBits (widthFromBytes (pc_REP_CostCentreStack_scc_count pc))
+  where pc = platformConstants dflags
+
+rEP_StgEntCounter_allocs :: DynFlags -> CmmType
+rEP_StgEntCounter_allocs dflags
+  = cmmBits (widthFromBytes (pc_REP_StgEntCounter_allocs pc))
+  where pc = platformConstants dflags
+
+rEP_StgEntCounter_allocd :: DynFlags -> CmmType
+rEP_StgEntCounter_allocd dflags
+  = cmmBits (widthFromBytes (pc_REP_StgEntCounter_allocd pc))
+  where pc = platformConstants dflags
+
+-------------------------------------------------------------------------
+{- Note [Signed vs unsigned]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~
+Should a CmmType include a signed vs. unsigned distinction?
+
+This is very much like a "hint" in C-- terminology: it isn't necessary
+in order to generate correct code, but it might be useful in that the
+compiler can generate better code if it has access to higher-level
+hints about data. This is important at call boundaries, because the
+definition of a function is not visible at all of its call sites, so
+the compiler cannot infer the hints.
+
+Here in Cmm, we're taking a slightly different approach. We include
+the int vs. float hint in the CmmType, because (a) the majority of
+platforms have a strong distinction between float and int registers,
+and (b) we don't want to do any heavyweight hint-inference in the
+native code backend in order to get good code. We're treating the
+hint more like a type: our Cmm is always completely consistent with
+respect to hints. All coercions between float and int are explicit.
+
+What about the signed vs. unsigned hint? This information might be
+useful if we want to keep sub-word-sized values in word-size
+registers, which we must do if we only have word-sized registers.
+
+On such a system, there are two straightforward conventions for
+representing sub-word-sized values:
+
+(a) Leave the upper bits undefined. Comparison operations must
+    sign- or zero-extend both operands before comparing them,
+    depending on whether the comparison is signed or unsigned.
+
+(b) Always keep the values sign- or zero-extended as appropriate.
+    Arithmetic operations must narrow the result to the appropriate
+    size.
+
+A clever compiler might not use either (a) or (b) exclusively; instead
+it would attempt to minimize the coercions by analysis: the same kind
+of analysis that propagates hints around. In Cmm we don't want to
+have to do this, so we plump for having richer types and keeping the
+type information consistent.
+
+If signed/unsigned hints are missing from CmmType, then the only
+choice we have is (a), because we don't know whether the result of an
+operation should be sign- or zero-extended.
+
+Many architectures have extending load operations, which work well
+with (b). To make use of them with (a), you need to know whether the
+value is going to be sign- or zero-extended by an enclosing comparison
+(for example), which involves knowing about the context. This is
+doable but more complex.
+
+Further complicating the issue is foreign calls: a foreign calling
+convention can specify that signed 8-bit quantities are passed as
+sign-extended 32 bit quantities, for example (this is the case on the
+PowerPC). So we *do* need sign information on foreign call arguments.
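+
+As a tiny illustration of the two conventions (the operation names
+here are made up): under (a), a signed comparison of two 8-bit values
+held in word-sized registers must extend both operands first, roughly
+    slt( sx8(x), sx8(y) )
+whereas under (b) both values are already kept sign-extended, so a
+plain slt(x, y) suffices, at the cost of narrowing after every
+arithmetic operation.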
+ +Pros for adding signed vs. unsigned to CmmType: + + - It would let us use convention (b) above, and get easier + code generation for extending loads. + + - Less information required on foreign calls. + + - MachOp type would be simpler + +Cons: + + - More complexity + + - What is the CmmType for a VanillaReg? Currently it is + always wordRep, but now we have to decide whether it is + signed or unsigned. The same VanillaReg can thus have + different CmmType in different parts of the program. + + - Extra coercions cluttering up expressions. + +Currently for GHC, the foreign call point is moot, because we do our +own promotion of sub-word-sized values to word-sized values. The Int8 +type is represented by an Int# which is kept sign-extended at all times +(this is slightly naughty, because we're making assumptions about the +C calling convention rather early on in the compiler). However, given +this, the cons outweigh the pros. + +-} + diff --git a/compiler/GHC/Cmm/Utils.hs b/compiler/GHC/Cmm/Utils.hs new file mode 100644 index 0000000000..d879c7b82f --- /dev/null +++ b/compiler/GHC/Cmm/Utils.hs @@ -0,0 +1,607 @@ +{-# LANGUAGE GADTs, RankNTypes #-} +{-# LANGUAGE BangPatterns #-} + +----------------------------------------------------------------------------- +-- +-- Cmm utilities. +-- +-- (c) The University of Glasgow 2004-2006 +-- +----------------------------------------------------------------------------- + +module GHC.Cmm.Utils( + -- CmmType + primRepCmmType, slotCmmType, slotForeignHint, + typeCmmType, typeForeignHint, primRepForeignHint, + + -- CmmLit + zeroCLit, mkIntCLit, + mkWordCLit, packHalfWordsCLit, + mkByteStringCLit, + mkDataLits, mkRODataLits, + mkStgWordCLit, + + -- CmmExpr + mkIntExpr, zeroExpr, + mkLblExpr, + cmmRegOff, cmmOffset, cmmLabelOff, cmmOffsetLit, cmmOffsetExpr, + cmmRegOffB, cmmOffsetB, cmmLabelOffB, cmmOffsetLitB, cmmOffsetExprB, + cmmRegOffW, cmmOffsetW, cmmLabelOffW, cmmOffsetLitW, cmmOffsetExprW, + cmmIndex, cmmIndexExpr, cmmLoadIndex, cmmLoadIndexW, + cmmNegate, + cmmULtWord, cmmUGeWord, cmmUGtWord, cmmUShrWord, + cmmSLtWord, + cmmNeWord, cmmEqWord, + cmmOrWord, cmmAndWord, + cmmSubWord, cmmAddWord, cmmMulWord, cmmQuotWord, + cmmToWord, + + cmmMkAssign, + + isTrivialCmmExpr, hasNoGlobalRegs, isLit, isComparisonExpr, + + baseExpr, spExpr, hpExpr, spLimExpr, hpLimExpr, + currentTSOExpr, currentNurseryExpr, cccsExpr, + + -- Statics + blankWord, + + -- Tagging + cmmTagMask, cmmPointerMask, cmmUntag, cmmIsTagged, + cmmConstrTag1, + + -- Overlap and usage + regsOverlap, regUsedIn, + + -- Liveness and bitmaps + mkLiveness, + + -- * Operations that probably don't belong here + modifyGraph, + + ofBlockMap, toBlockMap, + ofBlockList, toBlockList, bodyToBlockList, + toBlockListEntryFirst, toBlockListEntryFirstFalseFallthrough, + foldlGraphBlocks, mapGraphNodes, revPostorder, mapGraphNodes1, + + -- * Ticks + blockTicks + ) where + +import GhcPrelude + +import TyCon ( PrimRep(..), PrimElemRep(..) 
) +import GHC.Types.RepType ( UnaryType, SlotTy (..), typePrimRep1 ) + +import GHC.Runtime.Layout +import GHC.Cmm +import GHC.Cmm.BlockId +import GHC.Cmm.CLabel +import Outputable +import DynFlags +import Unique +import GHC.Platform.Regs + +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS +import Data.Bits +import GHC.Cmm.Dataflow.Graph +import GHC.Cmm.Dataflow.Label +import GHC.Cmm.Dataflow.Block +import GHC.Cmm.Dataflow.Collections + +--------------------------------------------------- +-- +-- CmmTypes +-- +--------------------------------------------------- + +primRepCmmType :: DynFlags -> PrimRep -> CmmType +primRepCmmType _ VoidRep = panic "primRepCmmType:VoidRep" +primRepCmmType dflags LiftedRep = gcWord dflags +primRepCmmType dflags UnliftedRep = gcWord dflags +primRepCmmType dflags IntRep = bWord dflags +primRepCmmType dflags WordRep = bWord dflags +primRepCmmType _ Int8Rep = b8 +primRepCmmType _ Word8Rep = b8 +primRepCmmType _ Int16Rep = b16 +primRepCmmType _ Word16Rep = b16 +primRepCmmType _ Int32Rep = b32 +primRepCmmType _ Word32Rep = b32 +primRepCmmType _ Int64Rep = b64 +primRepCmmType _ Word64Rep = b64 +primRepCmmType dflags AddrRep = bWord dflags +primRepCmmType _ FloatRep = f32 +primRepCmmType _ DoubleRep = f64 +primRepCmmType _ (VecRep len rep) = vec len (primElemRepCmmType rep) + +slotCmmType :: DynFlags -> SlotTy -> CmmType +slotCmmType dflags PtrSlot = gcWord dflags +slotCmmType dflags WordSlot = bWord dflags +slotCmmType _ Word64Slot = b64 +slotCmmType _ FloatSlot = f32 +slotCmmType _ DoubleSlot = f64 + +primElemRepCmmType :: PrimElemRep -> CmmType +primElemRepCmmType Int8ElemRep = b8 +primElemRepCmmType Int16ElemRep = b16 +primElemRepCmmType Int32ElemRep = b32 +primElemRepCmmType Int64ElemRep = b64 +primElemRepCmmType Word8ElemRep = b8 +primElemRepCmmType Word16ElemRep = b16 +primElemRepCmmType Word32ElemRep = b32 +primElemRepCmmType Word64ElemRep = b64 +primElemRepCmmType FloatElemRep = f32 +primElemRepCmmType DoubleElemRep = f64 + +typeCmmType :: DynFlags -> UnaryType -> CmmType +typeCmmType dflags ty = primRepCmmType dflags (typePrimRep1 ty) + +primRepForeignHint :: PrimRep -> ForeignHint +primRepForeignHint VoidRep = panic "primRepForeignHint:VoidRep" +primRepForeignHint LiftedRep = AddrHint +primRepForeignHint UnliftedRep = AddrHint +primRepForeignHint IntRep = SignedHint +primRepForeignHint Int8Rep = SignedHint +primRepForeignHint Int16Rep = SignedHint +primRepForeignHint Int32Rep = SignedHint +primRepForeignHint Int64Rep = SignedHint +primRepForeignHint WordRep = NoHint +primRepForeignHint Word8Rep = NoHint +primRepForeignHint Word16Rep = NoHint +primRepForeignHint Word32Rep = NoHint +primRepForeignHint Word64Rep = NoHint +primRepForeignHint AddrRep = AddrHint -- NB! AddrHint, but NonPtrArg +primRepForeignHint FloatRep = NoHint +primRepForeignHint DoubleRep = NoHint +primRepForeignHint (VecRep {}) = NoHint + +slotForeignHint :: SlotTy -> ForeignHint +slotForeignHint PtrSlot = AddrHint +slotForeignHint WordSlot = NoHint +slotForeignHint Word64Slot = NoHint +slotForeignHint FloatSlot = NoHint +slotForeignHint DoubleSlot = NoHint + +typeForeignHint :: UnaryType -> ForeignHint +typeForeignHint = primRepForeignHint . typePrimRep1 + +--------------------------------------------------- +-- +-- CmmLit +-- +--------------------------------------------------- + +-- XXX: should really be Integer, since Int doesn't necessarily cover +-- the full range of target Ints. 
+mkIntCLit :: DynFlags -> Int -> CmmLit +mkIntCLit dflags i = CmmInt (toInteger i) (wordWidth dflags) + +mkIntExpr :: DynFlags -> Int -> CmmExpr +mkIntExpr dflags i = CmmLit $! mkIntCLit dflags i + +zeroCLit :: DynFlags -> CmmLit +zeroCLit dflags = CmmInt 0 (wordWidth dflags) + +zeroExpr :: DynFlags -> CmmExpr +zeroExpr dflags = CmmLit (zeroCLit dflags) + +mkWordCLit :: DynFlags -> Integer -> CmmLit +mkWordCLit dflags wd = CmmInt wd (wordWidth dflags) + +mkByteStringCLit + :: CLabel -> ByteString -> (CmmLit, GenCmmDecl CmmStatics info stmt) +-- We have to make a top-level decl for the string, +-- and return a literal pointing to it +mkByteStringCLit lbl bytes + = (CmmLabel lbl, CmmData (Section sec lbl) $ Statics lbl [CmmString bytes]) + where + -- This can not happen for String literals (as there \NUL is replaced by + -- C0 80). However, it can happen with Addr# literals. + sec = if 0 `BS.elem` bytes then ReadOnlyData else CString + +mkDataLits :: Section -> CLabel -> [CmmLit] -> GenCmmDecl CmmStatics info stmt +-- Build a data-segment data block +mkDataLits section lbl lits + = CmmData section (Statics lbl $ map CmmStaticLit lits) + +mkRODataLits :: CLabel -> [CmmLit] -> GenCmmDecl CmmStatics info stmt +-- Build a read-only data block +mkRODataLits lbl lits + = mkDataLits section lbl lits + where + section | any needsRelocation lits = Section RelocatableReadOnlyData lbl + | otherwise = Section ReadOnlyData lbl + needsRelocation (CmmLabel _) = True + needsRelocation (CmmLabelOff _ _) = True + needsRelocation _ = False + +mkStgWordCLit :: DynFlags -> StgWord -> CmmLit +mkStgWordCLit dflags wd = CmmInt (fromStgWord wd) (wordWidth dflags) + +packHalfWordsCLit :: DynFlags -> StgHalfWord -> StgHalfWord -> CmmLit +-- Make a single word literal in which the lower_half_word is +-- at the lower address, and the upper_half_word is at the +-- higher address +-- ToDo: consider using half-word lits instead +-- but be careful: that's vulnerable when reversed +packHalfWordsCLit dflags lower_half_word upper_half_word + = if wORDS_BIGENDIAN dflags + then mkWordCLit dflags ((l `shiftL` halfWordSizeInBits dflags) .|. u) + else mkWordCLit dflags (l .|. 
(u `shiftL` halfWordSizeInBits dflags)) + where l = fromStgHalfWord lower_half_word + u = fromStgHalfWord upper_half_word + +--------------------------------------------------- +-- +-- CmmExpr +-- +--------------------------------------------------- + +mkLblExpr :: CLabel -> CmmExpr +mkLblExpr lbl = CmmLit (CmmLabel lbl) + +cmmOffsetExpr :: DynFlags -> CmmExpr -> CmmExpr -> CmmExpr +-- assumes base and offset have the same CmmType +cmmOffsetExpr dflags e (CmmLit (CmmInt n _)) = cmmOffset dflags e (fromInteger n) +cmmOffsetExpr dflags e byte_off = CmmMachOp (MO_Add (cmmExprWidth dflags e)) [e, byte_off] + +cmmOffset :: DynFlags -> CmmExpr -> Int -> CmmExpr +cmmOffset _ e 0 = e +cmmOffset _ (CmmReg reg) byte_off = cmmRegOff reg byte_off +cmmOffset _ (CmmRegOff reg m) byte_off = cmmRegOff reg (m+byte_off) +cmmOffset _ (CmmLit lit) byte_off = CmmLit (cmmOffsetLit lit byte_off) +cmmOffset _ (CmmStackSlot area off) byte_off + = CmmStackSlot area (off - byte_off) + -- note stack area offsets increase towards lower addresses +cmmOffset _ (CmmMachOp (MO_Add rep) [expr, CmmLit (CmmInt byte_off1 _rep)]) byte_off2 + = CmmMachOp (MO_Add rep) + [expr, CmmLit (CmmInt (byte_off1 + toInteger byte_off2) rep)] +cmmOffset dflags expr byte_off + = CmmMachOp (MO_Add width) [expr, CmmLit (CmmInt (toInteger byte_off) width)] + where + width = cmmExprWidth dflags expr + +-- Smart constructor for CmmRegOff. Same caveats as cmmOffset above. +cmmRegOff :: CmmReg -> Int -> CmmExpr +cmmRegOff reg 0 = CmmReg reg +cmmRegOff reg byte_off = CmmRegOff reg byte_off + +cmmOffsetLit :: CmmLit -> Int -> CmmLit +cmmOffsetLit (CmmLabel l) byte_off = cmmLabelOff l byte_off +cmmOffsetLit (CmmLabelOff l m) byte_off = cmmLabelOff l (m+byte_off) +cmmOffsetLit (CmmLabelDiffOff l1 l2 m w) byte_off + = CmmLabelDiffOff l1 l2 (m+byte_off) w +cmmOffsetLit (CmmInt m rep) byte_off = CmmInt (m + fromIntegral byte_off) rep +cmmOffsetLit _ byte_off = pprPanic "cmmOffsetLit" (ppr byte_off) + +cmmLabelOff :: CLabel -> Int -> CmmLit +-- Smart constructor for CmmLabelOff +cmmLabelOff lbl 0 = CmmLabel lbl +cmmLabelOff lbl byte_off = CmmLabelOff lbl byte_off + +-- | Useful for creating an index into an array, with a statically known offset. +-- The type is the element type; used for making the multiplier +cmmIndex :: DynFlags + -> Width -- Width w + -> CmmExpr -- Address of vector of items of width w + -> Int -- Which element of the vector (0 based) + -> CmmExpr -- Address of i'th element +cmmIndex dflags width base idx = cmmOffset dflags base (idx * widthInBytes width) + +-- | Useful for creating an index into an array, with an unknown offset. 
+cmmIndexExpr :: DynFlags + -> Width -- Width w + -> CmmExpr -- Address of vector of items of width w + -> CmmExpr -- Which element of the vector (0 based) + -> CmmExpr -- Address of i'th element +cmmIndexExpr dflags width base (CmmLit (CmmInt n _)) = cmmIndex dflags width base (fromInteger n) +cmmIndexExpr dflags width base idx = + cmmOffsetExpr dflags base byte_off + where + idx_w = cmmExprWidth dflags idx + byte_off = CmmMachOp (MO_Shl idx_w) [idx, mkIntExpr dflags (widthInLog width)] + +cmmLoadIndex :: DynFlags -> CmmType -> CmmExpr -> Int -> CmmExpr +cmmLoadIndex dflags ty expr ix = CmmLoad (cmmIndex dflags (typeWidth ty) expr ix) ty + +-- The "B" variants take byte offsets +cmmRegOffB :: CmmReg -> ByteOff -> CmmExpr +cmmRegOffB = cmmRegOff + +cmmOffsetB :: DynFlags -> CmmExpr -> ByteOff -> CmmExpr +cmmOffsetB = cmmOffset + +cmmOffsetExprB :: DynFlags -> CmmExpr -> CmmExpr -> CmmExpr +cmmOffsetExprB = cmmOffsetExpr + +cmmLabelOffB :: CLabel -> ByteOff -> CmmLit +cmmLabelOffB = cmmLabelOff + +cmmOffsetLitB :: CmmLit -> ByteOff -> CmmLit +cmmOffsetLitB = cmmOffsetLit + +----------------------- +-- The "W" variants take word offsets + +cmmOffsetExprW :: DynFlags -> CmmExpr -> CmmExpr -> CmmExpr +-- The second arg is a *word* offset; need to change it to bytes +cmmOffsetExprW dflags e (CmmLit (CmmInt n _)) = cmmOffsetW dflags e (fromInteger n) +cmmOffsetExprW dflags e wd_off = cmmIndexExpr dflags (wordWidth dflags) e wd_off + +cmmOffsetW :: DynFlags -> CmmExpr -> WordOff -> CmmExpr +cmmOffsetW dflags e n = cmmOffsetB dflags e (wordsToBytes dflags n) + +cmmRegOffW :: DynFlags -> CmmReg -> WordOff -> CmmExpr +cmmRegOffW dflags reg wd_off = cmmRegOffB reg (wordsToBytes dflags wd_off) + +cmmOffsetLitW :: DynFlags -> CmmLit -> WordOff -> CmmLit +cmmOffsetLitW dflags lit wd_off = cmmOffsetLitB lit (wordsToBytes dflags wd_off) + +cmmLabelOffW :: DynFlags -> CLabel -> WordOff -> CmmLit +cmmLabelOffW dflags lbl wd_off = cmmLabelOffB lbl (wordsToBytes dflags wd_off) + +cmmLoadIndexW :: DynFlags -> CmmExpr -> Int -> CmmType -> CmmExpr +cmmLoadIndexW dflags base off ty = CmmLoad (cmmOffsetW dflags base off) ty + +----------------------- +cmmULtWord, cmmUGeWord, cmmUGtWord, cmmUShrWord, + cmmSLtWord, + cmmNeWord, cmmEqWord, + cmmOrWord, cmmAndWord, + cmmSubWord, cmmAddWord, cmmMulWord, cmmQuotWord + :: DynFlags -> CmmExpr -> CmmExpr -> CmmExpr +cmmOrWord dflags e1 e2 = CmmMachOp (mo_wordOr dflags) [e1, e2] +cmmAndWord dflags e1 e2 = CmmMachOp (mo_wordAnd dflags) [e1, e2] +cmmNeWord dflags e1 e2 = CmmMachOp (mo_wordNe dflags) [e1, e2] +cmmEqWord dflags e1 e2 = CmmMachOp (mo_wordEq dflags) [e1, e2] +cmmULtWord dflags e1 e2 = CmmMachOp (mo_wordULt dflags) [e1, e2] +cmmUGeWord dflags e1 e2 = CmmMachOp (mo_wordUGe dflags) [e1, e2] +cmmUGtWord dflags e1 e2 = CmmMachOp (mo_wordUGt dflags) [e1, e2] +cmmSLtWord dflags e1 e2 = CmmMachOp (mo_wordSLt dflags) [e1, e2] +cmmUShrWord dflags e1 e2 = CmmMachOp (mo_wordUShr dflags) [e1, e2] +cmmAddWord dflags e1 e2 = CmmMachOp (mo_wordAdd dflags) [e1, e2] +cmmSubWord dflags e1 e2 = CmmMachOp (mo_wordSub dflags) [e1, e2] +cmmMulWord dflags e1 e2 = CmmMachOp (mo_wordMul dflags) [e1, e2] +cmmQuotWord dflags e1 e2 = CmmMachOp (mo_wordUQuot dflags) [e1, e2] + +cmmNegate :: DynFlags -> CmmExpr -> CmmExpr +cmmNegate _ (CmmLit (CmmInt n rep)) = CmmLit (CmmInt (-n) rep) +cmmNegate dflags e = CmmMachOp (MO_S_Neg (cmmExprWidth dflags e)) [e] + +blankWord :: DynFlags -> CmmStatic +blankWord dflags = CmmUninitialised (wORD_SIZE dflags) + +cmmToWord :: DynFlags -> CmmExpr -> CmmExpr 
+cmmToWord dflags e
+  | w == word = e
+  | otherwise = CmmMachOp (MO_UU_Conv w word) [e]
+  where
+    w = cmmExprWidth dflags e
+    word = wordWidth dflags
+
+cmmMkAssign :: DynFlags -> CmmExpr -> Unique -> (CmmNode O O, CmmExpr)
+cmmMkAssign dflags expr uq =
+  let !ty = cmmExprType dflags expr
+      reg = (CmmLocal (LocalReg uq ty))
+  in  (CmmAssign reg expr, CmmReg reg)
+
+
+---------------------------------------------------
+--
+--      CmmExpr predicates
+--
+---------------------------------------------------
+
+isTrivialCmmExpr :: CmmExpr -> Bool
+isTrivialCmmExpr (CmmLoad _ _)      = False
+isTrivialCmmExpr (CmmMachOp _ _)    = False
+isTrivialCmmExpr (CmmLit _)         = True
+isTrivialCmmExpr (CmmReg _)         = True
+isTrivialCmmExpr (CmmRegOff _ _)    = True
+isTrivialCmmExpr (CmmStackSlot _ _) = panic "isTrivialCmmExpr CmmStackSlot"
+
+hasNoGlobalRegs :: CmmExpr -> Bool
+hasNoGlobalRegs (CmmLoad e _)              = hasNoGlobalRegs e
+hasNoGlobalRegs (CmmMachOp _ es)           = all hasNoGlobalRegs es
+hasNoGlobalRegs (CmmLit _)                 = True
+hasNoGlobalRegs (CmmReg (CmmLocal _))      = True
+hasNoGlobalRegs (CmmRegOff (CmmLocal _) _) = True
+hasNoGlobalRegs _                          = False
+
+isLit :: CmmExpr -> Bool
+isLit (CmmLit _) = True
+isLit _          = False
+
+isComparisonExpr :: CmmExpr -> Bool
+isComparisonExpr (CmmMachOp op _) = isComparisonMachOp op
+isComparisonExpr _                = False
+
+---------------------------------------------------
+--
+--      Tagging
+--
+---------------------------------------------------
+
+-- Tag bits mask
+cmmTagMask, cmmPointerMask :: DynFlags -> CmmExpr
+cmmTagMask dflags = mkIntExpr dflags (tAG_MASK dflags)
+cmmPointerMask dflags = mkIntExpr dflags (complement (tAG_MASK dflags))
+
+-- Used to untag a possibly tagged pointer
+-- A static label need not be untagged
+cmmUntag, cmmIsTagged, cmmConstrTag1 :: DynFlags -> CmmExpr -> CmmExpr
+cmmUntag _ e@(CmmLit (CmmLabel _)) = e
+-- Default case
+cmmUntag dflags e = cmmAndWord dflags e (cmmPointerMask dflags)
+
+-- Test if a closure pointer is tagged (i.e. has non-zero tag bits)
+cmmIsTagged dflags e = cmmNeWord dflags (cmmAndWord dflags e (cmmTagMask dflags)) (zeroExpr dflags)
+
+-- Get the constructor tag, but one-based.
+cmmConstrTag1 dflags e = cmmAndWord dflags e (cmmTagMask dflags)
+
+
+-----------------------------------------------------------------------------
+-- Overlap and usage
+
+-- | Returns True if the two STG registers overlap on the specified
+-- platform, in the sense that writing to one will clobber the
+-- other. This includes the case that the two registers are the same
+-- STG register. See Note [Overlapping global registers] for details.
+regsOverlap :: DynFlags -> CmmReg -> CmmReg -> Bool
+regsOverlap dflags (CmmGlobal g) (CmmGlobal g')
+  | Just real  <- globalRegMaybe (targetPlatform dflags) g,
+    Just real' <- globalRegMaybe (targetPlatform dflags) g',
+    real == real'
+  = True
+regsOverlap _ reg reg' = reg == reg'
+
+-- | Returns True if the STG register is used by the expression, in
+-- the sense that a store to the register might affect the value of
+-- the expression.
+--
+-- We must check for overlapping registers and not just equal
+-- registers here, otherwise CmmSink may incorrectly reorder
+-- assignments that conflict due to overlap. See #10521 and Note
+-- [Overlapping global registers].
+regUsedIn :: DynFlags -> CmmReg -> CmmExpr -> Bool +regUsedIn dflags = regUsedIn_ where + _ `regUsedIn_` CmmLit _ = False + reg `regUsedIn_` CmmLoad e _ = reg `regUsedIn_` e + reg `regUsedIn_` CmmReg reg' = regsOverlap dflags reg reg' + reg `regUsedIn_` CmmRegOff reg' _ = regsOverlap dflags reg reg' + reg `regUsedIn_` CmmMachOp _ es = any (reg `regUsedIn_`) es + _ `regUsedIn_` CmmStackSlot _ _ = False + +-------------------------------------------- +-- +-- mkLiveness +-- +--------------------------------------------- + +mkLiveness :: DynFlags -> [LocalReg] -> Liveness +mkLiveness _ [] = [] +mkLiveness dflags (reg:regs) + = bits ++ mkLiveness dflags regs + where + sizeW = (widthInBytes (typeWidth (localRegType reg)) + wORD_SIZE dflags - 1) + `quot` wORD_SIZE dflags + -- number of words, rounded up + bits = replicate sizeW is_non_ptr -- True <=> Non Ptr + + is_non_ptr = not $ isGcPtrType (localRegType reg) + + +-- ============================================== - +-- ============================================== - +-- ============================================== - + +--------------------------------------------------- +-- +-- Manipulating CmmGraphs +-- +--------------------------------------------------- + +modifyGraph :: (Graph n C C -> Graph n' C C) -> GenCmmGraph n -> GenCmmGraph n' +modifyGraph f g = CmmGraph {g_entry=g_entry g, g_graph=f (g_graph g)} + +toBlockMap :: CmmGraph -> LabelMap CmmBlock +toBlockMap (CmmGraph {g_graph=GMany NothingO body NothingO}) = body + +ofBlockMap :: BlockId -> LabelMap CmmBlock -> CmmGraph +ofBlockMap entry bodyMap = CmmGraph {g_entry=entry, g_graph=GMany NothingO bodyMap NothingO} + +toBlockList :: CmmGraph -> [CmmBlock] +toBlockList g = mapElems $ toBlockMap g + +-- | like 'toBlockList', but the entry block always comes first +toBlockListEntryFirst :: CmmGraph -> [CmmBlock] +toBlockListEntryFirst g + | mapNull m = [] + | otherwise = entry_block : others + where + m = toBlockMap g + entry_id = g_entry g + Just entry_block = mapLookup entry_id m + others = filter ((/= entry_id) . entryLabel) (mapElems m) + +-- | Like 'toBlockListEntryFirst', but we strive to ensure that we order blocks +-- so that the false case of a conditional jumps to the next block in the output +-- list of blocks. This matches the way OldCmm blocks were output since in +-- OldCmm the false case was a fallthrough, whereas in Cmm conditional branches +-- have both true and false successors. Block ordering can make a big difference +-- in performance in the LLVM backend. Note that we rely crucially on the order +-- of successors returned for CmmCondBranch by the NonLocal instance for CmmNode +-- defined in cmm/CmmNode.hs. 
-GBM +toBlockListEntryFirstFalseFallthrough :: CmmGraph -> [CmmBlock] +toBlockListEntryFirstFalseFallthrough g + | mapNull m = [] + | otherwise = dfs setEmpty [entry_block] + where + m = toBlockMap g + entry_id = g_entry g + Just entry_block = mapLookup entry_id m + + dfs :: LabelSet -> [CmmBlock] -> [CmmBlock] + dfs _ [] = [] + dfs visited (block:bs) + | id `setMember` visited = dfs visited bs + | otherwise = block : dfs (setInsert id visited) bs' + where id = entryLabel block + bs' = foldr add_id bs (successors block) + add_id id bs = case mapLookup id m of + Just b -> b : bs + Nothing -> bs + +ofBlockList :: BlockId -> [CmmBlock] -> CmmGraph +ofBlockList entry blocks = CmmGraph { g_entry = entry + , g_graph = GMany NothingO body NothingO } + where body = foldr addBlock emptyBody blocks + +bodyToBlockList :: Body CmmNode -> [CmmBlock] +bodyToBlockList body = mapElems body + +mapGraphNodes :: ( CmmNode C O -> CmmNode C O + , CmmNode O O -> CmmNode O O + , CmmNode O C -> CmmNode O C) + -> CmmGraph -> CmmGraph +mapGraphNodes funs@(mf,_,_) g = + ofBlockMap (entryLabel $ mf $ CmmEntry (g_entry g) GlobalScope) $ + mapMap (mapBlock3' funs) $ toBlockMap g + +mapGraphNodes1 :: (forall e x. CmmNode e x -> CmmNode e x) -> CmmGraph -> CmmGraph +mapGraphNodes1 f = modifyGraph (mapGraph f) + + +foldlGraphBlocks :: (a -> CmmBlock -> a) -> a -> CmmGraph -> a +foldlGraphBlocks k z g = mapFoldl k z $ toBlockMap g + +revPostorder :: CmmGraph -> [CmmBlock] +revPostorder g = {-# SCC "revPostorder" #-} + revPostorderFrom (toBlockMap g) (g_entry g) + +------------------------------------------------- +-- Tick utilities + +-- | Extract all tick annotations from the given block +blockTicks :: Block CmmNode C C -> [CmmTickish] +blockTicks b = reverse $ foldBlockNodesF goStmt b [] + where goStmt :: CmmNode e x -> [CmmTickish] -> [CmmTickish] + goStmt (CmmTick t) ts = t:ts + goStmt _other ts = ts + + +-- ----------------------------------------------------------------------------- +-- Access to common global registers + +baseExpr, spExpr, hpExpr, currentTSOExpr, currentNurseryExpr, + spLimExpr, hpLimExpr, cccsExpr :: CmmExpr +baseExpr = CmmReg baseReg +spExpr = CmmReg spReg +spLimExpr = CmmReg spLimReg +hpExpr = CmmReg hpReg +hpLimExpr = CmmReg hpLimReg +currentTSOExpr = CmmReg currentTSOReg +currentNurseryExpr = CmmReg currentNurseryReg +cccsExpr = CmmReg cccsReg diff --git a/compiler/GHC/Cmm/cmm-notes b/compiler/GHC/Cmm/cmm-notes new file mode 100644 index 0000000000..d664a195b7 --- /dev/null +++ b/compiler/GHC/Cmm/cmm-notes @@ -0,0 +1,184 @@ +More notes (Aug 11) +~~~~~~~~~~~~~~~~~~ +* CmmInfo.cmmToRawCmm expands info tables to their representations + (needed for .cmm files as well as the code generators) + +* Why is FCode a lazy monad? That makes it inefficient. + We want laziness to get code out one procedure at a time, + but not at the instruction level. + UPDATE (31/5/2016): FCode is strict since 09afcc9b. + +Things we did + * Remove CmmCvt.graphToZgraph (Conversion from old to new Cmm reps) + * Remove HscMain.optionallyConvertAndOrCPS (converted old Cmm to + new, ran pipeline, and converted back) + * Remove CmmDecl. Put its types in Cmm. Import Cmm into OldCmm + so it can get those types. + + +More notes (June 11) +~~~~~~~~~~~~~~~~~~~~ + +* In CmmContFlowOpt.branchChainElim, can a single block be the + successor of two calls? 
+ +* Check in ClosureInfo: + -- NB: Results here should line up with the results of SMRep.rtsClosureType + +More notes (May 11) +~~~~~~~~~~~~~~~~~~~ +In CmmNode, consider splitting CmmCall into two: call and jump + +Notes on new codegen (Aug 10) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Things to do: + - Proc points pass all arguments on the stack, adding more code and + slowing down things a lot. We either need to fix this or even better + would be to get rid of proc points. + + - Sort out Label, LabelMap, LabelSet versus BlockId, BlockEnv, BlockSet + dichotomy. Mostly this means global replace, but we also need to make + Label an instance of Outputable (probably in the Outputable module). + + EZY: We should use Label, since that's the terminology Hoopl uses. + + - AsmCodeGen has a generic Cmm optimiser; move this into new pipeline + EZY (2011-04-16): The mini-inliner has been generalized and ported, + but the constant folding and other optimizations need to still be + ported. + + - AsmCodeGen has post-native-cg branch eliminator (shortCutBranches); + we ultimately want to share this with the Cmm branch eliminator. + + - At the moment, references to global registers like Hp are "lowered" + late (in CgUtils.fixStgRegisters). We should do this early, in the + new native codegen, much in the way that we lower calling conventions. + Might need to be a bit sophisticated about aliasing. + + - Move to new Cmm rep: + * Make native CG consume New Cmm; + * Convert Old Cmm->New Cmm to keep old path alive + * Produce New Cmm when reading in .cmm files + + - Top-level SRT threading is a bit ugly + + - See "CAFs" below; we want to totally refactor the way SRTs are calculated + + - Garbage-collect https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/cps + moving good stuff into + https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/new-code-gen-pipeline + + - Currently AsmCodeGen top level calls AsmCodeGen.cmmToCmm, which is a small + C-- optimiser. It has quite a lot of boilerplate folding code in AsmCodeGen + (cmmBlockConFold, cmmStmtConFold, cmmExprConFold), before calling out to + CmmOpt. ToDo: see what optimisations are being done; and do them before + AsmCodeGen. + + - If we stick CAF and stack liveness info on a LastCall node (not LastRet/Jump) + then all CAF and stack liveness stuff be completed before we split + into separate C procedures. + + Short term: + compute and attach liveness into LastCall + right at end, split, cvt to old rep + [must split before cvt, because old rep is not expressive enough] + + Longer term: + when old rep disappears, + move the whole splitting game into the C back end *only* + (guided by the procpoint set) + +---------------------------------------------------- + Proc-points +---------------------------------------------------- + +Consider this program, which has a diamond control flow, +with a call on one branch + fn(p,x) { + h() + if b then { ... f(x) ...; q=5; goto J } + else { ...; q=7; goto J } + J: ..p...q... + } +then the join point J is a "proc-point". So, is 'p' passed to J +as a parameter? Or, if 'p' was saved on the stack anyway, perhaps +to keep it alive across the call to h(), maybe 'p' gets communicated +to J that way. This is an awkward choice. (We think that we currently +never pass variables to join points via arguments.) + +Furthermore, there is *no way* to pass q to J in a register (other +than a parameter register). + +What we want is to do register allocation across the whole caboodle. 
+Then we could drop all the code that deals with the above awkward
+decisions about spilling variables across proc-points.
+
+Note that J doesn't need an info table.
+
+What we really want is for each LastCall (not LastJump/Ret)
+to have an info table.   Note that ProcPoints that are not successors
+of calls don't need an info table.
+
+Figuring out proc-points
+~~~~~~~~~~~~~~~~~~~~~~~~
+Proc-points are identified by
+GHC.Cmm.ProcPoint.minimalProcPointSet/extendPPSet. Although there isn't
+that much code, JD thinks that it could be done much more nicely using
+a dominator analysis, using the Dataflow Engine.
+
+----------------------------------------------------
+        CAFs
+----------------------------------------------------
+
+* The code for a procedure f may refer to either the *closure*
+  or the *entry point* of another top-level procedure g.
+  If f is live, then so is g. f's SRT must include g's closure.
+
+* The CLabel for the entry-point/closure reveals whether g is
+  a CAF (or refers to CAFs).  See the IdLabel constructor of CLabel.
+
+* The CAF-ness of the original top-level definitions is figured out
+  (by GHC.Iface.Tidy) before we generate C--. This CafInfo is only set for
+  top-level Ids; nested bindings stay with MayHaveCafRefs.
+
+* Currently an SRT contains (only) pointers to (top-level) closures.
+
+* Consider this Core code
+        f = \x -> let g = \y -> ...x...y...h1...
+                  in ...h2...g...
+  and suppose that h1, h2 have IdInfo of MayHaveCafRefs.
+  Therefore, so will f, but g will not (since it's nested).
+
+  This generates C-- roughly like this:
+     f_closure: .word f_entry
+     f_entry() [info-tbl-for-f] { ...jump g_entry...jump h2... }
+     g_entry() [info-tbl-for-g] { ...jump h1... }
+
+  Note that there is no top-level closure for g (only an info table).
+  This fact (whether or not there is a top-level closure) is recorded
+  in the InfoTable attached to the CmmProc for f, g.
+  INVARIANT:
+     Any out-of-Group references to an IdLabel go to
+     a Proc whose InfoTable says "I have a top-level closure".
+     Equivalently:
+     A CmmProc whose InfoTable says "I do not have a top-level
+     closure" is referred to only from its own Group.
+
+* So: info-tbl-for-f must have an SRT that keeps h1,h2 alive
+      info-tbl-for-g must have an SRT that keeps h1 (only) alive
+
+  But if we just look for the free CAF refs, we get:
+        f   h2 (only)
+        g   h1
+
+  So we need to do a transitive closure thing to flesh out
+  f's keep-alive refs to include h1.
+
+* The SRT info is the C_SRT field of Cmm.ClosureTypeInfo in a
+  CmmInfoTable attached to each CmmProc.  CmmPipeline.toTops actually does
+  the attaching, right at the end of the pipeline.  The C_SRT part
+  gives offsets within a single, shared table of closure pointers.
+
+* DECIDED: we can generate SRTs based on the final Cmm program
+  without knowledge of how it is generated.
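+
+* A minimal sketch of that transitive-closure step (hypothetical types
+  and names; the real SRT pass tracks considerably more):
+
+    import qualified Data.Map as Map
+    import qualified Data.Set as Set
+
+    type CafEnv lbl = Map.Map lbl (Set.Set lbl)
+
+    -- Iterate "add the refs of my refs" until a fixed point is reached.
+    transitiveCafRefs :: Ord lbl => CafEnv lbl -> CafEnv lbl
+    transitiveCafRefs env
+      | env' == env = env
+      | otherwise   = transitiveCafRefs env'
+      where
+        env' = Map.map expand env
+        expand refs =
+          Set.unions (refs : [ Map.findWithDefault Set.empty r env
+                             | r <- Set.toList refs ])
+
+  Starting from f |-> {g,h2} and g |-> {h1}, the fixed point gives
+  f |-> {g,h1,h2}, so f's SRT keeps h1 alive as well.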