author    Sylvain Henry <sylvain@haskus.fr>  2020-02-22 15:05:20 +0100
committer Marge Bot <ben+marge-bot@smart-cactus.org>  2020-02-24 20:55:25 -0500
commit    1b1067d14b656bbbfa7c47f156ec2700c9751549 (patch)
tree      32346e3c4c3f89117190b36364144d85dc260e05 /compiler/GHC
parent    354e2787be08fb6d973de1a39e58080ff8e107f8 (diff)
Modules: CmmToAsm (#13009)
Diffstat (limited to 'compiler/GHC')
-rw-r--r-- compiler/GHC/CmmToAsm.hs | 1236
-rw-r--r-- compiler/GHC/CmmToAsm/BlockLayout.hs | 895
-rw-r--r-- compiler/GHC/CmmToAsm/CFG.hs | 1320
-rw-r--r-- compiler/GHC/CmmToAsm/CFG/Dominators.hs | 597
-rw-r--r-- compiler/GHC/CmmToAsm/CPrim.hs | 133
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf.hs | 269
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf/Constants.hs | 229
-rw-r--r-- compiler/GHC/CmmToAsm/Dwarf/Types.hs | 612
-rw-r--r-- compiler/GHC/CmmToAsm/Format.hs | 105
-rw-r--r-- compiler/GHC/CmmToAsm/Instr.hs | 202
-rw-r--r-- compiler/GHC/CmmToAsm/Monad.hs | 294
-rw-r--r-- compiler/GHC/CmmToAsm/PIC.hs | 837
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/CodeGen.hs | 2455
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Cond.hs | 63
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Instr.hs | 713
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Ppr.hs | 994
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/RegInfo.hs | 80
-rw-r--r-- compiler/GHC/CmmToAsm/PPC/Regs.hs | 333
-rw-r--r-- compiler/GHC/CmmToAsm/Ppr.hs | 275
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph.hs | 472
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Base.hs | 163
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs | 99
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs | 382
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs | 616
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs | 317
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs | 346
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs | 274
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Graph/X86.hs | 161
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear.hs | 920
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/Base.hs | 141
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs | 89
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs | 378
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs | 60
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs | 188
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs | 61
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/State.hs | 184
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs | 87
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/X86.hs | 52
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs | 53
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Liveness.hs | 1025
-rw-r--r-- compiler/GHC/CmmToAsm/Reg/Target.hs | 135
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/AddrMode.hs | 44
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Base.hs | 77
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen.hs | 700
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs | 74
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs | 119
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs | 110
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs | 156
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs | 692
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot | 16
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs | 216
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs | 69
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Cond.hs | 54
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Imm.hs | 67
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Instr.hs | 481
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Ppr.hs | 645
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Regs.hs | 259
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs | 74
-rw-r--r-- compiler/GHC/CmmToAsm/SPARC/Stack.hs | 59
-rw-r--r-- compiler/GHC/CmmToAsm/X86/CodeGen.hs | 3747
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Cond.hs | 109
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Instr.hs | 1056
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Ppr.hs | 1014
-rw-r--r-- compiler/GHC/CmmToAsm/X86/RegInfo.hs | 73
-rw-r--r-- compiler/GHC/CmmToAsm/X86/Regs.hs | 442
-rw-r--r-- compiler/GHC/CmmToC.hs | 2
-rw-r--r-- compiler/GHC/Driver/CodeOutput.hs | 2
-rw-r--r-- compiler/GHC/Llvm/Types.hs | 2
-rw-r--r-- compiler/GHC/Platform/Reg.hs | 241
-rw-r--r-- compiler/GHC/Platform/Reg/Class.hs | 32
-rw-r--r-- compiler/GHC/Platform/Regs.hs | 2
71 files changed, 28475 insertions, 4 deletions
diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs
new file mode 100644
index 0000000000..8dc9b61198
--- /dev/null
+++ b/compiler/GHC/CmmToAsm.hs
@@ -0,0 +1,1236 @@
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-- This is the top-level module in the native code generator.
+--
+-- -----------------------------------------------------------------------------
+
+{-# LANGUAGE BangPatterns, CPP, GADTs, ScopedTypeVariables, PatternSynonyms,
+ DeriveFunctor #-}
+
+#if !defined(GHC_LOADED_INTO_GHCI)
+{-# LANGUAGE UnboxedTuples #-}
+#endif
+
+{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}
+
+module GHC.CmmToAsm (
+ -- * Module entry point
+ nativeCodeGen
+
+ -- * Test-only exports: see trac #12744
+ -- used by testGraphNoSpills, which needs to access
+ -- the register allocator intermediate data structures
+ -- cmmNativeGen emits
+ , cmmNativeGen
+ , NcgImpl(..)
+ , x86NcgImpl
+ ) where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import qualified GHC.CmmToAsm.X86.CodeGen as X86.CodeGen
+import qualified GHC.CmmToAsm.X86.Regs as X86.Regs
+import qualified GHC.CmmToAsm.X86.Instr as X86.Instr
+import qualified GHC.CmmToAsm.X86.Ppr as X86.Ppr
+
+import qualified GHC.CmmToAsm.SPARC.CodeGen as SPARC.CodeGen
+import qualified GHC.CmmToAsm.SPARC.Regs as SPARC.Regs
+import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
+import qualified GHC.CmmToAsm.SPARC.Ppr as SPARC.Ppr
+import qualified GHC.CmmToAsm.SPARC.ShortcutJump as SPARC.ShortcutJump
+import qualified GHC.CmmToAsm.SPARC.CodeGen.Expand as SPARC.CodeGen.Expand
+
+import qualified GHC.CmmToAsm.PPC.CodeGen as PPC.CodeGen
+import qualified GHC.CmmToAsm.PPC.Regs as PPC.Regs
+import qualified GHC.CmmToAsm.PPC.RegInfo as PPC.RegInfo
+import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr
+import qualified GHC.CmmToAsm.PPC.Ppr as PPC.Ppr
+
+import GHC.CmmToAsm.Reg.Liveness
+import qualified GHC.CmmToAsm.Reg.Linear as Linear
+
+import qualified GraphColor as Color
+import qualified GHC.CmmToAsm.Reg.Graph as Color
+import qualified GHC.CmmToAsm.Reg.Graph.Stats as Color
+import qualified GHC.CmmToAsm.Reg.Graph.TrivColorable as Color
+
+import AsmUtils
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform
+import GHC.CmmToAsm.BlockLayout as BlockLayout
+import Config
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.CFG
+import GHC.CmmToAsm.Dwarf
+import GHC.Cmm.DebugBlock
+
+import GHC.Cmm.BlockId
+import GHC.StgToCmm.CgUtils ( fixStgRegisters )
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Opt ( cmmMachOpFold )
+import GHC.Cmm.Ppr
+import GHC.Cmm.CLabel
+
+import UniqFM
+import UniqSupply
+import GHC.Driver.Session
+import Util
+
+import BasicTypes ( Alignment )
+import qualified Pretty
+import BufWrite
+import Outputable
+import FastString
+import UniqSet
+import ErrUtils
+import Module
+import Stream (Stream)
+import qualified Stream
+
+-- DEBUGGING ONLY
+--import OrdList
+
+import Data.List
+import Data.Maybe
+import Data.Ord ( comparing )
+import Control.Exception
+import Control.Monad
+import System.IO
+
+{-
+The native-code generator has machine-independent and
+machine-dependent modules.
+
+This module ("AsmCodeGen") is the top-level machine-independent
+module. Before entering machine-dependent land, we do some
+machine-independent optimisations (defined below) on the
+'CmmStmts's.
+
+We convert to the machine-specific 'Instr' datatype with
+'cmmCodeGen', assuming an infinite supply of registers. We then use
+a machine-independent register allocator ('regAlloc') to rejoin
+reality. Obviously, 'regAlloc' has machine-specific helper
+functions (see about "RegAllocInfo" below).
+
+Finally, we order the basic blocks of the function so as to minimise
+the number of jumps between blocks, by utilising fallthrough wherever
+possible.
+
+The machine-dependent bits break down as follows:
+
+ * ["MachRegs"] Everything about the target platform's machine
+ registers (and immediate operands, and addresses, which tend to
+ intermingle/interact with registers).
+
+ * ["MachInstrs"] Includes the 'Instr' datatype (possibly should
+ have a module of its own), plus a miscellany of other things
+ (e.g., 'targetDoubleSize', 'smStablePtrTable', ...)
+
+ * ["MachCodeGen"] is where 'Cmm' stuff turns into
+ machine instructions.
+
+ * ["PprMach"] 'pprInstr' turns an 'Instr' into text (well, really
+ a 'SDoc').
+
+ * ["RegAllocInfo"] In the register allocator, we manipulate
+ 'MRegsState's, which are 'BitSet's, one bit per machine register.
+ When we want to say something about a specific machine register
+ (e.g., ``it gets clobbered by this instruction''), we set/unset
+ its bit. Obviously, we do this 'BitSet' thing for efficiency
+ reasons.
+
+ The 'RegAllocInfo' module collects together the machine-specific
+ info needed to do register allocation.
+
+ * ["RegisterAlloc"] The (machine-independent) register allocator.
+-}
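+
+-- A rough, illustrative sketch of the per-procedure flow described above (the
+-- real driver is 'cmmNativeGen' below; the bindings here are not actual code):
+--
+-- >  (opt_cmm, imports) = cmmToCmm dflags this_mod cmm   -- generic Cmm-to-Cmm passes
+-- >  native             = genMachCode ... opt_cmm        -- instruction selection, virtual regs
+-- >  alloced            = regAlloc native                -- Linear.regAlloc or Color.regAlloc
+-- >  sequenced          = sequenceTop ... alloced        -- block layout, maximise fallthrough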
+
+--------------------
+nativeCodeGen :: forall a . DynFlags -> Module -> ModLocation -> Handle -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> IO a
+nativeCodeGen dflags this_mod modLoc h us cmms
+ = let platform = targetPlatform dflags
+ nCG' :: ( Outputable statics, Outputable instr
+ , Outputable jumpDest, Instruction instr)
+ => NcgImpl statics instr jumpDest -> IO a
+ nCG' ncgImpl = nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms
+ in case platformArch platform of
+ ArchX86 -> nCG' (x86NcgImpl dflags)
+ ArchX86_64 -> nCG' (x86_64NcgImpl dflags)
+ ArchPPC -> nCG' (ppcNcgImpl dflags)
+ ArchS390X -> panic "nativeCodeGen: No NCG for S390X"
+ ArchSPARC -> nCG' (sparcNcgImpl dflags)
+ ArchSPARC64 -> panic "nativeCodeGen: No NCG for SPARC64"
+ ArchARM {} -> panic "nativeCodeGen: No NCG for ARM"
+ ArchARM64 -> panic "nativeCodeGen: No NCG for ARM64"
+ ArchPPC_64 _ -> nCG' (ppcNcgImpl dflags)
+ ArchAlpha -> panic "nativeCodeGen: No NCG for Alpha"
+ ArchMipseb -> panic "nativeCodeGen: No NCG for mipseb"
+ ArchMipsel -> panic "nativeCodeGen: No NCG for mipsel"
+ ArchUnknown -> panic "nativeCodeGen: No NCG for unknown arch"
+ ArchJavaScript-> panic "nativeCodeGen: No NCG for JavaScript"
+
+x86NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics)
+ X86.Instr.Instr X86.Instr.JumpDest
+x86NcgImpl dflags
+ = (x86_64NcgImpl dflags)
+
+x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, RawCmmStatics)
+ X86.Instr.Instr X86.Instr.JumpDest
+x86_64NcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = X86.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = X86.Instr.getJumpDestBlockId
+ ,canShortcut = X86.Instr.canShortcut
+ ,shortcutStatics = X86.Instr.shortcutStatics
+ ,shortcutJump = X86.Instr.shortcutJump
+ ,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = X86.Instr.maxSpillSlots dflags
+ ,allocatableRegs = X86.Regs.allocatableRegs platform
+ ,ncgAllocMoreStack = X86.Instr.allocMoreStack platform
+ ,ncgExpandTop = id
+ ,ncgMakeFarBranches = const id
+ ,extractUnwindPoints = X86.CodeGen.extractUnwindPoints
+ ,invertCondBranches = X86.CodeGen.invertCondBranches
+ }
+ where platform = targetPlatform dflags
+
+ppcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics PPC.Instr.Instr PPC.RegInfo.JumpDest
+ppcNcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = PPC.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = PPC.RegInfo.getJumpDestBlockId
+ ,canShortcut = PPC.RegInfo.canShortcut
+ ,shortcutStatics = PPC.RegInfo.shortcutStatics
+ ,shortcutJump = PPC.RegInfo.shortcutJump
+ ,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = PPC.Instr.maxSpillSlots dflags
+ ,allocatableRegs = PPC.Regs.allocatableRegs platform
+ ,ncgAllocMoreStack = PPC.Instr.allocMoreStack platform
+ ,ncgExpandTop = id
+ ,ncgMakeFarBranches = PPC.Instr.makeFarBranches
+ ,extractUnwindPoints = const []
+ ,invertCondBranches = \_ _ -> id
+ }
+ where platform = targetPlatform dflags
+
+sparcNcgImpl :: DynFlags -> NcgImpl RawCmmStatics SPARC.Instr.Instr SPARC.ShortcutJump.JumpDest
+sparcNcgImpl dflags
+ = NcgImpl {
+ cmmTopCodeGen = SPARC.CodeGen.cmmTopCodeGen
+ ,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr dflags
+ ,getJumpDestBlockId = SPARC.ShortcutJump.getJumpDestBlockId
+ ,canShortcut = SPARC.ShortcutJump.canShortcut
+ ,shortcutStatics = SPARC.ShortcutJump.shortcutStatics
+ ,shortcutJump = SPARC.ShortcutJump.shortcutJump
+ ,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl
+ ,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags
+ ,allocatableRegs = SPARC.Regs.allocatableRegs
+ ,ncgAllocMoreStack = noAllocMoreStack
+ ,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop
+ ,ncgMakeFarBranches = const id
+ ,extractUnwindPoints = const []
+ ,invertCondBranches = \_ _ -> id
+ }
+
+--
+-- Allocating more stack space for spilling is currently only
+-- supported for the linear register allocator on x86/x86_64; the rest
+-- default to the panic below. To support allocating extra stack on
+-- more platforms, provide a definition of ncgAllocMoreStack.
+--
+noAllocMoreStack :: Int -> NatCmmDecl statics instr
+ -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)])
+noAllocMoreStack amount _
+ = panic $ "Register allocator: out of stack slots (need " ++ show amount ++ ")\n"
+ ++ " If you are trying to compile SHA1.hs from the crypto library then this\n"
+ ++ " is a known limitation in the linear allocator.\n"
+ ++ "\n"
+ ++ " Try enabling the graph colouring allocator with -fregs-graph instead."
+ ++ " You can still file a bug report if you like.\n"
+
+
+-- | Data accumulated during code generation. Mostly about statistics,
+-- but also collects debug data for DWARF generation.
+data NativeGenAcc statics instr
+ = NGS { ngs_imports :: ![[CLabel]]
+ , ngs_natives :: ![[NatCmmDecl statics instr]]
+ -- ^ Native code generated, for statistics. This might
+ -- hold a lot of data, so it is important to clear this
+ -- field as early as possible if it isn't actually
+ -- required.
+ , ngs_colorStats :: ![[Color.RegAllocStats statics instr]]
+ , ngs_linearStats :: ![[Linear.RegAllocStats]]
+ , ngs_labels :: ![Label]
+ , ngs_debug :: ![DebugBlock]
+ , ngs_dwarfFiles :: !DwarfFiles
+ , ngs_unwinds :: !(LabelMap [UnwindPoint])
+ -- ^ see Note [Unwinding information in the NCG]
+ -- and Note [What is this unwinding business?] in Debug.
+ }
+
+{-
+Note [Unwinding information in the NCG]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unwind information is a type of metadata which allows a debugging tool
+to reconstruct the values of machine registers at the time a procedure was
+entered. For the most part, the production of unwind information is handled by
+the Cmm stage, where it is represented by CmmUnwind nodes.
+
+Unfortunately, the Cmm stage doesn't know everything necessary to produce
+accurate unwinding information. For instance, the x86-64 calling convention
+requires that the stack pointer be aligned to 16 bytes, which in turn means that
+GHC must sometimes add padding to $sp prior to performing a foreign call. When
+this happens unwind information must be updated accordingly.
+For this reason, we make the NCG backends responsible for producing
+unwinding tables (with the extractUnwindPoints function in NcgImpl).
+
+We accumulate the produced unwind tables over CmmGroups in the ngs_unwinds
+field of NativeGenAcc. This is a label map which contains an entry for each
+procedure, containing a list of unwinding points (e.g. a label and an associated
+unwinding table).
+
+See also Note [What is this unwinding business?] in Debug.
+-}
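+
+-- A minimal, illustrative sketch (not part of the pipeline): each generated
+-- chunk's unwind table is merged into the accumulator with 'mapUnion', which
+-- is what 'cmmNativeGens' does below.
+--
+-- > recordUnwinds :: LabelMap [UnwindPoint] -> NativeGenAcc statics instr
+-- >               -> NativeGenAcc statics instr
+-- > recordUnwinds new ngs = ngs { ngs_unwinds = ngs_unwinds ngs `mapUnion` new }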
+
+nativeCodeGen' :: (Outputable statics, Outputable instr,Outputable jumpDest,
+ Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> Handle
+ -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> IO a
+nativeCodeGen' dflags this_mod modLoc ncgImpl h us cmms
+ = do
+ -- BufHandle is a performance hack. We could hide it inside
+ -- Pretty if it weren't for the fact that we do lots of little
+ -- printDocs here (in order to do codegen in constant space).
+ bufh <- newBufHandle h
+ let ngs0 = NGS [] [] [] [] [] [] emptyUFM mapEmpty
+ (ngs, us', a) <- cmmNativeGenStream dflags this_mod modLoc ncgImpl bufh us
+ cmms ngs0
+ _ <- finishNativeGen dflags modLoc bufh us' ngs
+ return a
+
+finishNativeGen :: Instruction instr
+ => DynFlags
+ -> ModLocation
+ -> BufHandle
+ -> UniqSupply
+ -> NativeGenAcc statics instr
+ -> IO UniqSupply
+finishNativeGen dflags modLoc bufh@(BufHandle _ _ h) us ngs
+ = withTimingSilent dflags (text "NCG") (`seq` ()) $ do
+ -- Write debug data and finish
+ let emitDw = debugLevel dflags > 0
+ us' <- if not emitDw then return us else do
+ (dwarf, us') <- dwarfGen dflags modLoc us (ngs_debug ngs)
+ emitNativeCode dflags bufh dwarf
+ return us'
+ bFlush bufh
+
+ -- dump global NCG stats for graph coloring allocator
+ let stats = concat (ngs_colorStats ngs)
+ unless (null stats) $ do
+
+ -- build the global register conflict graph
+ let graphGlobal
+ = foldl' Color.union Color.initGraph
+ $ [ Color.raGraph stat
+ | stat@Color.RegAllocStatsStart{} <- stats]
+
+ dump_stats (Color.pprStats stats graphGlobal)
+
+ let platform = targetPlatform dflags
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_conflicts "Register conflict graph"
+ FormatText
+ $ Color.dotGraph
+ (targetRegDotColor platform)
+ (Color.trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ $ graphGlobal
+
+
+ -- dump global NCG stats for linear allocator
+ let linearStats = concat (ngs_linearStats ngs)
+ unless (null linearStats) $
+ dump_stats (Linear.pprStats (concat (ngs_natives ngs)) linearStats)
+
+ -- write out the imports
+ printSDocLn Pretty.LeftMode dflags h (mkCodeStyle AsmStyle)
+ $ makeImportsDoc dflags (concat (ngs_imports ngs))
+ return us'
+ where
+ dump_stats = dumpAction dflags (mkDumpStyle dflags alwaysQualify)
+ (dumpOptionsFromFlag Opt_D_dump_asm_stats) "NCG stats"
+ FormatText
+
+cmmNativeGenStream :: (Outputable statics, Outputable instr
+ ,Outputable jumpDest, Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> BufHandle
+ -> UniqSupply
+ -> Stream IO RawCmmGroup a
+ -> NativeGenAcc statics instr
+ -> IO (NativeGenAcc statics instr, UniqSupply, a)
+
+cmmNativeGenStream dflags this_mod modLoc ncgImpl h us cmm_stream ngs
+ = do r <- Stream.runStream cmm_stream
+ case r of
+ Left a ->
+ return (ngs { ngs_imports = reverse $ ngs_imports ngs
+ , ngs_natives = reverse $ ngs_natives ngs
+ , ngs_colorStats = reverse $ ngs_colorStats ngs
+ , ngs_linearStats = reverse $ ngs_linearStats ngs
+ },
+ us,
+ a)
+ Right (cmms, cmm_stream') -> do
+ (us', ngs'') <-
+ withTimingSilent
+ dflags
+ ncglabel (\(a, b) -> a `seq` b `seq` ()) $ do
+ -- Generate debug information
+ let debugFlag = debugLevel dflags > 0
+ !ndbgs | debugFlag = cmmDebugGen modLoc cmms
+ | otherwise = []
+ dbgMap = debugToMap ndbgs
+
+ -- Generate native code
+ (ngs',us') <- cmmNativeGens dflags this_mod modLoc ncgImpl h
+ dbgMap us cmms ngs 0
+
+ -- Link native code information into debug blocks
+ -- See Note [What is this unwinding business?] in Debug.
+ let !ldbgs = cmmDebugLink (ngs_labels ngs') (ngs_unwinds ngs') ndbgs
+ unless (null ldbgs) $
+ dumpIfSet_dyn dflags Opt_D_dump_debug "Debug Infos" FormatText
+ (vcat $ map ppr ldbgs)
+
+ -- Accumulate debug information for emission in finishNativeGen.
+ let ngs'' = ngs' { ngs_debug = ngs_debug ngs' ++ ldbgs, ngs_labels = [] }
+ return (us', ngs'')
+
+ cmmNativeGenStream dflags this_mod modLoc ncgImpl h us'
+ cmm_stream' ngs''
+
+ where ncglabel = text "NCG"
+
+-- | Do native code generation on all these cmms.
+--
+cmmNativeGens :: forall statics instr jumpDest.
+ (Outputable statics, Outputable instr
+ ,Outputable jumpDest, Instruction instr)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> BufHandle
+ -> LabelMap DebugBlock
+ -> UniqSupply
+ -> [RawCmmDecl]
+ -> NativeGenAcc statics instr
+ -> Int
+ -> IO (NativeGenAcc statics instr, UniqSupply)
+
+cmmNativeGens dflags this_mod modLoc ncgImpl h dbgMap = go
+ where
+ go :: UniqSupply -> [RawCmmDecl]
+ -> NativeGenAcc statics instr -> Int
+ -> IO (NativeGenAcc statics instr, UniqSupply)
+
+ go us [] ngs !_ =
+ return (ngs, us)
+
+ go us (cmm : cmms) ngs count = do
+ let fileIds = ngs_dwarfFiles ngs
+ (us', fileIds', native, imports, colorStats, linearStats, unwinds)
+ <- {-# SCC "cmmNativeGen" #-}
+ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap
+ cmm count
+
+ -- Generate .file directives for every new file that has been
+ -- used. Note that it is important that we generate these in
+          -- ascending order, as Clang 3.6's assembler complains otherwise.
+ let newFileIds = sortBy (comparing snd) $
+ nonDetEltsUFM $ fileIds' `minusUFM` fileIds
+ -- See Note [Unique Determinism and code generation]
+ pprDecl (f,n) = text "\t.file " <> ppr n <+>
+ pprFilePathString (unpackFS f)
+
+ emitNativeCode dflags h $ vcat $
+ map pprDecl newFileIds ++
+ map (pprNatCmmDecl ncgImpl) native
+
+          -- force evaluation of all this stuff to avoid space leaks
+ {-# SCC "seqString" #-} evaluate $ seqList (showSDoc dflags $ vcat $ map ppr imports) ()
+
+ let !labels' = if debugLevel dflags > 0
+ then cmmDebugLabels isMetaInstr native else []
+ !natives' = if dopt Opt_D_dump_asm_stats dflags
+ then native : ngs_natives ngs else []
+
+ mCon = maybe id (:)
+ ngs' = ngs{ ngs_imports = imports : ngs_imports ngs
+ , ngs_natives = natives'
+ , ngs_colorStats = colorStats `mCon` ngs_colorStats ngs
+ , ngs_linearStats = linearStats `mCon` ngs_linearStats ngs
+ , ngs_labels = ngs_labels ngs ++ labels'
+ , ngs_dwarfFiles = fileIds'
+ , ngs_unwinds = ngs_unwinds ngs `mapUnion` unwinds
+ }
+ go us' cmms ngs' (count + 1)
+
+
+emitNativeCode :: DynFlags -> BufHandle -> SDoc -> IO ()
+emitNativeCode dflags h sdoc = do
+
+ {-# SCC "pprNativeCode" #-} bufLeftRenderSDoc dflags h
+ (mkCodeStyle AsmStyle) sdoc
+
+ -- dump native code
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm "Asm code" FormatASM
+ sdoc
+
+-- | Complete native code generation phase for a single top-level chunk of Cmm,
+-- dumping the output of each stage along the way.
+-- Also collects data for the global conflict graph and NCG stats.
+cmmNativeGen
+ :: forall statics instr jumpDest. (Instruction instr,
+ Outputable statics, Outputable instr, Outputable jumpDest)
+ => DynFlags
+ -> Module -> ModLocation
+ -> NcgImpl statics instr jumpDest
+ -> UniqSupply
+ -> DwarfFiles
+ -> LabelMap DebugBlock
+ -> RawCmmDecl -- ^ the cmm to generate code for
+ -> Int -- ^ sequence number of this top thing
+ -> IO ( UniqSupply
+ , DwarfFiles
+ , [NatCmmDecl statics instr] -- native code
+ , [CLabel] -- things imported by this cmm
+ , Maybe [Color.RegAllocStats statics instr] -- stats for the coloring register allocator
+ , Maybe [Linear.RegAllocStats] -- stats for the linear register allocators
+ , LabelMap [UnwindPoint] -- unwinding information for blocks
+ )
+
+cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
+ = do
+ let platform = targetPlatform dflags
+
+ let proc_name = case cmm of
+ (CmmProc _ entry_label _ _) -> ppr entry_label
+ _ -> text "DataChunk"
+
+ -- rewrite assignments to global regs
+ let fixed_cmm =
+ {-# SCC "fixStgRegisters" #-}
+ fixStgRegisters dflags cmm
+
+ -- cmm to cmm optimisations
+ let (opt_cmm, imports) =
+ {-# SCC "cmmToCmm" #-}
+ cmmToCmm dflags this_mod fixed_cmm
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_opt_cmm "Optimised Cmm" FormatCMM
+ (pprCmmGroup [opt_cmm])
+
+ let cmmCfg = {-# SCC "getCFG" #-}
+ getCfgProc (cfgWeightInfo dflags) opt_cmm
+
+ -- generate native code from cmm
+ let ((native, lastMinuteImports, fileIds', nativeCfgWeights), usGen) =
+ {-# SCC "genMachCode" #-}
+ initUs us $ genMachCode dflags this_mod modLoc
+ (cmmTopCodeGen ncgImpl)
+ fileIds dbgMap opt_cmm cmmCfg
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_native "Native code" FormatASM
+ (vcat $ map (pprNatCmmDecl ncgImpl) native)
+
+ maybeDumpCfg dflags (Just nativeCfgWeights) "CFG Weights - Native" proc_name
+
+ -- tag instructions with register liveness information
+ -- also drops dead code. We don't keep the cfg in sync on
+ -- some backends, so don't use it there.
+ let livenessCfg = if (backendMaintainsCfg dflags)
+ then Just nativeCfgWeights
+ else Nothing
+ let (withLiveness, usLive) =
+ {-# SCC "regLiveness" #-}
+ initUs usGen
+ $ mapM (cmmTopLiveness livenessCfg platform) native
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_liveness "Liveness annotations added"
+ FormatCMM
+ (vcat $ map ppr withLiveness)
+
+ -- allocate registers
+ (alloced, usAlloc, ppr_raStatsColor, ppr_raStatsLinear, raStats, stack_updt_blks) <-
+ if ( gopt Opt_RegsGraph dflags
+ || gopt Opt_RegsIterative dflags )
+ then do
+ -- the regs usable for allocation
+ let (alloc_regs :: UniqFM (UniqSet RealReg))
+ = foldr (\r -> plusUFM_C unionUniqSets
+ $ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
+ emptyUFM
+ $ allocatableRegs ncgImpl
+
+ -- do the graph coloring register allocation
+ let ((alloced, maybe_more_stack, regAllocStats), usAlloc)
+ = {-# SCC "RegAlloc-color" #-}
+ initUs usLive
+ $ Color.regAlloc
+ dflags
+ alloc_regs
+ (mkUniqSet [0 .. maxSpillSlots ncgImpl])
+ (maxSpillSlots ncgImpl)
+ withLiveness
+ livenessCfg
+
+ let ((alloced', stack_updt_blks), usAlloc')
+ = initUs usAlloc $
+ case maybe_more_stack of
+ Nothing -> return (alloced, [])
+ Just amount -> do
+ (alloced',stack_updt_blks) <- unzip <$>
+ (mapM ((ncgAllocMoreStack ncgImpl) amount) alloced)
+ return (alloced', concat stack_updt_blks )
+
+
+ -- dump out what happened during register allocation
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc "Registers allocated"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) alloced)
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc_stages "Build/spill stages"
+ FormatText
+ (vcat $ map (\(stage, stats)
+ -> text "# --------------------------"
+ $$ text "# cmm " <> int count <> text " Stage " <> int stage
+ $$ ppr stats)
+ $ zip [0..] regAllocStats)
+
+ let mPprStats =
+ if dopt Opt_D_dump_asm_stats dflags
+ then Just regAllocStats else Nothing
+
+ -- force evaluation of the Maybe to avoid space leak
+ mPprStats `seq` return ()
+
+ return ( alloced', usAlloc'
+ , mPprStats
+ , Nothing
+ , [], stack_updt_blks)
+
+ else do
+ -- do linear register allocation
+ let reg_alloc proc = do
+ (alloced, maybe_more_stack, ra_stats) <-
+ Linear.regAlloc dflags proc
+ case maybe_more_stack of
+ Nothing -> return ( alloced, ra_stats, [] )
+ Just amount -> do
+ (alloced',stack_updt_blks) <-
+ ncgAllocMoreStack ncgImpl amount alloced
+ return (alloced', ra_stats, stack_updt_blks )
+
+ let ((alloced, regAllocStats, stack_updt_blks), usAlloc)
+ = {-# SCC "RegAlloc-linear" #-}
+ initUs usLive
+ $ liftM unzip3
+ $ mapM reg_alloc withLiveness
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_regalloc "Registers allocated"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) alloced)
+
+ let mPprStats =
+ if dopt Opt_D_dump_asm_stats dflags
+ then Just (catMaybes regAllocStats) else Nothing
+
+ -- force evaluation of the Maybe to avoid space leak
+ mPprStats `seq` return ()
+
+ return ( alloced, usAlloc
+ , Nothing
+ , mPprStats, (catMaybes regAllocStats)
+ , concat stack_updt_blks )
+
+        -- Fixup blocks inserted by the register allocator (from, regMoves, to)
+ let cfgRegAllocUpdates :: [(BlockId,BlockId,BlockId)]
+ cfgRegAllocUpdates = (concatMap Linear.ra_fixupList raStats)
+
+ let cfgWithFixupBlks =
+ (\cfg -> addNodesBetween cfg cfgRegAllocUpdates) <$> livenessCfg
+
+ -- Insert stack update blocks
+ let postRegCFG =
+ pure (foldl' (\m (from,to) -> addImmediateSuccessor from to m ))
+ <*> cfgWithFixupBlks
+ <*> pure stack_updt_blks
+
+ ---- generate jump tables
+ let tabled =
+ {-# SCC "generateJumpTables" #-}
+ generateJumpTables ncgImpl alloced
+
+ when (not $ null nativeCfgWeights) $ dumpIfSet_dyn dflags
+ Opt_D_dump_cfg_weights "CFG Update information"
+ FormatText
+ ( text "stack:" <+> ppr stack_updt_blks $$
+ text "linearAlloc:" <+> ppr cfgRegAllocUpdates )
+
+ ---- shortcut branches
+ let (shorted, postShortCFG) =
+ {-# SCC "shortcutBranches" #-}
+ shortcutBranches dflags ncgImpl tabled postRegCFG
+
+ let optimizedCFG :: Maybe CFG
+ optimizedCFG =
+ optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG
+
+ maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name
+
+ --TODO: Partially check validity of the cfg.
+ let getBlks (CmmProc _info _lbl _live (ListGraph blocks)) = blocks
+ getBlks _ = []
+
+ when ( backendMaintainsCfg dflags &&
+ (gopt Opt_DoAsmLinting dflags || debugIsOn )) $ do
+ let blocks = concatMap getBlks shorted
+ let labels = setFromList $ fmap blockId blocks :: LabelSet
+ let cfg = fromJust optimizedCFG
+ return $! seq (sanityCheckCfg cfg labels $
+ text "cfg not in lockstep") ()
+
+ ---- sequence blocks
+ let sequenced :: [NatCmmDecl statics instr]
+ sequenced =
+ checkLayout shorted $
+ {-# SCC "sequenceBlocks" #-}
+ map (BlockLayout.sequenceTop
+ dflags
+ ncgImpl optimizedCFG)
+ shorted
+
+ let branchOpt :: [NatCmmDecl statics instr]
+ branchOpt =
+ {-# SCC "invertCondBranches" #-}
+ map invert sequenced
+ where
+ invertConds :: LabelMap RawCmmStatics -> [NatBasicBlock instr]
+ -> [NatBasicBlock instr]
+ invertConds = invertCondBranches ncgImpl optimizedCFG
+ invert top@CmmData {} = top
+ invert (CmmProc info lbl live (ListGraph blocks)) =
+ CmmProc info lbl live (ListGraph $ invertConds info blocks)
+
+ ---- expansion of SPARC synthetic instrs
+ let expanded =
+ {-# SCC "sparc_expand" #-}
+ ncgExpandTop ncgImpl branchOpt
+ --ncgExpandTop ncgImpl sequenced
+
+ dumpIfSet_dyn dflags
+ Opt_D_dump_asm_expanded "Synthetic instructions expanded"
+ FormatCMM
+ (vcat $ map (pprNatCmmDecl ncgImpl) expanded)
+
+ -- generate unwinding information from cmm
+ let unwinds :: BlockMap [UnwindPoint]
+ unwinds =
+ {-# SCC "unwindingInfo" #-}
+ foldl' addUnwind mapEmpty expanded
+ where
+ addUnwind acc proc =
+ acc `mapUnion` computeUnwinding dflags ncgImpl proc
+
+ return ( usAlloc
+ , fileIds'
+ , expanded
+ , lastMinuteImports ++ imports
+ , ppr_raStatsColor
+ , ppr_raStatsLinear
+ , unwinds )
+
+maybeDumpCfg :: DynFlags -> Maybe CFG -> String -> SDoc -> IO ()
+maybeDumpCfg _dflags Nothing _ _ = return ()
+maybeDumpCfg dflags (Just cfg) msg proc_name
+ | null cfg = return ()
+ | otherwise
+ = dumpIfSet_dyn
+ dflags Opt_D_dump_cfg_weights msg
+ FormatText
+ (proc_name <> char ':' $$ pprEdgeWeights cfg)
+
+-- | Make sure all blocks we want the layout algorithm to place have been placed.
+checkLayout :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
+ -> [NatCmmDecl statics instr]
+checkLayout procsUnsequenced procsSequenced =
+ ASSERT2(setNull diff,
+ ppr "Block sequencing dropped blocks:" <> ppr diff)
+ procsSequenced
+ where
+ blocks1 = foldl' (setUnion) setEmpty $
+ map getBlockIds procsUnsequenced :: LabelSet
+ blocks2 = foldl' (setUnion) setEmpty $
+ map getBlockIds procsSequenced
+ diff = setDifference blocks1 blocks2
+
+ getBlockIds (CmmData _ _) = setEmpty
+ getBlockIds (CmmProc _ _ _ (ListGraph blocks)) =
+ setFromList $ map blockId blocks
+
+-- | Compute unwinding tables for the blocks of a procedure
+computeUnwinding :: Instruction instr
+ => DynFlags -> NcgImpl statics instr jumpDest
+ -> NatCmmDecl statics instr
+ -- ^ the native code generated for the procedure
+ -> LabelMap [UnwindPoint]
+ -- ^ unwinding tables for all points of all blocks of the
+ -- procedure
+computeUnwinding dflags _ _
+ | debugLevel dflags == 0 = mapEmpty
+computeUnwinding _ _ (CmmData _ _) = mapEmpty
+computeUnwinding _ ncgImpl (CmmProc _ _ _ (ListGraph blks)) =
+ -- In general we would need to push unwinding information down the
+ -- block-level call-graph to ensure that we fully account for all
+ -- relevant register writes within a procedure.
+ --
+ -- However, the only unwinding information that we care about in GHC is for
+ -- Sp. The fact that GHC.Cmm.LayoutStack already ensures that we have unwind
+ -- information at the beginning of every block means that there is no need
+ -- to perform this sort of push-down.
+ mapFromList [ (blk_lbl, extractUnwindPoints ncgImpl instrs)
+ | BasicBlock blk_lbl instrs <- blks ]
+
+-- | Build a doc for all the imports.
+--
+makeImportsDoc :: DynFlags -> [CLabel] -> SDoc
+makeImportsDoc dflags imports
+ = dyld_stubs imports
+ $$
+ -- On recent versions of Darwin, the linker supports
+ -- dead-stripping of code and data on a per-symbol basis.
+ -- There's a hack to make this work in PprMach.pprNatCmmDecl.
+ (if platformHasSubsectionsViaSymbols platform
+ then text ".subsections_via_symbols"
+ else Outputable.empty)
+ $$
+ -- On recent GNU ELF systems one can mark an object file
+ -- as not requiring an executable stack. If all objects
+ -- linked into a program have this note then the program
+ -- will not use an executable stack, which is good for
+ -- security. GHC generated code does not need an executable
+ -- stack so add the note in:
+ (if platformHasGnuNonexecStack platform
+ then text ".section .note.GNU-stack,\"\"," <> sectionType "progbits"
+ else Outputable.empty)
+ $$
+ -- And just because every other compiler does, let's stick in
+ -- an identifier directive: .ident "GHC x.y.z"
+ (if platformHasIdentDirective platform
+ then let compilerIdent = text "GHC" <+> text cProjectVersion
+ in text ".ident" <+> doubleQuotes compilerIdent
+ else Outputable.empty)
+
+ where
+ platform = targetPlatform dflags
+ arch = platformArch platform
+ os = platformOS platform
+
+ -- Generate "symbol stubs" for all external symbols that might
+ -- come from a dynamic library.
+ dyld_stubs :: [CLabel] -> SDoc
+{- dyld_stubs imps = vcat $ map pprDyldSymbolStub $
+ map head $ group $ sort imps-}
+ -- (Hack) sometimes two Labels pretty-print the same, but have
+ -- different uniques; so we compare their text versions...
+ dyld_stubs imps
+ | needImportedSymbols dflags arch os
+ = vcat $
+ (pprGotDeclaration dflags arch os :) $
+ map ( pprImportedSymbol dflags platform . fst . head) $
+ groupBy (\(_,a) (_,b) -> a == b) $
+ sortBy (\(_,a) (_,b) -> compare a b) $
+ map doPpr $
+ imps
+ | otherwise
+ = Outputable.empty
+
+ doPpr lbl = (lbl, renderWithStyle
+ (initSDocContext dflags astyle)
+ (pprCLabel dflags lbl))
+ astyle = mkCodeStyle AsmStyle
+
+-- -----------------------------------------------------------------------------
+-- Generate jump tables
+
+-- Analyzes all native code and generates data sections for all jump
+-- table instructions.
+generateJumpTables
+ :: NcgImpl statics instr jumpDest
+ -> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
+generateJumpTables ncgImpl xs = concatMap f xs
+ where f p@(CmmProc _ _ _ (ListGraph xs)) = p : concatMap g xs
+ f p = [p]
+ g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl) xs)
+
+-- -----------------------------------------------------------------------------
+-- Shortcut branches
+
+shortcutBranches
+ :: forall statics instr jumpDest. (Outputable jumpDest) => DynFlags
+ -> NcgImpl statics instr jumpDest
+ -> [NatCmmDecl statics instr]
+ -> Maybe CFG
+ -> ([NatCmmDecl statics instr],Maybe CFG)
+
+shortcutBranches dflags ncgImpl tops weights
+ | gopt Opt_AsmShortcutting dflags
+ = ( map (apply_mapping ncgImpl mapping) tops'
+ , shortcutWeightMap mappingBid <$!> weights )
+ | otherwise
+ = (tops, weights)
+ where
+ (tops', mappings) = mapAndUnzip (build_mapping ncgImpl) tops
+ mapping = mapUnions mappings :: LabelMap jumpDest
+ mappingBid = fmap (getJumpDestBlockId ncgImpl) mapping
+
+build_mapping :: forall instr t d statics jumpDest.
+ NcgImpl statics instr jumpDest
+ -> GenCmmDecl d (LabelMap t) (ListGraph instr)
+ -> (GenCmmDecl d (LabelMap t) (ListGraph instr)
+ ,LabelMap jumpDest)
+build_mapping _ top@(CmmData _ _) = (top, mapEmpty)
+build_mapping _ (CmmProc info lbl live (ListGraph []))
+ = (CmmProc info lbl live (ListGraph []), mapEmpty)
+build_mapping ncgImpl (CmmProc info lbl live (ListGraph (head:blocks)))
+ = (CmmProc info lbl live (ListGraph (head:others)), mapping)
+ -- drop the shorted blocks, but don't ever drop the first one,
+ -- because it is pointed to by a global label.
+ where
+ -- find all the blocks that just consist of a jump that can be
+ -- shorted.
+ -- Don't completely eliminate loops here -- that can leave a dangling jump!
+ shortcut_blocks :: [(BlockId, jumpDest)]
+ (_, shortcut_blocks, others) =
+ foldl' split (setEmpty :: LabelSet, [], []) blocks
+ split (s, shortcut_blocks, others) b@(BasicBlock id [insn])
+ | Just jd <- canShortcut ncgImpl insn
+ , Just dest <- getJumpDestBlockId ncgImpl jd
+ , not (has_info id)
+ , (setMember dest s) || dest == id -- loop checks
+ = (s, shortcut_blocks, b : others)
+ split (s, shortcut_blocks, others) (BasicBlock id [insn])
+ | Just dest <- canShortcut ncgImpl insn
+ , not (has_info id)
+ = (setInsert id s, (id,dest) : shortcut_blocks, others)
+ split (s, shortcut_blocks, others) other = (s, shortcut_blocks, other : others)
+
+ -- do not eliminate blocks that have an info table
+ has_info l = mapMember l info
+
+ -- build a mapping from BlockId to JumpDest for shorting branches
+ mapping = mapFromList shortcut_blocks
+
+apply_mapping :: NcgImpl statics instr jumpDest
+ -> LabelMap jumpDest
+ -> GenCmmDecl statics h (ListGraph instr)
+ -> GenCmmDecl statics h (ListGraph instr)
+apply_mapping ncgImpl ufm (CmmData sec statics)
+ = CmmData sec (shortcutStatics ncgImpl (\bid -> mapLookup bid ufm) statics)
+apply_mapping ncgImpl ufm (CmmProc info lbl live (ListGraph blocks))
+ = CmmProc info lbl live (ListGraph $ map short_bb blocks)
+ where
+ short_bb (BasicBlock id insns) = BasicBlock id $! map short_insn insns
+ short_insn i = shortcutJump ncgImpl (\bid -> mapLookup bid ufm) i
+ -- shortcutJump should apply the mapping repeatedly,
+ -- just in case we can short multiple branches.
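+          -- For instance (illustrative): if the mapping has B |-> (jump to C)
+          -- and C |-> (jump to D), an instruction "jmp B" should come out as
+          -- "jmp D"; the exact rewriting is target-specific and lives in each
+          -- backend's shortcutJump.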
+
+-- -----------------------------------------------------------------------------
+-- Instruction selection
+
+-- Native code instruction selection for a chunk of Cmm code. For
+-- this part of the computation, we switch from the UniqSM monad to
+-- the NatM monad. The latter carries not only a Unique, but also an
+-- Int denoting the current C stack pointer offset in the generated
+-- code; this is needed for creating correct spill offsets on
+-- architectures which don't offer, or for which it would be
+-- prohibitively expensive to employ, a frame pointer register. Viz,
+-- x86.
+
+-- The offset is measured in bytes, and indicates the difference
+-- between the current (simulated) C stack-ptr and the value it was at
+-- the beginning of the block. For stacks which grow down, this value
+-- should be either zero or negative.
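+-- For example (illustrative): if code in the middle of a block moves the
+-- simulated C stack pointer down by 8 bytes the delta becomes -8; once those
+-- 8 bytes are given back it returns to 0, which is what 'genMachCode' below
+-- checks (final delta == 0) at the end of each top-level chunk.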
+
+-- Along with the stack pointer offset, we also carry along a LabelMap of
+-- DebugBlocks, which we read to generate .location directives.
+--
+-- Switching between the two monads whilst carrying along the same
+-- Unique supply breaks abstraction. Is that bad?
+
+genMachCode
+ :: DynFlags
+ -> Module -> ModLocation
+ -> (RawCmmDecl -> NatM [NatCmmDecl statics instr])
+ -> DwarfFiles
+ -> LabelMap DebugBlock
+ -> RawCmmDecl
+ -> CFG
+ -> UniqSM
+ ( [NatCmmDecl statics instr]
+ , [CLabel]
+ , DwarfFiles
+ , CFG
+ )
+
+genMachCode dflags this_mod modLoc cmmTopCodeGen fileIds dbgMap cmm_top cmm_cfg
+ = do { initial_us <- getUniqueSupplyM
+ ; let initial_st = mkNatM_State initial_us 0 dflags this_mod
+ modLoc fileIds dbgMap cmm_cfg
+ (new_tops, final_st) = initNat initial_st (cmmTopCodeGen cmm_top)
+ final_delta = natm_delta final_st
+ final_imports = natm_imports final_st
+ final_cfg = natm_cfg final_st
+ ; if final_delta == 0
+ then return (new_tops, final_imports
+ , natm_fileid final_st, final_cfg)
+ else pprPanic "genMachCode: nonzero final delta" (int final_delta)
+ }
+
+-- -----------------------------------------------------------------------------
+-- Generic Cmm optimiser
+
+{-
+Here we do:
+
+  (a) Constant folding
+  (b) Position independent code and dynamic linking
+    (i) introduce the appropriate indirections
+        and position independent refs
+    (ii) compile a list of imported symbols
+  (c) Some arch-specific optimizations
+
+(a) will be moving to the new Hoopl pipeline; however, (b) and
+(c) are only needed by the native backend and will continue to live
+here.
+
+Ideas for other things we could do (put these in Hoopl please!):
+
+ - shortcut jumps-to-jumps
+ - simple CSE: if an expr is assigned to a temp, then replace later occs of
+ that expr with the temp, until the expr is no longer valid (can push through
+ temp assignments, and certain assigns to mem...)
+-}
+
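+-- As an illustration of (a): constant folding via 'cmmMachOpFold' (used by
+-- 'cmmExprCon' below) turns a fully-literal operation into a literal, roughly:
+--
+-- >   CmmMachOp (MO_Add w) [CmmLit (CmmInt 1 w), CmmLit (CmmInt 2 w)]
+-- >     ==>  CmmLit (CmmInt 3 w)
+--
+-- (A sketch only; width handling elided.)
+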
+cmmToCmm :: DynFlags -> Module -> RawCmmDecl -> (RawCmmDecl, [CLabel])
+cmmToCmm _ _ top@(CmmData _ _) = (top, [])
+cmmToCmm dflags this_mod (CmmProc info lbl live graph)
+ = runCmmOpt dflags this_mod $
+ do blocks' <- mapM cmmBlockConFold (toBlockList graph)
+ return $ CmmProc info lbl live (ofBlockList (g_entry graph) blocks')
+
+-- Avoids using unboxed tuples when loading into GHCi
+#if !defined(GHC_LOADED_INTO_GHCI)
+
+type OptMResult a = (# a, [CLabel] #)
+
+pattern OptMResult :: a -> b -> (# a, b #)
+pattern OptMResult x y = (# x, y #)
+{-# COMPLETE OptMResult #-}
+#else
+
+data OptMResult a = OptMResult !a ![CLabel] deriving (Functor)
+#endif
+
+newtype CmmOptM a = CmmOptM (DynFlags -> Module -> [CLabel] -> OptMResult a)
+ deriving (Functor)
+
+instance Applicative CmmOptM where
+ pure x = CmmOptM $ \_ _ imports -> OptMResult x imports
+ (<*>) = ap
+
+instance Monad CmmOptM where
+ (CmmOptM f) >>= g =
+ CmmOptM $ \dflags this_mod imports0 ->
+ case f dflags this_mod imports0 of
+ OptMResult x imports1 ->
+ case g x of
+ CmmOptM g' -> g' dflags this_mod imports1
+
+instance CmmMakeDynamicReferenceM CmmOptM where
+ addImport = addImportCmmOpt
+ getThisModule = CmmOptM $ \_ this_mod imports -> OptMResult this_mod imports
+
+addImportCmmOpt :: CLabel -> CmmOptM ()
+addImportCmmOpt lbl = CmmOptM $ \_ _ imports -> OptMResult () (lbl:imports)
+
+instance HasDynFlags CmmOptM where
+ getDynFlags = CmmOptM $ \dflags _ imports -> OptMResult dflags imports
+
+runCmmOpt :: DynFlags -> Module -> CmmOptM a -> (a, [CLabel])
+runCmmOpt dflags this_mod (CmmOptM f) =
+ case f dflags this_mod [] of
+ OptMResult result imports -> (result, imports)
+
+cmmBlockConFold :: CmmBlock -> CmmOptM CmmBlock
+cmmBlockConFold block = do
+ let (entry, middle, last) = blockSplit block
+ stmts = blockToList middle
+ stmts' <- mapM cmmStmtConFold stmts
+ last' <- cmmStmtConFold last
+ return $ blockJoin entry (blockFromList stmts') last'
+
+-- This does three optimizations, but they're very quick to check, so we don't
+-- bother turning them off even when the Hoopl code is active. Since
+-- this is on the old Cmm representation, we can't reuse the code either:
+-- * reg = reg --> nop
+-- * if 0 then jump --> nop
+-- * if 1 then jump --> jump
+-- We might be tempted to skip this step entirely if not Opt_PIC, but
+-- there is some PowerPC code for the non-PIC case, which would also
+-- have to be separated.
+cmmStmtConFold :: CmmNode e x -> CmmOptM (CmmNode e x)
+cmmStmtConFold stmt
+ = case stmt of
+ CmmAssign reg src
+ -> do src' <- cmmExprConFold DataReference src
+ return $ case src' of
+ CmmReg reg' | reg == reg' -> CmmComment (fsLit "nop")
+ new_src -> CmmAssign reg new_src
+
+ CmmStore addr src
+ -> do addr' <- cmmExprConFold DataReference addr
+ src' <- cmmExprConFold DataReference src
+ return $ CmmStore addr' src'
+
+ CmmCall { cml_target = addr }
+ -> do addr' <- cmmExprConFold JumpReference addr
+ return $ stmt { cml_target = addr' }
+
+ CmmUnsafeForeignCall target regs args
+ -> do target' <- case target of
+ ForeignTarget e conv -> do
+ e' <- cmmExprConFold CallReference e
+ return $ ForeignTarget e' conv
+ PrimTarget _ ->
+ return target
+ args' <- mapM (cmmExprConFold DataReference) args
+ return $ CmmUnsafeForeignCall target' regs args'
+
+ CmmCondBranch test true false likely
+ -> do test' <- cmmExprConFold DataReference test
+ return $ case test' of
+ CmmLit (CmmInt 0 _) -> CmmBranch false
+ CmmLit (CmmInt _ _) -> CmmBranch true
+ _other -> CmmCondBranch test' true false likely
+
+ CmmSwitch expr ids
+ -> do expr' <- cmmExprConFold DataReference expr
+ return $ CmmSwitch expr' ids
+
+ other
+ -> return other
+
+cmmExprConFold :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
+cmmExprConFold referenceKind expr = do
+ dflags <- getDynFlags
+
+ -- With -O1 and greater, the cmmSink pass does constant-folding, so
+ -- we don't need to do it again here.
+ let expr' = if optLevel dflags >= 1
+ then expr
+ else cmmExprCon dflags expr
+
+ cmmExprNative referenceKind expr'
+
+cmmExprCon :: DynFlags -> CmmExpr -> CmmExpr
+cmmExprCon dflags (CmmLoad addr rep) = CmmLoad (cmmExprCon dflags addr) rep
+cmmExprCon dflags (CmmMachOp mop args)
+ = cmmMachOpFold dflags mop (map (cmmExprCon dflags) args)
+cmmExprCon _ other = other
+
+-- handles both PIC and non-PIC cases... a very strange mixture
+-- of things to do.
+cmmExprNative :: ReferenceKind -> CmmExpr -> CmmOptM CmmExpr
+cmmExprNative referenceKind expr = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ arch = platformArch platform
+ case expr of
+ CmmLoad addr rep
+ -> do addr' <- cmmExprNative DataReference addr
+ return $ CmmLoad addr' rep
+
+ CmmMachOp mop args
+ -> do args' <- mapM (cmmExprNative DataReference) args
+ return $ CmmMachOp mop args'
+
+ CmmLit (CmmBlock id)
+ -> cmmExprNative referenceKind (CmmLit (CmmLabel (infoTblLbl id)))
+ -- we must convert block Ids to CLabels here, because we
+ -- might have to do the PIC transformation. Hence we must
+ -- not modify BlockIds beyond this point.
+
+ CmmLit (CmmLabel lbl)
+ -> do
+ cmmMakeDynamicReference dflags referenceKind lbl
+ CmmLit (CmmLabelOff lbl off)
+ -> do
+ dynRef <- cmmMakeDynamicReference dflags referenceKind lbl
+ -- need to optimize here, since it's late
+ return $ cmmMachOpFold dflags (MO_Add (wordWidth dflags)) [
+ dynRef,
+ (CmmLit $ CmmInt (fromIntegral off) (wordWidth dflags))
+ ]
+
+ -- On powerpc (non-PIC), it's easier to jump directly to a label than
+ -- to use the register table, so we replace these registers
+ -- with the corresponding labels:
+ CmmReg (CmmGlobal EagerBlackholeInfo)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_EAGER_BLACKHOLE_info")))
+ CmmReg (CmmGlobal GCEnter1)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_enter_1")))
+ CmmReg (CmmGlobal GCFun)
+ | arch == ArchPPC && not (positionIndependent dflags)
+ -> cmmExprNative referenceKind $
+ CmmLit (CmmLabel (mkCmmCodeLabel rtsUnitId (fsLit "__stg_gc_fun")))
+
+ other
+ -> return other
diff --git a/compiler/GHC/CmmToAsm/BlockLayout.hs b/compiler/GHC/CmmToAsm/BlockLayout.hs
new file mode 100644
index 0000000000..01a1388b5f
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/BlockLayout.hs
@@ -0,0 +1,895 @@
+--
+-- Copyright (c) 2018 Andreas Klebinger
+--
+
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE FlexibleContexts #-}
+
+module GHC.CmmToAsm.BlockLayout
+ ( sequenceTop )
+where
+
+#include "HsVersions.h"
+import GhcPrelude
+
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.CFG
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+
+import GHC.Driver.Session (gopt, GeneralFlag(..), DynFlags, backendMaintainsCfg)
+import UniqFM
+import Util
+import Unique
+
+import Digraph
+import Outputable
+import Maybes
+
+-- DEBUGGING ONLY
+--import GHC.Cmm.DebugBlock
+--import Debug.Trace
+import ListSetOps (removeDups)
+
+import OrdList
+import Data.List
+import Data.Foldable (toList)
+
+import qualified Data.Set as Set
+import Data.STRef
+import Control.Monad.ST.Strict
+import Control.Monad (foldM)
+
+{-
+ Note [CFG based code layout]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  The major steps in placing blocks are as follows:
+  * Compute a CFG based on the Cmm AST, see getCfgProc.
+    This CFG will have edge weights representing a guess
+    at how important they are.
+  * After we convert Cmm to Asm we run `optimizeCFG` which
+    adds a few more "educated guesses" to the equation.
+  * Then we run loop analysis on the CFG (`loopInfo`) which tells us
+    about loop headers, loop nesting levels and so on.
+  * Based on the CFG and loop information we refine the edge weights
+    in the CFG and normalize them relative to the most often visited
+    node. (See `mkGlobalWeights`)
+  * Feed this CFG into the block layout code (`sequenceTop`) in this
+    module, which will then produce a code layout based on the input weights.
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [Chain based CFG serialization]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ For additional information also look at
+ https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/code-layout
+
+ We have a CFG with edge weights based on which we try to place blocks next to
+ each other.
+
+  Edge weights not only represent the likelihood of control transfer between blocks
+  but also how much a block would benefit from being placed sequentially after
+  its predecessor.
+  For example, blocks which are preceded by an info table are more likely to end
+  up in a different cache line than their predecessor, and we can't eliminate the
+  jump, so there is less benefit to placing them sequentially.
+
+  For example, consider this code:
+
+ A: ...
+ jmp cond D (weak successor)
+ jmp B
+ B: ...
+ jmp C
+ C: ...
+ jmp X
+ D: ...
+ jmp B (weak successor)
+
+ We determine a block layout by building up chunks (calling them chains) of
+ possible control flows for which blocks will be placed sequentially.
+
+  E.g. for our example we might end up with two chains like:
+  [A->B->C->X],[D]. Blocks inside chains will always be placed sequentially.
+  However, there is no particular order in which chains are placed since
+  (hopefully) the blocks for which sequentiality is important have already
+  been placed in the same chain.
+
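+  In terms of the chain primitives defined later in this module this looks
+  roughly like (a sketch; a, b, c, x stand for block ids):
+
+    chainSnoc (chainFromList [a,b]) c                        -- a chain [a,b,c]
+    chainConcat (chainFromList [a,b,c]) (chainFromList [x])  -- a chain [a,b,c,x]
+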
+ -----------------------------------------------------------------------------
+ 1) First try to create a list of good chains.
+ -----------------------------------------------------------------------------
+
+  Good chains are those which allow us to eliminate jump instructions,
+  preferring to eliminate often-executed jumps first.
+
+ We do so by:
+
+  *) Ignore edges which represent instructions which cannot be replaced
+     by fall-through control flow. Primarily calls and edges to blocks which
+     are prefixed by an info table we have to jump across.
+
+ *) Then process remaining edges in order of frequency taken and:
+
+ +) If source and target have not been placed build a new chain from them.
+
+ +) If source and target have been placed, and are ends of differing chains
+ try to merge the two chains.
+
+      +) If one side of the edge is an end/front of a chain, add the other
+         block of the edge to the same chain.
+
+ Eg if we look at edge (B -> C) and already have the chain (A -> B)
+ then we extend the chain to (A -> B -> C).
+
+ +) If the edge was used to modify or build a new chain remove the edge from
+ our working list.
+
+  *) If there are any blocks not placed into a chain after these steps we place
+     each of them into a chain consisting of only that block.
+
+  Because we rank edges by their taken frequency, if
+  two edges compete for fall-through on the same target block, the one taken
+  more often will automatically win out, resulting in fewer instructions being
+  executed.
+
+ Creating singleton chains is required for situations where we have code of the
+ form:
+
+ A: goto B:
+ <infoTable>
+ B: goto C:
+ <infoTable>
+ C: ...
+
+ As the code in block B is only connected to the rest of the program via edges
+ which will be ignored in this step we make sure that B still ends up in a chain
+ this way.
+
+ -----------------------------------------------------------------------------
+ 2) We also try to fuse chains.
+ -----------------------------------------------------------------------------
+
+  As a result of the above step we still end up with multiple chains which
+  represent sequential control flow chunks. But they are not yet suitable for
+  code layout as we need to place *all* blocks into a single sequence.
+
+  In this step we combine the chains resulting from the above step as follows:
+
+ *) Look at the ranked list of *all* edges, including calls/jumps across info tables
+ and the like.
+
+ *) Look at each edge and
+
+ +) Given an edge (A -> B) try to find two chains for which
+ * Block A is at the end of one chain
+ * Block B is at the front of the other chain.
+    +) If we find such chains we "fuse" them into a single chain, remove the
+       edge from the working set and continue.
+ +) If we can't find such chains we skip the edge and continue.
+
+ -----------------------------------------------------------------------------
+ 3) Place indirect successors (neighbours) after each other
+ -----------------------------------------------------------------------------
+
+ We might have chains [A,B,C,X],[E] in a CFG of the sort:
+
+ A ---> B ---> C --------> X(exit)
+ \- ->E- -/
+
+  While E does not follow X it's still beneficial to place them near each other.
+  This can be advantageous if, e.g., C, X and E end up in the same cache line.
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [Triangle Control Flow]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Checking if an argument is already evaluated leads to a somewhat
+ special case which looks like this:
+
+ A:
+ if (R1 & 7 != 0) goto Leval; else goto Lwork;
+ Leval: // global
+ call (I64[R1])(R1) returns to Lwork, args: 8, res: 8, upd: 8;
+ Lwork: // global
+ ...
+
+ A
+ |\
+ | Leval
+ |/ - (This edge can be missing because of optimizations)
+ Lwork
+
+ Once we hit the metal the call instruction is just 2-3 bytes large
+ depending on the register used. So we lay out the assembly like this:
+
+ movq %rbx,%rax
+ andl $7,%eax
+ cmpq $1,%rax
+ jne Lwork
+ Leval:
+ jmp *(%rbx) # encoded in 2-3 bytes.
+ <info table>
+ Lwork:
+ ...
+
+ We could explicitly check for this control flow pattern.
+
+ This is advantageous because:
+ * It's optimal if the argument isn't evaluated.
+ * If it's evaluated we only have the extra cost of jumping over
+ the 2-3 bytes for the call.
+ * Guarantees the smaller encoding for the conditional jump.
+
+ However given that Lwork usually has an info table we
+ penalize this edge. So Leval should get placed first
+ either way and things work out for the best.
+
+ Optimizing for the evaluated case instead would penalize
+  the other code path. It adds a jump as we can't fall through
+ to Lwork because of the info table.
+ Assuming that Lwork is large the chance that the "call" ends up
+ in the same cache line is also fairly small.
+
+-}
+
+
+-- | Look at X number of blocks in two chains to determine
+-- if they are "neighbours".
+neighbourOverlapp :: Int
+neighbourOverlapp = 2
+
+-- | Maps blocks near the end of a chain to its chain AND
+-- the other blocks near the end.
+-- [A,B,C,D,E] gives entries like (B -> ([A,B], [A,B,C,D,E]))
+-- where [A,B] are blocks in the end region of a chain.
+-- This is cheaper than recomputing the ends multiple times.
+type FrontierMap = LabelMap ([BlockId],BlockChain)
+
+-- | A non-empty ordered sequence of basic blocks.
+-- It is suitable for serialization in this order.
+--
+-- We use OrdList instead of [] to allow fast append on both sides
+-- when combining chains.
+newtype BlockChain
+ = BlockChain { chainBlocks :: (OrdList BlockId) }
+
+-- All chains are constructed the same way so comparison
+-- including structure is faster.
+instance Eq BlockChain where
+ BlockChain b1 == BlockChain b2 = strictlyEqOL b1 b2
+
+-- Useful for things like sets and debugging purposes, sorts by blocks
+-- in the chain.
+instance Ord (BlockChain) where
+ (BlockChain lbls1) `compare` (BlockChain lbls2)
+ = ASSERT(toList lbls1 /= toList lbls2 || lbls1 `strictlyEqOL` lbls2)
+ strictlyOrdOL lbls1 lbls2
+
+instance Outputable (BlockChain) where
+ ppr (BlockChain blks) =
+ parens (text "Chain:" <+> ppr (fromOL $ blks) )
+
+chainFoldl :: (b -> BlockId -> b) -> b -> BlockChain -> b
+chainFoldl f z (BlockChain blocks) = foldl' f z blocks
+
+noDups :: [BlockChain] -> Bool
+noDups chains =
+ let chainBlocks = concatMap chainToBlocks chains :: [BlockId]
+ (_blocks, dups) = removeDups compare chainBlocks
+ in if null dups then True
+ else pprTrace "Duplicates:" (ppr (map toList dups) $$ text "chains" <+> ppr chains ) False
+
+inFront :: BlockId -> BlockChain -> Bool
+inFront bid (BlockChain seq)
+ = headOL seq == bid
+
+chainSingleton :: BlockId -> BlockChain
+chainSingleton lbl
+ = BlockChain (unitOL lbl)
+
+chainFromList :: [BlockId] -> BlockChain
+chainFromList = BlockChain . toOL
+
+chainSnoc :: BlockChain -> BlockId -> BlockChain
+chainSnoc (BlockChain blks) lbl
+ = BlockChain (blks `snocOL` lbl)
+
+chainCons :: BlockId -> BlockChain -> BlockChain
+chainCons lbl (BlockChain blks)
+ = BlockChain (lbl `consOL` blks)
+
+chainConcat :: BlockChain -> BlockChain -> BlockChain
+chainConcat (BlockChain blks1) (BlockChain blks2)
+ = BlockChain (blks1 `appOL` blks2)
+
+chainToBlocks :: BlockChain -> [BlockId]
+chainToBlocks (BlockChain blks) = fromOL blks
+
+-- | Given the chain A -> B -> C -> D, breaking it at C
+-- gives us the two chains (A -> B, C -> D) as result.
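+-- An illustrative sketch (block names hypothetical):
+--   breakChainAt c (chainFromList [a,b,c,d])  ~>  (chainFromList [a,b], chainFromList [c,d])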
+breakChainAt :: BlockId -> BlockChain
+ -> (BlockChain,BlockChain)
+breakChainAt bid (BlockChain blks)
+ | not (bid == head rblks)
+ = panic "Block not in chain"
+ | otherwise
+ = (BlockChain (toOL lblks),
+ BlockChain (toOL rblks))
+ where
+ (lblks, rblks) = break (\lbl -> lbl == bid) (fromOL blks)
+
+takeR :: Int -> BlockChain -> [BlockId]
+takeR n (BlockChain blks) =
+ take n . fromOLReverse $ blks
+
+takeL :: Int -> BlockChain -> [BlockId]
+takeL n (BlockChain blks) =
+ take n . fromOL $ blks
+
+-- Note [Combining neighborhood chains]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-- See also Note [Chain based CFG serialization]
+-- We have the chains (A-B-C-D) and (E-F) and an Edge C->E.
+--
+-- While placing the latter after the former doesn't result in sequential
+-- control flow it is still beneficial. As block C and E might end
+-- up in the same cache line.
+--
+-- So we place these chains next to each other even if we can't fuse them.
+--
+-- A -> B -> C -> D
+-- v
+-- - -> E -> F ...
+--
+-- A simple heuristic to choose which chains we want to combine:
+-- * Process edges in descending priority.
+-- * Check if there is an edge near the end of one chain which goes
+-- to a block near the start of another chain.
+--
+-- While we could take into account the space between the two blocks which
+-- share an edge this blows up compile times quite a bit. It requires
+-- us to find all edges between two chains, check the distance for all edges,
+-- rank them based on the distance and only then we can select two chains
+-- to combine. Which would add a lot of complexity for little gain.
+--
+-- So instead we just rank by the strength of the edge and use the first pair we
+-- find.
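+--
+-- As a rough sketch, using the chains above and the current value
+-- neighbourOverlapp = 2: the end frontier maps C and D to the chain
+-- (A-B-C-D), the start frontier maps E and F to (E-F). The edge C->E
+-- then hits both frontiers and the two chains get placed next to each other.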
+
+-- | For a given list of chains and edges try to combine chains with strong
+-- edges between them.
+combineNeighbourhood :: [CfgEdge] -- ^ Edges to consider
+ -> [BlockChain] -- ^ Current chains of blocks
+ -> ([BlockChain], Set.Set (BlockId,BlockId))
+ -- ^ Resulting list of block chains, and a set of edges which
+ -- were used to fuse chains and as such no longer need to be
+ -- considered.
+combineNeighbourhood edges chains
+ = -- pprTraceIt "Neighbours" $
+ -- pprTrace "combineNeighbours" (ppr edges) $
+ applyEdges edges endFrontier startFrontier (Set.empty)
+ where
+ --Build maps from chain ends to chains
+ endFrontier, startFrontier :: FrontierMap
+ endFrontier =
+ mapFromList $ concatMap (\chain ->
+ let ends = getEnds chain :: [BlockId]
+ entry = (ends,chain)
+ in map (\x -> (x,entry)) ends ) chains
+ startFrontier =
+ mapFromList $ concatMap (\chain ->
+ let front = getFronts chain
+ entry = (front,chain)
+ in map (\x -> (x,entry)) front) chains
+ applyEdges :: [CfgEdge] -> FrontierMap -> FrontierMap -> Set.Set (BlockId, BlockId)
+ -> ([BlockChain], Set.Set (BlockId,BlockId))
+ applyEdges [] chainEnds _chainFronts combined =
+ (ordNub $ map snd $ mapElems chainEnds, combined)
+ applyEdges ((CfgEdge from to _w):edges) chainEnds chainFronts combined
+ | Just (c1_e,c1) <- mapLookup from chainEnds
+ , Just (c2_f,c2) <- mapLookup to chainFronts
+ , c1 /= c2 -- Avoid trying to concat a chain with itself.
+ = let newChain = chainConcat c1 c2
+ newChainFrontier = getFronts newChain
+ newChainEnds = getEnds newChain
+ newFronts :: FrontierMap
+ newFronts =
+ let withoutOld =
+ foldl' (\m b -> mapDelete b m :: FrontierMap) chainFronts (c2_f ++ getFronts c1)
+ entry =
+ (newChainFrontier,newChain) --let bound to ensure sharing
+ in foldl' (\m x -> mapInsert x entry m)
+ withoutOld newChainFrontier
+
+ newEnds =
+ let withoutOld = foldl' (\m b -> mapDelete b m) chainEnds (c1_e ++ getEnds c2)
+ entry = (newChainEnds,newChain) --let bound to ensure sharing
+ in foldl' (\m x -> mapInsert x entry m)
+ withoutOld newChainEnds
+ in
+ -- pprTrace "ApplyEdges"
+ -- (text "before" $$
+ -- text "fronts" <+> ppr chainFronts $$
+ -- text "ends" <+> ppr chainEnds $$
+
+ -- text "various" $$
+ -- text "newChain" <+> ppr newChain $$
+ -- text "newChainFrontier" <+> ppr newChainFrontier $$
+ -- text "newChainEnds" <+> ppr newChainEnds $$
+ -- text "drop" <+> ppr ((c2_f ++ getFronts c1) ++ (c1_e ++ getEnds c2)) $$
+
+ -- text "after" $$
+ -- text "fronts" <+> ppr newFronts $$
+ -- text "ends" <+> ppr newEnds
+ -- )
+ applyEdges edges newEnds newFronts (Set.insert (from,to) combined)
+ | otherwise
+ = applyEdges edges chainEnds chainFronts combined
+ where
+
+ getFronts chain = takeL neighbourOverlapp chain
+ getEnds chain = takeR neighbourOverlapp chain
+
+-- In the last step we combine all chains into a single one,
+-- trying to place chains with strong edges next to each other.
+mergeChains :: [CfgEdge] -> [BlockChain]
+ -> (BlockChain)
+mergeChains edges chains
+ = -- pprTrace "combine" (ppr edges) $
+ runST $ do
+ let addChain m0 chain = do
+ ref <- newSTRef chain
+ return $ chainFoldl (\m' b -> mapInsert b ref m') m0 chain
+ chainMap' <- foldM (\m0 c -> addChain m0 c) mapEmpty chains
+ merge edges chainMap'
+ where
+ -- We keep a map from ALL blocks to their respective chain (sigh)
+ -- This is required since when looking at an edge we need to find
+ -- the associated chains quickly.
+ -- We use a map of STRefs, maintaining an invariant of one STRef per chain.
+ -- When merging chains we can update the
+ -- STRef of one chain once (instead of writing to the map for each block).
+ -- We then overwrite the STRefs for the other chain so there is again only
+ -- a single STRef for the combined chain.
+ -- The difference in terms of allocations saved is ~0.2% with -O so actually
+ -- significant compared to using a regular map.
+
+ merge :: forall s. [CfgEdge] -> LabelMap (STRef s BlockChain) -> ST s BlockChain
+ merge [] chains = do
+ chains' <- ordNub <$> (mapM readSTRef $ mapElems chains) :: ST s [BlockChain]
+ return $ foldl' chainConcat (head chains') (tail chains')
+ merge ((CfgEdge from to _):edges) chains
+ -- | pprTrace "merge" (ppr (from,to) <> ppr chains) False
+ -- = undefined
+ | cFrom == cTo
+ = merge edges chains
+ | otherwise
+ = do
+ chains' <- mergeComb cFrom cTo
+ merge edges chains'
+ where
+ mergeComb :: STRef s BlockChain -> STRef s BlockChain -> ST s (LabelMap (STRef s BlockChain))
+ mergeComb refFrom refTo = do
+ cRight <- readSTRef refTo
+ chain <- pure chainConcat <*> readSTRef refFrom <*> pure cRight
+ writeSTRef refFrom chain
+ return $ chainFoldl (\m b -> mapInsert b refFrom m) chains cRight
+
+ cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains
+ cTo = expectJust "mergeChains:chainMap:to" $ mapLookup to chains
+
+
+-- See Note [Chain based CFG serialization] for the general idea.
+-- This creates and fuses chains at the same time for performance reasons.
+
+-- Try to build chains from a list of edges.
+-- Edges must be sorted **descending** by their priority.
+-- Returns the constructed chains, along with all edges which
+-- are irrelevant past this point. This information doesn't need
+-- to be complete - it's only used to speed up the process.
+-- An edge is irrelevant if both of its ends are part of the same chain.
+-- We say these edges are already linked.
+buildChains :: [CfgEdge] -> [BlockId]
+ -> ( LabelMap BlockChain -- Resulting chains, indexed by end of chain.
+ , Set.Set (BlockId, BlockId)) --List of fused edges.
+buildChains edges blocks
+ = runST $ buildNext setEmpty mapEmpty mapEmpty edges Set.empty
+ where
+ -- buildNext builds up chains from edges one at a time.
+
+ -- We keep a map from the ends of chains to the chains.
+ -- This way we can easily check if a block should be appended to an
+ -- existing chain!
+ -- We store them using STRefs so we don't have to rebuild the spine of both
+ -- maps every time we update a chain.
+ buildNext :: forall s. LabelSet
+ -> LabelMap (STRef s BlockChain) -- Map from end of chain to chain.
+ -> LabelMap (STRef s BlockChain) -- Map from start of chain to chain.
+ -> [CfgEdge] -- Edges to check - ordered by decreasing weight
+ -> Set.Set (BlockId, BlockId) -- Used edges
+ -> ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ buildNext placed _chainStarts chainEnds [] linked = do
+ ends' <- sequence $ mapMap readSTRef chainEnds :: ST s (LabelMap BlockChain)
+ -- Any remaining blocks have to be turned into singleton chains.
+ -- They might be combined with other chains later on outside this function.
+ let unplaced = filter (\x -> not (setMember x placed)) blocks
+ singletons = map (\x -> (x,chainSingleton x)) unplaced :: [(BlockId,BlockChain)]
+ return (foldl' (\m (k,v) -> mapInsert k v m) ends' singletons , linked)
+ buildNext placed chainStarts chainEnds (edge:todo) linked
+ | from == to
+ -- We skip self edges
+ = buildNext placed chainStarts chainEnds todo (Set.insert (from,to) linked)
+ | not (alreadyPlaced from) &&
+ not (alreadyPlaced to)
+ = do
+ --pprTraceM "Edge-Chain:" (ppr edge)
+ chain' <- newSTRef $ chainFromList [from,to]
+ buildNext
+ (setInsert to (setInsert from placed))
+ (mapInsert from chain' chainStarts)
+ (mapInsert to chain' chainEnds)
+ todo
+ (Set.insert (from,to) linked)
+
+ | (alreadyPlaced from) &&
+ (alreadyPlaced to)
+ , Just predChain <- mapLookup from chainEnds
+ , Just succChain <- mapLookup to chainStarts
+ , predChain /= succChain -- Otherwise we try to create a cycle.
+ = do
+ -- pprTraceM "Fusing edge" (ppr edge)
+ fuseChain predChain succChain
+
+ | (alreadyPlaced from) &&
+ (alreadyPlaced to)
+ = --pprTraceM "Skipping:" (ppr edge) >>
+ buildNext placed chainStarts chainEnds todo linked
+
+ | otherwise
+ = do -- pprTraceM "Finding chain for:" (ppr edge $$
+ -- text "placed" <+> ppr placed)
+ findChain
+ where
+ from = edgeFrom edge
+ to = edgeTo edge
+ alreadyPlaced blkId = (setMember blkId placed)
+
+ -- Combine two chains into a single one.
+ fuseChain :: STRef s BlockChain -> STRef s BlockChain
+ -> ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ fuseChain fromRef toRef = do
+ fromChain <- readSTRef fromRef
+ toChain <- readSTRef toRef
+ let newChain = chainConcat fromChain toChain
+ ref <- newSTRef newChain
+ let start = head $ takeL 1 newChain
+ let end = head $ takeR 1 newChain
+ -- chains <- sequence $ mapMap readSTRef chainStarts
+ -- pprTraceM "pre-fuse chains:" $ ppr chains
+ buildNext
+ placed
+ (mapInsert start ref $ mapDelete to $ chainStarts)
+ (mapInsert end ref $ mapDelete from $ chainEnds)
+ todo
+ (Set.insert (from,to) linked)
+
+
+ --Add the block to an existing chain or create a new chain.
+ findChain :: ST s ( LabelMap BlockChain -- Chains by end
+ , Set.Set (BlockId, BlockId) --List of fused edges
+ )
+ findChain
+ -- We can attach the block to the end of a chain
+ | alreadyPlaced from
+ , Just predChain <- mapLookup from chainEnds
+ = do
+ chain <- readSTRef predChain
+ let newChain = chainSnoc chain to
+ writeSTRef predChain newChain
+ let chainEnds' = mapInsert to predChain $ mapDelete from chainEnds
+ -- chains <- sequence $ mapMap readSTRef chainStarts
+ -- pprTraceM "from chains:" $ ppr chains
+ buildNext (setInsert to placed) chainStarts chainEnds' todo (Set.insert (from,to) linked)
+ -- We can attach it to the front of a chain
+ | alreadyPlaced to
+ , Just succChain <- mapLookup to chainStarts
+ = do
+ chain <- readSTRef succChain
+ let newChain = from `chainCons` chain
+ writeSTRef succChain newChain
+ let chainStarts' = mapInsert from succChain $ mapDelete to chainStarts
+ -- chains <- sequence $ mapMap readSTRef chainStarts'
+ -- pprTraceM "to chains:" $ ppr chains
+ buildNext (setInsert from placed) chainStarts' chainEnds todo (Set.insert (from,to) linked)
+ -- The placed end of the edge is part of a chain already and not an end.
+ | otherwise
+ = do
+ let block = if alreadyPlaced to then from else to
+ --pprTraceM "Singleton" $ ppr block
+ let newChain = chainSingleton block
+ ref <- newSTRef newChain
+ buildNext (setInsert block placed) (mapInsert block ref chainStarts)
+ (mapInsert block ref chainEnds) todo (linked)
+ where
+ alreadyPlaced blkId = (setMember blkId placed)
+
+-- | Place basic blocks based on the given CFG.
+-- See Note [Chain based CFG serialization]
+sequenceChain :: forall a i. (Instruction i, Outputable i)
+ => LabelMap a -- ^ Keys indicate an info table on the block.
+ -> CFG -- ^ Control flow graph and some meta data.
+ -> [GenBasicBlock i] -- ^ List of basic blocks to be placed.
+ -> [GenBasicBlock i] -- ^ Blocks placed in sequence.
+sequenceChain _info _weights [] = []
+sequenceChain _info _weights [x] = [x]
+sequenceChain info weights' blocks@((BasicBlock entry _):_) =
+ let weights :: CFG
+ weights = --pprTrace "cfg'" (pprEdgeWeights cfg')
+ cfg'
+ where
+ (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights'
+ cfg' = {-# SCC rewriteEdges #-}
+ mapFoldlWithKey
+ (\cfg from m ->
+ mapFoldlWithKey
+ (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to )
+ cfg m )
+ weights'
+ globalEdgeWeights
+
+ directEdges :: [CfgEdge]
+ directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights)
+ where
+ relevantWeight :: CfgEdge -> Maybe CfgEdge
+ relevantWeight edge@(CfgEdge from to edgeInfo)
+ | (EdgeInfo CmmSource { trans_cmmNode = CmmCall {} } _) <- edgeInfo
+ -- Ignore edges across calls
+ = Nothing
+ | mapMember to info
+ , w <- edgeWeight edgeInfo
+ -- The payoff is small if we jump over an info table
+ = Just (CfgEdge from to edgeInfo { edgeWeight = w/8 })
+ | otherwise
+ = Just edge
+
+ blockMap :: LabelMap (GenBasicBlock i)
+ blockMap
+ = foldl' (\m blk@(BasicBlock lbl _ins) ->
+ mapInsert lbl blk m)
+ mapEmpty blocks
+
+ (builtChains, builtEdges)
+ = {-# SCC "buildChains" #-}
+ --pprTraceIt "generatedChains" $
+ --pprTrace "blocks" (ppr (mapKeys blockMap)) $
+ buildChains directEdges (mapKeys blockMap)
+
+ rankedEdges :: [CfgEdge]
+ -- Sort descending by weight, remove fused edges
+ rankedEdges =
+ filter (\edge -> not (Set.member (edgeFrom edge,edgeTo edge) builtEdges)) $
+ directEdges
+
+ (neighbourChains, combined)
+ = ASSERT(noDups $ mapElems builtChains)
+ {-# SCC "groupNeighbourChains" #-}
+ -- pprTraceIt "NeighbourChains" $
+ combineNeighbourhood rankedEdges (mapElems builtChains)
+
+
+ allEdges :: [CfgEdge]
+ allEdges = {-# SCC allEdges #-}
+ sortOn (relevantWeight) $ filter (not . deadEdge) $ (infoEdgeList weights)
+ where
+ deadEdge :: CfgEdge -> Bool
+ deadEdge (CfgEdge from to _) = let e = (from,to) in Set.member e combined || Set.member e builtEdges
+ relevantWeight :: CfgEdge -> EdgeWeight
+ relevantWeight (CfgEdge _ _ edgeInfo)
+ | EdgeInfo (CmmSource { trans_cmmNode = CmmCall {}}) _ <- edgeInfo
+ -- Penalize edges across calls
+ = weight/(64.0)
+ | otherwise
+ = weight
+ where
+ -- negate to sort descending
+ weight = negate (edgeWeight edgeInfo)
+
+ masterChain =
+ {-# SCC "mergeChains" #-}
+ -- pprTraceIt "MergedChains" $
+ mergeChains allEdges neighbourChains
+
+ --Make sure the first block stays first
+ prepedChains
+ | inFront entry masterChain
+ = [masterChain]
+ | (rest,entry) <- breakChainAt entry masterChain
+ = [entry,rest]
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise = pprPanic "Entry point eliminated" $
+ ppr masterChain
+#endif
+
+ blockList
+ = ASSERT(noDups [masterChain])
+ (concatMap fromOL $ map chainBlocks prepedChains)
+
+ --chainPlaced = setFromList $ map blockId blockList :: LabelSet
+ chainPlaced = setFromList $ blockList :: LabelSet
+ unplaced =
+ let blocks = mapKeys blockMap
+ isPlaced b = setMember (b) chainPlaced
+ in filter (\block -> not (isPlaced block)) blocks
+
+ placedBlocks =
+ -- We want debug builds to catch this as it's a good indicator for
+ -- issues with CFG invariants. But we don't want to blow up production
+ -- builds if something slips through.
+ ASSERT(null unplaced)
+ --pprTraceIt "placedBlocks" $
+ -- ++ [] is still kinda expensive
+ if null unplaced then blockList else blockList ++ unplaced
+ getBlock bid = expectJust "Block placement" $ mapLookup bid blockMap
+ in
+ --Assert we placed all blocks given as input
+ ASSERT(all (\bid -> mapMember bid blockMap) placedBlocks)
+ dropJumps info $ map getBlock placedBlocks
+
+{-# SCC dropJumps #-}
+-- | Remove redundant jumps between blocks when we can rely on
+-- fall through.
+dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i]
+ -> [GenBasicBlock i]
+dropJumps _ [] = []
+dropJumps info ((BasicBlock lbl ins):todo)
+ | not . null $ ins --This can happen because of shortcutting
+ , [dest] <- jumpDestsOfInstr (last ins)
+ , ((BasicBlock nextLbl _) : _) <- todo
+ , not (mapMember dest info)
+ , nextLbl == dest
+ = BasicBlock lbl (init ins) : dropJumps info todo
+ | otherwise
+ = BasicBlock lbl ins : dropJumps info todo
+
+
+-- -----------------------------------------------------------------------------
+-- Sequencing the basic blocks
+
+-- Cmm BasicBlocks are self-contained entities: they always end in a
+-- jump, either non-local or to another basic block in the same proc.
+-- In this phase, we attempt to place the basic blocks in a sequence
+-- such that as many of the local jumps as possible turn into
+-- fallthroughs.
+
+sequenceTop
+ :: (Instruction instr, Outputable instr)
+ => DynFlags -- Determine which layout algo to use
+ -> NcgImpl statics instr jumpDest
+ -> Maybe CFG -- ^ CFG if we have one.
+ -> NatCmmDecl statics instr -- ^ Function to serialize
+ -> NatCmmDecl statics instr
+
+sequenceTop _ _ _ top@(CmmData _ _) = top
+sequenceTop dflags ncgImpl edgeWeights
+ (CmmProc info lbl live (ListGraph blocks))
+ | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags
+ --Use chain based algorithm
+ , Just cfg <- edgeWeights
+ = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
+ {-# SCC layoutBlocks #-}
+ sequenceChain info cfg blocks )
+ | otherwise
+ --Use old algorithm
+ = let cfg = if dontUseCfg then Nothing else edgeWeights
+ in CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
+ {-# SCC layoutBlocks #-}
+ sequenceBlocks cfg info blocks)
+ where
+ dontUseCfg = gopt Opt_WeightlessBlocklayout dflags ||
+ (not $ backendMaintainsCfg dflags)
+
+-- The old algorithm:
+-- It is very simple (and stupid): We make a graph out of
+-- the blocks where there is an edge from one block to another iff the
+-- first block ends by jumping to the second. Then we topologically
+-- sort this graph. Then traverse the list: for each block, we first
+-- output the block, then if it has an out edge, we move the
+-- destination of the out edge to the front of the list, and continue.
+
+-- FYI, the classic layout for basic blocks uses postorder DFS; this
+-- algorithm is implemented in Hoopl.
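+--
+-- A rough sketch (hypothetical blocks): given blocks A, B, C where A ends in
+-- a jump to C, we output A and then immediately place its jump target C,
+-- giving the order A, C, B. The now-redundant A -> C jump is then removed
+-- by dropJumps (turning it into a fallthrough), unless C has an info table.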
+
+sequenceBlocks :: Instruction inst => Maybe CFG -> LabelMap a
+ -> [GenBasicBlock inst] -> [GenBasicBlock inst]
+sequenceBlocks _edgeWeight _ [] = []
+sequenceBlocks edgeWeights infos (entry:blocks) =
+ let entryNode = mkNode edgeWeights entry
+ bodyNodes = reverse
+ (flattenSCCs (sccBlocks edgeWeights blocks))
+ in dropJumps infos . seqBlocks infos $ ( entryNode : bodyNodes)
+ -- the first block is the entry point ==> it must remain at the start.
+
+sccBlocks
+ :: Instruction instr
+ => Maybe CFG -> [NatBasicBlock instr]
+ -> [SCC (Node BlockId (NatBasicBlock instr))]
+sccBlocks edgeWeights blocks =
+ stronglyConnCompFromEdgedVerticesUniqR
+ (map (mkNode edgeWeights) blocks)
+
+mkNode :: (Instruction t)
+ => Maybe CFG -> GenBasicBlock t
+ -> Node BlockId (GenBasicBlock t)
+mkNode edgeWeights block@(BasicBlock id instrs) =
+ DigraphNode block id outEdges
+ where
+ outEdges :: [BlockId]
+ outEdges
+ --Select the heaviest successor, ignore weights <= zero
+ = successor
+ where
+ successor
+ | Just successors <- fmap (`getSuccEdgesSorted` id)
+ edgeWeights -- :: Maybe [(Label, EdgeInfo)]
+ = case successors of
+ [] -> []
+ ((target,info):_)
+ | length successors > 2 || edgeWeight info <= 0 -> []
+ | otherwise -> [target]
+ | otherwise
+ = case jumpDestsOfInstr (last instrs) of
+ [one] -> [one]
+ _many -> []
+
+
+seqBlocks :: LabelMap i -> [Node BlockId (GenBasicBlock t1)]
+ -> [GenBasicBlock t1]
+seqBlocks infos blocks = placeNext pullable0 todo0
+ where
+ -- pullable: Blocks that are not yet placed
+ -- todo: Original order of blocks, to be followed if we have no good
+ -- reason not to;
+ -- may include blocks that have already been placed, but then
+ -- these are not in pullable
+ pullable0 = listToUFM [ (i,(b,n)) | DigraphNode b i n <- blocks ]
+ todo0 = map node_key blocks
+
+ placeNext _ [] = []
+ placeNext pullable (i:rest)
+ | Just (block, pullable') <- lookupDeleteUFM pullable i
+ = place pullable' rest block
+ | otherwise
+ -- We already placed this block, so ignore
+ = placeNext pullable rest
+
+ place pullable todo (block,[])
+ = block : placeNext pullable todo
+ place pullable todo (block@(BasicBlock id instrs),[next])
+ | mapMember next infos
+ = block : placeNext pullable todo
+ | Just (nextBlock, pullable') <- lookupDeleteUFM pullable next
+ = BasicBlock id instrs : place pullable' todo nextBlock
+ | otherwise
+ = block : placeNext pullable todo
+ place _ _ (_,tooManyNextNodes)
+ = pprPanic "seqBlocks" (ppr tooManyNextNodes)
+
+
+lookupDeleteUFM :: Uniquable key => UniqFM elt -> key
+ -> Maybe (elt, UniqFM elt)
+lookupDeleteUFM m k = do -- Maybe monad
+ v <- lookupUFM m k
+ return (v, delFromUFM m k)
diff --git a/compiler/GHC/CmmToAsm/CFG.hs b/compiler/GHC/CmmToAsm/CFG.hs
new file mode 100644
index 0000000000..f52ff514b1
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CFG.hs
@@ -0,0 +1,1320 @@
+--
+-- Copyright (c) 2018 Andreas Klebinger
+--
+
+{-# LANGUAGE TypeFamilies, ScopedTypeVariables #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE Rank2Types #-}
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE DataKinds #-}
+
+module GHC.CmmToAsm.CFG
+ ( CFG, CfgEdge(..), EdgeInfo(..), EdgeWeight(..)
+ , TransitionSource(..)
+
+ --Modify the CFG
+ , addWeightEdge, addEdge
+ , delEdge, delNode
+ , addNodesBetween, shortcutWeightMap
+ , reverseEdges, filterEdges
+ , addImmediateSuccessor
+ , mkWeightInfo, adjustEdgeWeight, setEdgeWeight
+
+ --Query the CFG
+ , infoEdgeList, edgeList
+ , getSuccessorEdges, getSuccessors
+ , getSuccEdgesSorted
+ , getEdgeInfo
+ , getCfgNodes, hasNode
+
+ -- Loop Information
+ , loopMembers, loopLevels, loopInfo
+
+ --Construction/Misc
+ , getCfg, getCfgProc, pprEdgeWeights, sanityCheckCfg
+
+ --Find backedges and update their weight
+ , optimizeCFG
+ , mkGlobalWeights
+
+ )
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Cmm.BlockId
+import GHC.Cmm as Cmm
+
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Block
+import qualified GHC.Cmm.Dataflow.Graph as G
+
+import Util
+import Digraph
+import Maybes
+
+import Unique
+import qualified GHC.CmmToAsm.CFG.Dominators as Dom
+import Data.IntMap.Strict (IntMap)
+import Data.IntSet (IntSet)
+
+import qualified Data.IntMap.Strict as IM
+import qualified Data.Map as M
+import qualified Data.IntSet as IS
+import qualified Data.Set as S
+import Data.Tree
+import Data.Bifunctor
+
+import Outputable
+-- DEBUGGING ONLY
+--import GHC.Cmm.DebugBlock
+--import OrdList
+--import GHC.Cmm.DebugBlock.Trace
+import GHC.Cmm.Ppr () -- For Outputable instances
+import qualified GHC.Driver.Session as D
+
+import Data.List (sort, nub, partition)
+import Data.STRef.Strict
+import Control.Monad.ST
+
+import Data.Array.MArray
+import Data.Array.ST
+import Data.Array.IArray
+import Data.Array.Unsafe (unsafeFreeze)
+import Data.Array.Base (unsafeRead, unsafeWrite)
+
+import Control.Monad
+
+type Prob = Double
+
+type Edge = (BlockId, BlockId)
+type Edges = [Edge]
+
+newtype EdgeWeight
+ = EdgeWeight { weightToDouble :: Double }
+ deriving (Eq,Ord,Enum,Num,Real,Fractional)
+
+instance Outputable EdgeWeight where
+ ppr (EdgeWeight w) = doublePrec 5 w
+
+type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo)
+
+-- | A control flow graph where edges have been annotated with a weight.
+-- Implemented as IntMap (IntMap <edgeData>)
+-- We must uphold the invariant that for each edge A -> B we must have:
+-- An entry for B in the outer map.
+-- An entry for B in the map we get when looking up A.
+-- Maintaining this invariant is useful as any failed lookup now indicates
+-- an actual error in code which might go unnoticed for a while
+-- otherwise.
+type CFG = EdgeInfoMap EdgeInfo
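+
+-- As an illustrative sketch (block names hypothetical): a CFG with the
+-- single edge A -> B would be represented roughly as
+--   mapFromList [(A, mapFromList [(B, infoAB)]), (B, mapEmpty)]
+-- i.e. the destination B is also a key of the outer map even though it
+-- has no outgoing edges (see addEdge below, which maintains this).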
+
+data CfgEdge
+ = CfgEdge
+ { edgeFrom :: !BlockId
+ , edgeTo :: !BlockId
+ , edgeInfo :: !EdgeInfo
+ }
+
+-- | Careful! Since we assume there is at most one edge from A to B
+-- the Eq instance does not consider weight.
+instance Eq CfgEdge where
+ (==) (CfgEdge from1 to1 _) (CfgEdge from2 to2 _)
+ = from1 == from2 && to1 == to2
+
+-- | Edges are sorted ascending pointwise by weight, source and destination
+instance Ord CfgEdge where
+ compare (CfgEdge from1 to1 (EdgeInfo {edgeWeight = weight1}))
+ (CfgEdge from2 to2 (EdgeInfo {edgeWeight = weight2}))
+ | weight1 < weight2 || weight1 == weight2 && from1 < from2 ||
+ weight1 == weight2 && from1 == from2 && to1 < to2
+ = LT
+ | from1 == from2 && to1 == to2 && weight1 == weight2
+ = EQ
+ | otherwise
+ = GT
+
+instance Outputable CfgEdge where
+ ppr (CfgEdge from1 to1 edgeInfo)
+ = parens (ppr from1 <+> text "-(" <> ppr edgeInfo <> text ")->" <+> ppr to1)
+
+-- | Can we trace back an edge to a specific Cmm Node
+-- or has it been introduced during assembly codegen. We use this to maintain
+-- some information which would otherwise be lost during the
+-- Cmm <-> asm transition.
+-- See also Note [Inverting Conditional Branches]
+data TransitionSource
+ = CmmSource { trans_cmmNode :: (CmmNode O C)
+ , trans_info :: BranchInfo }
+ | AsmCodeGen
+ deriving (Eq)
+
+data BranchInfo = NoInfo -- ^ Unknown, but not heap or stack check.
+ | HeapStackCheck -- ^ Heap or stack check
+ deriving Eq
+
+instance Outputable BranchInfo where
+ ppr NoInfo = text "regular"
+ ppr HeapStackCheck = text "heap/stack"
+
+isHeapOrStackCheck :: TransitionSource -> Bool
+isHeapOrStackCheck (CmmSource { trans_info = HeapStackCheck}) = True
+isHeapOrStackCheck _ = False
+
+-- | Information about edges
+data EdgeInfo
+ = EdgeInfo
+ { transitionSource :: !TransitionSource
+ , edgeWeight :: !EdgeWeight
+ } deriving (Eq)
+
+instance Outputable EdgeInfo where
+ ppr edgeInfo = text "weight:" <+> ppr (edgeWeight edgeInfo)
+
+-- | Convenience function, generate edge info based
+-- on weight not originating from cmm.
+mkWeightInfo :: EdgeWeight -> EdgeInfo
+mkWeightInfo = EdgeInfo AsmCodeGen
+
+-- | Adjust the weight between the blocks using the given function.
+-- If there is no such edge returns the original map.
+adjustEdgeWeight :: CFG -> (EdgeWeight -> EdgeWeight)
+ -> BlockId -> BlockId -> CFG
+adjustEdgeWeight cfg f from to
+ | Just info <- getEdgeInfo from to cfg
+ , !weight <- edgeWeight info
+ , !newWeight <- f weight
+ = addEdge from to (info { edgeWeight = newWeight}) cfg
+ | otherwise = cfg
+
+-- | Set the weight between the blocks to the given weight.
+-- If there is no such edge returns the original map.
+setEdgeWeight :: CFG -> EdgeWeight
+ -> BlockId -> BlockId -> CFG
+setEdgeWeight cfg !weight from to
+ | Just info <- getEdgeInfo from to cfg
+ = addEdge from to (info { edgeWeight = weight}) cfg
+ | otherwise = cfg
+
+
+getCfgNodes :: CFG -> [BlockId]
+getCfgNodes m =
+ mapKeys m
+
+-- | Is this block part of this graph?
+hasNode :: CFG -> BlockId -> Bool
+hasNode m node =
+ -- Check the invariant that each node must exist in the first map or not at all.
+ ASSERT( found || not (any (mapMember node) m))
+ found
+ where
+ found = mapMember node m
+
+
+
+-- | Check if the nodes in the cfg and the set of blocks are the same.
+-- In case of a mismatch we panic and show the difference.
+sanityCheckCfg :: CFG -> LabelSet -> SDoc -> Bool
+sanityCheckCfg m blockSet msg
+ | blockSet == cfgNodes
+ = True
+ | otherwise =
+ pprPanic "Block list and cfg nodes don't match" (
+ text "difference:" <+> ppr diff $$
+ text "blocks:" <+> ppr blockSet $$
+ text "cfg:" <+> pprEdgeWeights m $$
+ msg )
+ False
+ where
+ cfgNodes = setFromList $ getCfgNodes m :: LabelSet
+ diff = (setUnion cfgNodes blockSet) `setDifference` (setIntersection cfgNodes blockSet) :: LabelSet
+
+-- | Filter the CFG with a custom function f.
+-- Parameters are `f from to edgeInfo`
+filterEdges :: (BlockId -> BlockId -> EdgeInfo -> Bool) -> CFG -> CFG
+filterEdges f cfg =
+ mapMapWithKey filterSources cfg
+ where
+ filterSources from m =
+ mapFilterWithKey (\to w -> f from to w) m
+
+
+{- Note [Updating the CFG during shortcutting]
+
+See Note [What is shortcutting] in the control flow optimization
+code (GHC.Cmm.ContFlowOpt) for a slightly more in depth explanation on shortcutting.
+
+In the native backend we shortcut jumps at the assembly level. (AsmCodeGen.hs)
+This means we remove blocks containing only one jump from the code
+and instead redirect all jumps targeting this block to the deleted
+block's jump target.
+
+However we want to have an accurate representation of control
+flow in the CFG. So we add/remove edges accordingly to account
+for the eliminated blocks and new edges.
+
+If we shortcut A -> B -> C to A -> C:
+* We delete edges A -> B and B -> C
+* Replacing them with the edge A -> C
+
+We also try to preserve jump weights while doing so.
+
+Note that:
+* The edge B -> C can't have interesting weights since
+ the block B consists of a single unconditional jump without branching.
+* We delete the edge A -> B and add the edge A -> C.
+* The edge A -> B can be one of many edges originating from A so likely
+ has edge weights we want to preserve.
+
+For this reason we simply store the edge info from the original A -> B
+edge and apply this information to the new edge A -> C.
+
+Sometimes we have a scenario where jump target C is not represented by a
+BlockId but an immediate value. I'm only aware of this happening without
+tables next to code currently.
+
+Then we go from A ---> B - -> IMM to A - -> IMM where the dashed arrows
+are not stored in the CFG.
+
+In that case we simply delete the edge A -> B.
+
+In terms of implementation the native backend first builds a mapping
+from blocks suitable for shortcutting to their jump targets.
+Then it redirects all jump instructions to these blocks using the
+built up mapping.
+This function (shortcutWeightMap) takes the same mapping and
+applies the mapping to the CFG in the way laid out above.
+
+-}
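+
+-- A minimal usage sketch (block names hypothetical): for blocks A -> B -> C,
+-- where B consists of a single jump to C, the backend builds the mapping
+--   mapFromList [(B, Just C)]
+-- and shortcutWeightMap applied to it removes B from the CFG and turns the
+-- old A -> B edge (keeping its EdgeInfo) into an A -> C edge.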
+shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG
+shortcutWeightMap cuts cfg =
+ foldl' applyMapping cfg $ mapToList cuts
+ where
+-- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting]
+ applyMapping :: CFG -> (BlockId,Maybe BlockId) -> CFG
+ --Shortcut immediate
+ applyMapping m (from, Nothing) =
+ mapDelete from .
+ fmap (mapDelete from) $ m
+ --Regular shortcut
+ applyMapping m (from, Just to) =
+ let updatedMap :: CFG
+ updatedMap
+ = fmap (shortcutEdge (from,to)) $
+ (mapDelete from m :: CFG )
+ --Sometimes we can shortcut multiple blocks like so:
+ -- A -> B -> C -> D -> E => A -> E
+ -- so we check for such chains.
+ in case mapLookup to cuts of
+ Nothing -> updatedMap
+ Just dest -> applyMapping updatedMap (to, dest)
+ --Redirect edge from B to C
+ shortcutEdge :: (BlockId, BlockId) -> LabelMap EdgeInfo -> LabelMap EdgeInfo
+ shortcutEdge (from, to) m =
+ case mapLookup from m of
+ Just info -> mapInsert to info $ mapDelete from m
+ Nothing -> m
+
+-- | Sometimes we insert a block which should unconditionally be executed
+-- after a given block. This function updates the CFG for these cases.
+-- So we get A -> B => A -> A' -> B
+-- \ \
+-- -> C => -> C
+--
+addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG
+addImmediateSuccessor node follower cfg
+ = updateEdges . addWeightEdge node follower uncondWeight $ cfg
+ where
+ uncondWeight = fromIntegral . D.uncondWeight .
+ D.cfgWeightInfo $ D.unsafeGlobalDynFlags
+ targets = getSuccessorEdges cfg node
+ successors = map fst targets :: [BlockId]
+ updateEdges = addNewSuccs . remOldSuccs
+ remOldSuccs m = foldl' (flip (delEdge node)) m successors
+ addNewSuccs m =
+ foldl' (\m' (t,info) -> addEdge follower t info m') m targets
+
+-- | Adds a new edge, overwrites existing edges if present
+addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG
+addEdge from to info cfg =
+ mapAlter addFromToEdge from $
+ mapAlter addDestNode to cfg
+ where
+ -- Simply insert the edge into the edge list.
+ addFromToEdge Nothing = Just $ mapSingleton to info
+ addFromToEdge (Just wm) = Just $ mapInsert to info wm
+ -- We must add the destination node explicitly
+ addDestNode Nothing = Just $ mapEmpty
+ addDestNode n@(Just _) = n
+
+
+-- | Adds an edge with the given weight to the cfg
+-- If there already existed an edge it is overwritten.
+-- `addWeightEdge from to weight cfg`
+addWeightEdge :: BlockId -> BlockId -> EdgeWeight -> CFG -> CFG
+addWeightEdge from to weight cfg =
+ addEdge from to (mkWeightInfo weight) cfg
+
+delEdge :: BlockId -> BlockId -> CFG -> CFG
+delEdge from to m =
+ mapAlter remDest from m
+ where
+ remDest Nothing = Nothing
+ remDest (Just wm) = Just $ mapDelete to wm
+
+delNode :: BlockId -> CFG -> CFG
+delNode node cfg =
+ fmap (mapDelete node) -- < Edges to the node
+ (mapDelete node cfg) -- < Edges from the node
+
+-- | Destinations from bid ordered by weight (descending)
+getSuccEdgesSorted :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccEdgesSorted m bid =
+ let destMap = mapFindWithDefault mapEmpty bid m
+ cfgEdges = mapToList destMap
+ sortedEdges = sortWith (negate . edgeWeight . snd) cfgEdges
+ in --pprTrace "getSuccEdgesSorted" (ppr bid <+> text "map:" <+> ppr m)
+ sortedEdges
+
+-- | Get successors of a given node with edge weights.
+getSuccessorEdges :: HasDebugCallStack => CFG -> BlockId -> [(BlockId,EdgeInfo)]
+getSuccessorEdges m bid = maybe lookupError mapToList (mapLookup bid m)
+ where
+ lookupError = pprPanic "getSuccessorEdges: Block does not exist" $
+ ppr bid <+> pprEdgeWeights m
+
+getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo
+getEdgeInfo from to m
+ | Just wm <- mapLookup from m
+ , Just info <- mapLookup to wm
+ = Just $! info
+ | otherwise
+ = Nothing
+
+getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight
+getEdgeWeight cfg from to =
+ edgeWeight $ expectJust "Edge weight for non-existing block" $
+ getEdgeInfo from to cfg
+
+getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource
+getTransitionSource from to cfg = transitionSource $ expectJust "Source info for non-existing block" $
+ getEdgeInfo from to cfg
+
+reverseEdges :: CFG -> CFG
+reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
+ where
+ -- We must preserve nodes without outgoing edges!
+ addNode :: CFG -> BlockId -> CFG
+ addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg
+ go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG
+ go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap :: CFG
+
+
+-- | Returns an unordered list of all edges with info
+infoEdgeList :: CFG -> [CfgEdge]
+infoEdgeList m =
+ go (mapToList m) []
+ where
+ -- We avoid foldMap to avoid thunk buildup
+ go :: [(BlockId,LabelMap EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
+ go [] acc = acc
+ go ((from,toMap):xs) acc
+ = go' xs from (mapToList toMap) acc
+ go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [(BlockId,EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
+ go' froms _ [] acc = go froms acc
+ go' froms from ((to,info):tos) acc
+ = go' froms from tos (CfgEdge from to info : acc)
+
+-- | Returns an unordered list of all edges without weights
+edgeList :: CFG -> [Edge]
+edgeList m =
+ go (mapToList m) []
+ where
+ -- We avoid foldMap to avoid thunk buildup
+ go :: [(BlockId,LabelMap EdgeInfo)] -> [Edge] -> [Edge]
+ go [] acc = acc
+ go ((from,toMap):xs) acc
+ = go' xs from (mapKeys toMap) acc
+ go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [BlockId] -> [Edge] -> [Edge]
+ go' froms _ [] acc = go froms acc
+ go' froms from (to:tos) acc
+ = go' froms from tos ((from,to) : acc)
+
+-- | Get successors of a given node without edge weights.
+getSuccessors :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
+getSuccessors m bid
+ | Just wm <- mapLookup bid m
+ = mapKeys wm
+ | otherwise = lookupError
+ where
+ lookupError = pprPanic "getSuccessors: Block does not exist" $
+ ppr bid <+> pprEdgeWeights m
+
+pprEdgeWeights :: CFG -> SDoc
+pprEdgeWeights m =
+ let edges = sort $ infoEdgeList m :: [CfgEdge]
+ printEdge (CfgEdge from to (EdgeInfo { edgeWeight = weight }))
+ = text "\t" <> ppr from <+> text "->" <+> ppr to <>
+ text "[label=\"" <> ppr weight <> text "\",weight=\"" <>
+ ppr weight <> text "\"];\n"
+ --for the case that there are no edges from/to this node.
+ --This should rarely happen but it can save a lot of time
+ --to immediately see it when it does.
+ printNode node
+ = text "\t" <> ppr node <> text ";\n"
+ getEdgeNodes (CfgEdge from to _) = [from,to]
+ edgeNodes = setFromList $ concatMap getEdgeNodes edges :: LabelSet
+ nodes = filter (\n -> (not . setMember n) edgeNodes) . mapKeys $ mapFilter null m
+ in
+ text "digraph {\n" <>
+ (foldl' (<>) empty (map printEdge edges)) <>
+ (foldl' (<>) empty (map printNode nodes)) <>
+ text "}\n"
+
+{-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible
+-- | Invariant: The edge **must** exist already in the graph.
+updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG
+updateEdgeWeight f (from, to) cfg
+ | Just oldInfo <- getEdgeInfo from to cfg
+ = let !oldWeight = edgeWeight oldInfo
+ !newWeight = f oldWeight
+ in addEdge from to (oldInfo {edgeWeight = newWeight}) cfg
+ | otherwise
+ = panic "Trying to update invalid edge"
+
+-- from to oldWeight => newWeight
+mapWeights :: (BlockId -> BlockId -> EdgeWeight -> EdgeWeight) -> CFG -> CFG
+mapWeights f cfg =
+ foldl' (\cfg (CfgEdge from to info) ->
+ let oldWeight = edgeWeight info
+ newWeight = f from to oldWeight
+ in addEdge from to (info {edgeWeight = newWeight}) cfg)
+ cfg (infoEdgeList cfg)
+
+
+-- | Insert a block in the control flow between two other blocks.
+-- We pass a list of tuples (A,B,C) where
+-- * A -> C: Old edge
+-- * A -> B -> C : New Arc, where B is the new block.
+-- It's possible that a block has two jumps to the same block
+-- in the assembly code. However we still only store a single edge for
+-- these cases.
+-- We assign the old edge info to the edge A -> B and assign B -> C the
+-- weight of an unconditional jump.
+addNodesBetween :: CFG -> [(BlockId,BlockId,BlockId)] -> CFG
+addNodesBetween m updates =
+ foldl' updateWeight m .
+ weightUpdates $ updates
+ where
+ weight = fromIntegral . D.uncondWeight .
+ D.cfgWeightInfo $ D.unsafeGlobalDynFlags
+ -- We might add two blocks for different jumps along a single
+ -- edge. So we end up with edges: A -> B -> C , A -> D -> C
+ -- in this case after applying the first update the weight for A -> C
+ -- is no longer available. So we calculate future weights before updates.
+ weightUpdates = map getWeight
+ getWeight :: (BlockId,BlockId,BlockId) -> (BlockId,BlockId,BlockId,EdgeInfo)
+ getWeight (from,between,old)
+ | Just edgeInfo <- getEdgeInfo from old m
+ = (from,between,old,edgeInfo)
+ | otherwise
+ = pprPanic "Can't find weight for edge that should have one" (
+ text "triple" <+> ppr (from,between,old) $$
+ text "updates" <+> ppr updates $$
+ text "cfg:" <+> pprEdgeWeights m )
+ updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG
+ updateWeight m (from,between,old,edgeInfo)
+ = addEdge from between edgeInfo .
+ addWeightEdge between old weight .
+ delEdge from old $ m
+
+{-
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~ Note [CFG Edge Weights] ~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Edge weights assigned do not currently represent a specific
+ cost model. Rather they are just a ranking of which blocks should
+ be placed next to each other given their connection type in
+ the CFG.
+ This is especially relevant whenever two blocks will
+ jump to the same target.
+
+ A B
+ \ /
+ C
+
+ Should A or B be placed in front of C? The block layout algorithm
+ decides this based on which edge (A,C)/(B,C) is heavier. So we
+ make an educated guess on which branch should be preferred.
+
+ We rank edges in this order:
+ * Unconditional Control Transfer - They will always
+ transfer control to their target. Unless there is a info table
+ we can turn the jump into a fallthrough as well.
+ We use 20k as default, so it's easy to spot if values have been
+ modified but unlikely that we run into issues with overflow.
+ * If branches (likely) - We assume branches marked as likely
+ are taken more than 80% of the time.
+ By ranking them below unconditional jumps we make sure we
+ prefer the unconditional if there is a conditional and
+ unconditional edge towards a block.
+ * If branches (regular) - The false branch can potentially be turned
+ into a fallthrough so we prefer it slightly over the true branch.
+ * Unlikely branches - These can be assumed to be taken less than 20%
+ of the time. So we give them one of the lowest priorities.
+ * Switches - Switches at this level are implemented as jump tables
+ so have a larger number of successors. So without more information
+ we can only say that each individual successor is unlikely to be
+ jumped to and we rank them accordingly.
+ * Calls - We currently ignore calls completely:
+ * By the time we return from a call there is a good chance
+ that the address we return to has already been evicted from
+ cache eliminating a main advantage sequential placement brings.
+ * Calls always require an info table in front of their return
+ address. This reduces the chance that we return to the same
+ cache line further.
+
+-}
+-- | Generate weights for a Cmm proc based on some simple heuristics.
+getCfgProc :: D.CfgWeights -> RawCmmDecl -> CFG
+getCfgProc _ (CmmData {}) = mapEmpty
+getCfgProc weights (CmmProc _info _lab _live graph) = getCfg weights graph
+
+getCfg :: D.CfgWeights -> CmmGraph -> CFG
+getCfg weights graph =
+ foldl' insertEdge edgelessCfg $ concatMap getBlockEdges blocks
+ where
+ D.CFGWeights
+ { D.uncondWeight = uncondWeight
+ , D.condBranchWeight = condBranchWeight
+ , D.switchWeight = switchWeight
+ , D.callWeight = callWeight
+ , D.likelyCondWeight = likelyCondWeight
+ , D.unlikelyCondWeight = unlikelyCondWeight
+ -- Last two are used in other places
+ --, D.infoTablePenalty = infoTablePenalty
+ --, D.backEdgeBonus = backEdgeBonus
+ } = weights
+ -- Explicitly add all nodes to the map to ensure that even blocks
+ -- without outgoing edges are part of the CFG.
+ edgelessCfg = mapFromList $ zip (map G.entryLabel blocks) (repeat mapEmpty)
+ insertEdge :: CFG -> ((BlockId,BlockId),EdgeInfo) -> CFG
+ insertEdge m ((from,to),weight) =
+ mapAlter f from m
+ where
+ f :: Maybe (LabelMap EdgeInfo) -> Maybe (LabelMap EdgeInfo)
+ f Nothing = Just $ mapSingleton to weight
+ f (Just destMap) = Just $ mapInsert to weight destMap
+ getBlockEdges :: CmmBlock -> [((BlockId,BlockId),EdgeInfo)]
+ getBlockEdges block =
+ case branch of
+ CmmBranch dest -> [mkEdge dest uncondWeight]
+ CmmCondBranch cond t f l
+ | l == Nothing ->
+ [mkEdge f condBranchWeight, mkEdge t condBranchWeight]
+ | l == Just True ->
+ [mkEdge f unlikelyCondWeight, mkEdge t likelyCondWeight]
+ | l == Just False ->
+ [mkEdge f likelyCondWeight, mkEdge t unlikelyCondWeight]
+ where
+ mkEdgeInfo = -- pprTrace "Info" (ppr branchInfo <+> ppr cond)
+ EdgeInfo (CmmSource branch branchInfo) . fromIntegral
+ mkEdge target weight = ((bid,target), mkEdgeInfo weight)
+ branchInfo =
+ foldRegsUsed
+ (panic "foldRegsDynFlags")
+ (\info r -> if r == SpLim || r == HpLim || r == BaseReg
+ then HeapStackCheck else info)
+ NoInfo cond
+
+ (CmmSwitch _e ids) ->
+ let switchTargets = switchTargetsToList ids
+ --Compiler performance hack - for very wide switches don't
+ --consider targets for layout.
+ adjustedWeight =
+ if (length switchTargets > 10) then -1 else switchWeight
+ in map (\x -> mkEdge x adjustedWeight) switchTargets
+ (CmmCall { cml_cont = Just cont}) -> [mkEdge cont callWeight]
+ (CmmForeignCall {Cmm.succ = cont}) -> [mkEdge cont callWeight]
+ (CmmCall { cml_cont = Nothing }) -> []
+ other ->
+ panic "Foo" $
+ ASSERT2(False, ppr "Unknown successor cause:" <>
+ (ppr branch <+> text "=>" <> ppr (G.successors other)))
+ map (\x -> ((bid,x),mkEdgeInfo 0)) $ G.successors other
+ where
+ bid = G.entryLabel block
+ mkEdgeInfo = EdgeInfo (CmmSource branch NoInfo) . fromIntegral
+ mkEdge target weight = ((bid,target), mkEdgeInfo weight)
+ branch = lastNode block :: CmmNode O C
+
+ blocks = revPostorder graph :: [CmmBlock]
+
+--Find back edges by BFS
+findBackEdges :: HasDebugCallStack => BlockId -> CFG -> Edges
+findBackEdges root cfg =
+ --pprTraceIt "Backedges:" $
+ map fst .
+ filter (\x -> snd x == Backward) $ typedEdges
+ where
+ edges = edgeList cfg :: [(BlockId,BlockId)]
+ getSuccs = getSuccessors cfg :: BlockId -> [BlockId]
+ typedEdges =
+ classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)]
+
+
+optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
+optimizeCFG _ (CmmData {}) cfg = cfg
+optimizeCFG weights (CmmProc info _lab _live graph) cfg =
+ {-# SCC optimizeCFG #-}
+ -- pprTrace "Initial:" (pprEdgeWeights cfg) $
+ -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $
+
+ -- pprTrace "LoopInfo:" (ppr $ loopInfo cfg (g_entry graph)) $
+ favourFewerPreds .
+ penalizeInfoTables info .
+ increaseBackEdgeWeight (g_entry graph) $ cfg
+ where
+
+ -- | Increase the weight of all backedges in the CFG
+ -- this helps to make loop jumpbacks the heaviest edges
+ increaseBackEdgeWeight :: BlockId -> CFG -> CFG
+ increaseBackEdgeWeight root cfg =
+ let backedges = findBackEdges root cfg
+ update weight
+ --Keep irrelevant edges irrelevant
+ | weight <= 0 = 0
+ | otherwise
+ = weight + fromIntegral (D.backEdgeBonus weights)
+ in foldl' (\cfg edge -> updateEdgeWeight update edge cfg)
+ cfg backedges
+
+ -- | Since we can't fall through to info tables we penalize these.
+ penalizeInfoTables :: LabelMap a -> CFG -> CFG
+ penalizeInfoTables info cfg =
+ mapWeights fupdate cfg
+ where
+ fupdate :: BlockId -> BlockId -> EdgeWeight -> EdgeWeight
+ fupdate _ to weight
+ | mapMember to info
+ = weight - (fromIntegral $ D.infoTablePenalty weights)
+ | otherwise = weight
+
+ -- | If a block has two successors, favour the one with fewer
+ -- predecessors and/or the one allowing fall through.
+ favourFewerPreds :: CFG -> CFG
+ favourFewerPreds cfg =
+ let
+ revCfg =
+ reverseEdges $ filterEdges
+ (\_from -> fallthroughTarget) cfg
+
+ predCount n = length $ getSuccessorEdges revCfg n
+ nodes = getCfgNodes cfg
+
+ modifiers :: Int -> Int -> (EdgeWeight, EdgeWeight)
+ modifiers preds1 preds2
+ | preds1 < preds2 = ( 1,-1)
+ | preds1 == preds2 = ( 0, 0)
+ | otherwise = (-1, 1)
+
+ update :: CFG -> BlockId -> CFG
+ update cfg node
+ | [(s1,e1),(s2,e2)] <- getSuccessorEdges cfg node
+ , !w1 <- edgeWeight e1
+ , !w2 <- edgeWeight e2
+ --Only change the weights if there isn't already an ordering.
+ , w1 == w2
+ , (mod1,mod2) <- modifiers (predCount s1) (predCount s2)
+ = (\cfg' ->
+ (adjustEdgeWeight cfg' (+mod2) node s2))
+ (adjustEdgeWeight cfg (+mod1) node s1)
+ | otherwise
+ = cfg
+ in foldl' update cfg nodes
+ where
+ fallthroughTarget :: BlockId -> EdgeInfo -> Bool
+ fallthroughTarget to (EdgeInfo source _weight)
+ | mapMember to info = False
+ | AsmCodeGen <- source = True
+ | CmmSource { trans_cmmNode = CmmBranch {} } <- source = True
+ | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True
+ | otherwise = False
+
+-- | Determine loop membership of blocks based on SCC analysis
+-- This is faster but only gives yes/no answers.
+loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool
+loopMembers cfg =
+ foldl' (flip setLevel) mapEmpty sccs
+ where
+ mkNode :: BlockId -> Node BlockId BlockId
+ mkNode bid = DigraphNode bid bid (getSuccessors cfg bid)
+ nodes = map mkNode (getCfgNodes cfg)
+
+ sccs = stronglyConnCompFromEdgedVerticesOrd nodes
+
+ setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool
+ setLevel (AcyclicSCC bid) m = mapInsert bid False m
+ setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids
+
+loopLevels :: CFG -> BlockId -> LabelMap Int
+loopLevels cfg root = liLevels loopInfos
+ where
+ loopInfos = loopInfo cfg root
+
+data LoopInfo = LoopInfo
+ { liBackEdges :: [(Edge)] -- ^ List of back edges
+ , liLevels :: LabelMap Int -- ^ BlockId -> LoopLevel mapping
+ , liLoops :: [(Edge, LabelSet)] -- ^ (backEdge, loopBody), body includes header
+ }
+
+instance Outputable LoopInfo where
+ ppr (LoopInfo _ _lvls loops) =
+ text "Loops:(backEdge, bodyNodes)" $$
+ (vcat $ map ppr loops)
+
+{- Note [Determining the loop body]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Starting with the knowledge that:
+ * head dominates the loop
+ * `tail` -> `head` is a backedge
+
+ We can determine all nodes by:
+ * Deleting the loop head from the graph.
+ * Collecting all blocks which are reachable from the `tail`.
+
+ We do so by performing a BFS from the tail node towards the head.
+ -}
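+
+-- A small worked sketch (hypothetical blocks): for a loop 1 -> 2 -> 3 -> 1
+-- with back edge (3,1) we delete the head 1 and walk the reversed graph
+-- from the tail 3, reaching {3,2}; adding the head back gives the loop
+-- body {1,2,3}.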
+
+-- | Determine loop membership of blocks based on Dominator analysis.
+-- This is slower but gives loop levels instead of just loop membership.
+-- However it only detects natural loops. Irreducible control flow is not
+-- recognized even if it loops. But that is rare enough that we don't have
+-- to care about that special case.
+loopInfo :: HasDebugCallStack => CFG -> BlockId -> LoopInfo
+loopInfo cfg root = LoopInfo { liBackEdges = backEdges
+ , liLevels = mapFromList loopCounts
+ , liLoops = loopBodies }
+ where
+ revCfg = reverseEdges cfg
+
+ graph = -- pprTrace "CFG - loopInfo" (pprEdgeWeights cfg) $
+ fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet
+
+
+ --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ...
+ rooted = ( fromBlockId root
+ , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet)
+ tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId
+
+ -- Map from Nodes to their dominators
+ domMap :: LabelMap LabelSet
+ domMap = mkDomMap tree
+
+ edges = edgeList cfg :: [(BlockId, BlockId)]
+ -- We can't recompute nodes from edges, there might be blocks not connected via edges.
+ nodes = getCfgNodes cfg :: [BlockId]
+
+ -- identify back edges
+ isBackEdge (from,to)
+ | Just doms <- mapLookup from domMap
+ , setMember to doms
+ = True
+ | otherwise = False
+
+ -- See Note [Determining the loop body]
+ -- Get the loop body associated with a back edge.
+ findBody edge@(tail, head)
+ = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) )
+ where
+ -- See Note [Determining the loop body]
+ cfg' = delNode head revCfg
+
+ go :: LabelSet -> LabelSet -> LabelSet
+ go found current
+ | setNull current = found
+ | otherwise = go (setUnion newSuccessors found)
+ newSuccessors
+ where
+ -- Really predecessors, since we use the reversed cfg.
+ newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet
+ successors = setFromList $ concatMap
+ (getSuccessors cfg')
+ -- we filter head as it's no longer part of the cfg.
+ (filter (/= head) $ setElems current) :: LabelSet
+
+ backEdges = filter isBackEdge edges
+ loopBodies = map findBody backEdges :: [(Edge, LabelSet)]
+
+ -- Block b is part of n loop bodies => loop nest level of n
+ loopCounts =
+ let bodies = map (first snd) loopBodies -- [(Header, Body)]
+ loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies
+ in map (\n -> (n, loopCount n)) $ nodes :: [(BlockId, Int)]
+
+ toIntSet :: LabelSet -> IntSet
+ toIntSet s = IS.fromList . map fromBlockId . setElems $ s
+ toIntMap :: LabelMap a -> IntMap a
+ toIntMap m = IM.fromList $ map (\(x,y) -> (fromBlockId x,y)) $ mapToList m
+
+ mkDomMap :: Tree BlockId -> LabelMap LabelSet
+ mkDomMap root = mapFromList $ go setEmpty root
+ where
+ go :: LabelSet -> Tree BlockId -> [(Label,LabelSet)]
+ go parents (Node lbl [])
+ = [(lbl, parents)]
+ go parents (Node _ leaves)
+ = let nodes = map rootLabel leaves
+ entries = map (\x -> (x,parents)) nodes
+ in entries ++ concatMap
+ (\n -> go (setInsert (rootLabel n) parents) n)
+ leaves
+
+ fromBlockId :: BlockId -> Int
+ fromBlockId = getKey . getUnique
+
+ toBlockId :: Int -> BlockId
+ toBlockId = mkBlockId . mkUniqueGrimily
+
+-- We make the CFG a Hoopl Graph, so we can reuse revPostOrder.
+newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId])
+
+instance G.NonLocal (BlockNode) where
+ entryLabel (BN (lbl,_)) = lbl
+ successors (BN (_,succs)) = succs
+
+revPostorderFrom :: HasDebugCallStack => CFG -> BlockId -> [BlockId]
+revPostorderFrom cfg root =
+ map fromNode $ G.revPostorderFrom hooplGraph root
+ where
+ nodes = getCfgNodes cfg
+ hooplGraph = foldl' (\m n -> mapInsert n (toNode n) m) mapEmpty nodes
+
+ fromNode :: BlockNode C C -> BlockId
+ fromNode (BN x) = fst x
+
+ toNode :: BlockId -> BlockNode C C
+ toNode bid =
+ BN (bid,getSuccessors cfg $ bid)
+
+
+-- | We take in a CFG which has on its edges weights which are
+-- relative only to other edges originating from the same node.
+--
+-- We return a CFG for which each edge represents a GLOBAL weight.
+-- This means edge weights are comparable across the whole graph.
+--
+-- For irreducible control flow results might be imprecise, otherwise they
+-- are reliable.
+--
+-- The algorithm is based on the Paper
+-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
+-- The only big change is that we go over the nodes in the body of loops in
+-- reverse post order, which is required for diamond control flow to work properly.
+--
+-- We also apply a few prediction heuristics (based on the same paper)
+
+{-# NOINLINE mkGlobalWeights #-}
+{-# SCC mkGlobalWeights #-}
+mkGlobalWeights :: HasDebugCallStack => BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double))
+mkGlobalWeights root localCfg
+ | null localCfg = panic "Error - Empty CFG"
+ | otherwise
+ = (blockFreqs', edgeFreqs')
+ where
+ -- Calculate fixpoints
+ (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder'
+ blockFreqs' = mapFromList $ map (first fromVertex) (assocs blockFreqs) :: LabelMap Double
+ edgeFreqs' = fmap fromVertexMap $ fromVertexMap edgeFreqs
+
+ fromVertexMap :: IM.IntMap x -> LabelMap x
+ fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m
+
+ revOrder = revPostorderFrom localCfg root :: [BlockId]
+ loopResults@(LoopInfo backedges _levels bodies) = loopInfo localCfg root
+
+ revOrder' = map toVertex revOrder
+ backEdges' = map (bimap toVertex toVertex) backedges
+ bodies' = map calcBody bodies
+
+ estimatedCfg = staticBranchPrediction root loopResults localCfg
+ -- Normalize the weights to probabilities and apply heuristics
+ nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex
+
+ -- By mapping vertices to numbers in reverse post order we can bring any subset into reverse post
+ -- order simply by sorting.
+ -- TODO: The sort is redundant if we can guarantee that setElems returns elements ascending
+ calcBody (backedge, blocks) =
+ (toVertex $ snd backedge, sort . map toVertex $ (setElems blocks))
+
+ vertexMapping = mapFromList $ zip revOrder [0..] :: LabelMap Int
+ blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId
+ -- Map from blockId to indices starting at zero
+ toVertex :: BlockId -> Int
+ toVertex blockId = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping
+ -- Map from indices starting at zero to blockIds
+ fromVertex :: Int -> BlockId
+ fromVertex vertex = blockMapping ! vertex
+
+{- Note [Static Branch Prediction]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The work here has been based on the paper
+"Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus.
+
+The primary difference is that if we branch on the result of a heap
+check we do not apply any of the heuristics.
+The reason is simple: such branches look like loops in the control flow graph
+but are usually never entered, and if they are, at most once.
+
+Currently implemented is a heuristic to predict that we do not exit
+loops (lehPredicts) and one to predict that backedges are more likely
+than any other edge.
+
+The back edge case is special as it supersedes any other heuristic if it
+applies.
+
+Do NOT rely solely on nofib results for benchmarking this. I recommend at least
+comparing megaparsec and container benchmarks. Nofib does not seem to have
+many instances of "loopy" Cmm where these make a difference.
+
+TODO:
+* The paper contains more heuristics which should be implemented.
+* If we turn the likelihood on if/else branches into a probability
+ instead of true/false we could implement this as a Cmm pass.
+ + The complete Cmm code still exists and can be accessed by the heuristics.
+ + There is no chance of register allocation/codegen inserting branches/blocks
+   and thereby making the TransitionSource info wrong.
+ + Potential to use this information in CmmPasses.
+ - Requires refactoring of all the code relying on the binary nature of likelihood.
+ - Requires refactoring `loopInfo` to work on both Cmm graphs and the backend CFG.
+-}
+
+-- | Combination of target node id and information about the branch
+-- we are looking at.
+type TargetNodeInfo = (BlockId, EdgeInfo)
+
+
+-- | Update branch weights based on certain heuristics.
+-- See Note [Static Branch Prediction]
+-- TODO: This should be combined with optimizeCFG
+{-# SCC staticBranchPrediction #-}
+staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG
+staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg =
+ -- pprTrace "staticEstimatesOn" (ppr (cfg)) $
+ foldl' update cfg nodes
+ where
+ nodes = getCfgNodes cfg
+ backedges = S.fromList $ l_backEdges
+ -- Loops keyed by their back edge
+ loops = M.fromList $ l_loops :: M.Map Edge LabelSet
+ loopHeads = S.fromList $ map snd $ M.keys loops
+
+ update :: CFG -> BlockId -> CFG
+ update cfg node
+ -- No successors, nothing to do.
+ | null successors = cfg
+
+ -- Mix of backedges and others:
+ -- Always predict the backedges.
+ | not (null m) && length m < length successors
+ -- Heap/Stack checks "loop", but only once.
+ -- So we simply exclude any case involving them.
+ , not $ any (isHeapOrStackCheck . transitionSource . snd) successors
+ = let loopChance = repeat $! pred_LBH / (fromIntegral $ length m)
+ exitChance = repeat $! (1 - pred_LBH) / fromIntegral (length not_m)
+ updates = zip (map fst m) loopChance ++ zip (map fst not_m) exitChance
+ in -- pprTrace "mix" (ppr (node,successors)) $
+ foldl' (\cfg (to,weight) -> setEdgeWeight cfg weight node to) cfg updates
+
+ -- For (regular) non-binary branches we keep the weights from the STG -> Cmm translation.
+ | length successors /= 2
+ = cfg
+
+ -- Only backedges - no need to adjust
+ | length m > 0
+ = cfg
+
+ -- A regular binary branch; we can plug additional predictors in here.
+ | [(s1,s1_info),(s2,s2_info)] <- successors
+ , not $ any (isHeapOrStackCheck . transitionSource . snd) successors
+ = -- Normalize weights to total of 1
+ let !w1 = max (edgeWeight s1_info) (0)
+ !w2 = max (edgeWeight s2_info) (0)
+ -- If both weights are <= 0 we set both to 0.5
+ normalizeWeight w = if w1 + w2 == 0 then 0.5 else w/(w1+w2)
+ !cfg' = setEdgeWeight cfg (normalizeWeight w1) node s1
+ !cfg'' = setEdgeWeight cfg' (normalizeWeight w2) node s2
+
+ -- Figure out which heuristics apply to these successors
+ heuristics = map ($ ((s1,s1_info),(s2,s2_info)))
+ [lehPredicts, phPredicts, ohPredicts, ghPredicts, lhhPredicts, chPredicts
+ , shPredicts, rhPredicts]
+ -- Apply result of a heuristic. Argument is the likelihood
+ -- predicted for s1.
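+ -- As a worked example (illustrative numbers only): with current
+ -- probabilities s1_old = 0.5 and s2_old = 0.5 and a heuristic predicting
+ -- 0.75 for s1, d = 0.5*0.75 + 0.5*0.25 = 0.5, so s1 is updated to
+ -- 0.375/0.5 = 0.75 and s2 to 0.125/0.5 = 0.25.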
+ applyHeuristic :: CFG -> Maybe Prob -> CFG
+ applyHeuristic cfg Nothing = cfg
+ applyHeuristic cfg (Just (s1_pred :: Double))
+ | s1_old == 0 || s2_old == 0 ||
+ isHeapOrStackCheck (transitionSource s1_info) ||
+ isHeapOrStackCheck (transitionSource s2_info)
+ = cfg
+ | otherwise =
+ let -- Predictions from heuristic
+ s1_prob = EdgeWeight s1_pred :: EdgeWeight
+ s2_prob = 1.0 - s1_prob
+ -- Update
+ d = (s1_old * s1_prob) + (s2_old * s2_prob) :: EdgeWeight
+ s1_prob' = s1_old * s1_prob / d
+ !s2_prob' = s2_old * s2_prob / d
+ !cfg_s1 = setEdgeWeight cfg s1_prob' node s1
+ in -- pprTrace "Applying heuristic!" (ppr (node,s1,s2) $$ ppr (s1_prob', s2_prob')) $
+ setEdgeWeight cfg_s1 s2_prob' node s2
+ where
+ -- Old weights
+ s1_old = getEdgeWeight cfg node s1
+ s2_old = getEdgeWeight cfg node s2
+
+ in
+ -- pprTraceIt "RegularCfgResult" $
+ foldl' applyHeuristic cfg'' heuristics
+
+ -- Branch on heap/stack check
+ | otherwise = cfg
+
+ where
+ -- Chance that loops are taken.
+ pred_LBH = 0.875
+ -- successors
+ successors = getSuccessorEdges cfg node
+ -- backedges
+ (m,not_m) = partition (\succ -> S.member (node, fst succ) backedges) successors
+
+ -- Heuristics return Nothing if they don't say anything about this branch,
+ -- or Just prob_s1, where prob_s1 is the likelihood of s1 being the
+ -- taken branch. s1 is the branch taken in the true case.
+
+ -- Loop exit heuristic.
+ -- We are unlikely to leave a loop unless it's to enter another one.
+ pred_LEH = 0.75
+ -- If and only if no successor is a loop header,
+ -- then we will likely not exit the current loop body.
+ lehPredicts :: (TargetNodeInfo,TargetNodeInfo) -> Maybe Prob
+ lehPredicts ((s1,_s1_info),(s2,_s2_info))
+ | S.member s1 loopHeads || S.member s2 loopHeads
+ = Nothing
+
+ | otherwise
+ = --pprTrace "lehPredict:" (ppr $ compare s1Level s2Level) $
+ case compare s1Level s2Level of
+ EQ -> Nothing
+ LT -> Just (1-pred_LEH) --s1 exits to a shallower loop level (exits loop)
+ GT -> Just (pred_LEH) --s1 exits to a deeper loop level
+ where
+ s1Level = mapLookup s1 loopLevels
+ s2Level = mapLookup s2 loopLevels
+
+ -- A comparison against a constant is unlikely to come out equal.
+ ohPredicts (s1,_s2)
+ | CmmSource { trans_cmmNode = src1 } <- getTransitionSource node (fst s1) cfg
+ , CmmCondBranch cond ltrue _lfalse likely <- src1
+ , likely == Nothing
+ , CmmMachOp mop args <- cond
+ , MO_Eq {} <- mop
+ , not (null [x | x@CmmLit{} <- args])
+ = if fst s1 == ltrue then Just 0.3 else Just 0.7
+
+ | otherwise
+ = Nothing
+
+ -- TODO: These are all the other heuristics from the paper.
+ -- Not all will apply, for now we just stub them out as Nothing.
+ phPredicts = const Nothing
+ ghPredicts = const Nothing
+ lhhPredicts = const Nothing
+ chPredicts = const Nothing
+ shPredicts = const Nothing
+ rhPredicts = const Nothing
+
+-- We normalize all edge weights as probabilities between 0 and 1.
+-- Ignoring rounding errors all outgoing edges sum up to 1.
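+-- For instance (toy weights): a node with outgoing edge weights 30 and 10
+-- ends up with probabilities 0.75 and 0.25. Negative weights are clamped to
+-- zero first, and if every outgoing weight is zero the edges share a uniform
+-- 1/n probability.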
+cfgEdgeProbabilities :: CFG -> (BlockId -> Int) -> IM.IntMap (IM.IntMap Prob)
+cfgEdgeProbabilities cfg toVertex
+ = mapFoldlWithKey foldEdges IM.empty cfg
+ where
+ foldEdges = (\m from toMap -> IM.insert (toVertex from) (normalize toMap) m)
+
+ normalize :: (LabelMap EdgeInfo) -> (IM.IntMap Prob)
+ normalize weightMap
+ | edgeCount <= 1 = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) 1.0 m) IM.empty weightMap
+ | otherwise = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) (normalWeight k) m) IM.empty weightMap
+ where
+ edgeCount = mapSize weightMap
+ -- Negative weights are generally allowed but are mapped to zero.
+ -- We then check if there is at least one non-zero edge and if not
+ -- assign uniform weights to all branches.
+ minWeight = 0 :: Prob
+ weightMap' = fmap (\w -> max (weightToDouble . edgeWeight $ w) minWeight) weightMap
+ totalWeight = sum weightMap'
+
+ normalWeight :: BlockId -> Prob
+ normalWeight bid
+ | totalWeight == 0
+ = 1.0 / fromIntegral edgeCount
+ | Just w <- mapLookup bid weightMap'
+ = w/totalWeight
+ | otherwise = panic "impossible"
+
+-- This is the fixpoint algorithm from
+-- "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
+-- The adaptation to Haskell is my own.
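+--
+-- As a rough sanity check (illustrative numbers): for a loop header that is
+-- entered once and whose back edge is taken with probability p, the fixpoint
+-- assigns the header a frequency of 1/(1-p), so p = 0.9 gives an expected
+-- execution count of 10 per loop entry. The cyclic probability is capped at
+-- 1 - 1/512 below, so (near-)certain back edges cannot blow the counts up
+-- beyond a factor of 512.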
+calcFreqs :: IM.IntMap (IM.IntMap Prob) -> [(Int,Int)] -> [(Int, [Int])] -> [Int]
+ -> (Array Int Double, IM.IntMap (IM.IntMap Prob))
+calcFreqs graph backEdges loops revPostOrder = runST $ do
+ visitedNodes <- newArray (0,nodeCount-1) False :: ST s (STUArray s Int Bool)
+ blockFreqs <- newArray (0,nodeCount-1) 0.0 :: ST s (STUArray s Int Double)
+ edgeProbs <- newSTRef graph
+ edgeBackProbs <- newSTRef graph
+
+ -- let traceArray a = do
+ -- vs <- forM [0..nodeCount-1] $ \i -> readArray a i >>= (\v -> return (i,v))
+ -- trace ("array: " ++ show vs) $ return ()
+
+ let -- See #1600, we need to inline or unboxing makes perf worse.
+ -- {-# INLINE getFreq #-}
+ {-# INLINE visited #-}
+ visited b = unsafeRead visitedNodes b
+ getFreq b = unsafeRead blockFreqs b
+ -- setFreq :: forall s. Int -> Double -> ST s ()
+ setFreq b f = unsafeWrite blockFreqs b f
+ -- setVisited :: forall s. Node -> ST s ()
+ setVisited b = unsafeWrite visitedNodes b True
+ -- Frequency/probability that edge is taken.
+ getProb' arr b1 b2 = readSTRef arr >>=
+ (\graph ->
+ return .
+ fromMaybe (error "getFreq 1") .
+ IM.lookup b2 .
+ fromMaybe (error "getFreq 2") $
+ (IM.lookup b1 graph)
+ )
+ setProb' arr b1 b2 prob = do
+ g <- readSTRef arr
+ let !m = fromMaybe (error "Foo") $ IM.lookup b1 g
+ !m' = IM.insert b2 prob m
+ writeSTRef arr $! (IM.insert b1 m' g)
+
+ getEdgeFreq b1 b2 = getProb' edgeProbs b1 b2
+ setEdgeFreq b1 b2 = setProb' edgeProbs b1 b2
+ getProb b1 b2 = fromMaybe (error "getProb") $ do
+ m' <- IM.lookup b1 graph
+ IM.lookup b2 m'
+
+ getBackProb b1 b2 = getProb' edgeBackProbs b1 b2
+ setBackProb b1 b2 = setProb' edgeBackProbs b1 b2
+
+
+ let -- calcOutFreqs :: Node -> ST s ()
+ calcOutFreqs bhead block = do
+ !f <- getFreq block
+ forM (successors block) $ \bi -> do
+ let !prob = getProb block bi
+ let !succFreq = f * prob
+ setEdgeFreq block bi succFreq
+ -- traceM $ "SetOut: " ++ show (block, bi, f, prob, succFreq)
+ when (bi == bhead) $ setBackProb block bi succFreq
+
+
+ let propFreq block head = do
+ -- traceM ("prop:" ++ show (block,head))
+ -- traceShowM block
+
+ !v <- visited block
+ if v then
+ return () -- Don't look at nodes twice
+ else if block == head then
+ setFreq block 1.0 -- Loop header frequency is always 1
+ else do
+ let preds = IS.elems $ predecessors block
+ irreducible <- (fmap or) $ forM preds $ \bp -> do
+ !bp_visited <- visited bp
+ let bp_backedge = isBackEdge bp block
+ return (not bp_visited && not bp_backedge)
+
+ if irreducible
+ then return () -- Rare we don't care
+ else do
+ setFreq block 0
+ !cycleProb <- sum <$> (forM preds $ \pred -> do
+ if isBackEdge pred block
+ then
+ getBackProb pred block
+ else do
+ !f <- getFreq block
+ !prob <- getEdgeFreq pred block
+ setFreq block $! f + prob
+ return 0)
+ -- traceM $ "cycleProb:" ++ show cycleProb
+ let limit = 1 - 1/512 -- Paper uses 1 - epsilon, but this works.
+ -- This determines how large likelihoods in loops can grow.
+ !cycleProb <- return $ min cycleProb limit -- <- return $ if cycleProb > limit then limit else cycleProb
+ -- traceM $ "cycleProb:" ++ show cycleProb
+
+ !f <- getFreq block
+ setFreq block (f / (1.0 - cycleProb))
+
+ setVisited block
+ calcOutFreqs head block
+
+ -- Loops, by nesting, inner to outer
+ forM_ loops $ \(head, body) -> do
+ forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i True) -- Mark all nodes as visited.
+ forM_ body (\i -> unsafeWrite visitedNodes i False) -- Mark all blocks reachable from head as not visited
+ forM_ body $ \block -> propFreq block head
+
+ -- After dealing with all loops, deal with non-looping parts of the CFG
+ forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i False) -- Everything in revPostOrder is reachable
+ forM_ revPostOrder $ \block -> propFreq block (head revPostOrder)
+
+ -- trace ("Final freqs:") $ return ()
+ -- let freqString = pprFreqs freqs
+ -- trace (unlines freqString) $ return ()
+ -- trace (pprFre) $ return ()
+ graph' <- readSTRef edgeProbs
+ freqs' <- unsafeFreeze blockFreqs
+
+ return (freqs', graph')
+ where
+ -- How can these lookups fail? Consider the CFG [A -> B]
+ predecessors :: Int -> IS.IntSet
+ predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph
+ successors :: Int -> [Int]
+ successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph
+ lookupError s b g = pprPanic ("Lookup error " ++ s) $
+ ( text "node" <+> ppr b $$
+ text "graph" <+>
+ vcat (map (\(k,m) -> ppr (k,m :: IM.IntMap Double)) $ IM.toList g)
+ )
+
+ nodeCount = IM.foldl' (\count toMap -> IM.foldlWithKey' countTargets count toMap) (IM.size graph) graph
+ where
+ countTargets = (\count k _ -> countNode k + count )
+ countNode n = if IM.member n graph then 0 else 1
+
+ isBackEdge from to = S.member (from,to) backEdgeSet
+ backEdgeSet = S.fromList backEdges
+
+ revGraph :: IntMap IntSet
+ revGraph = IM.foldlWithKey' (\m from toMap -> addEdges m from toMap) IM.empty graph
+ where
+ addEdges m0 from toMap = IM.foldlWithKey' (\m k _ -> addEdge m from k) m0 toMap
+ addEdge m0 from to = IM.insertWith IS.union to (IS.singleton from) m0
diff --git a/compiler/GHC/CmmToAsm/CFG/Dominators.hs b/compiler/GHC/CmmToAsm/CFG/Dominators.hs
new file mode 100644
index 0000000000..b9dcacd8cb
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CFG/Dominators.hs
@@ -0,0 +1,597 @@
+{-# LANGUAGE RankNTypes, BangPatterns, FlexibleContexts, Strict #-}
+
+{- |
+ Module : GHC.CmmToAsm.CFG.Dominators
+ Copyright : (c) Matt Morrow 2009
+ License : BSD3
+ Maintainer : <morrow@moonpatio.com>
+ Stability : experimental
+ Portability : portable
+
+ The Lengauer-Tarjan graph dominators algorithm.
+
+ \[1\] Lengauer, Tarjan,
+ /A Fast Algorithm for Finding Dominators in a Flowgraph/, 1979.
+
+ \[2\] Muchnick,
+ /Advanced Compiler Design and Implementation/, 1997.
+
+ \[3\] Brisk, Sarrafzadeh,
+ /Interference Graphs for Procedures in Static Single/
+ /Information Form are Interval Graphs/, 2007.
+
+ Originally taken from the dom-lt package.
+-}
+
+module GHC.CmmToAsm.CFG.Dominators (
+ Node,Path,Edge
+ ,Graph,Rooted
+ ,idom,ipdom
+ ,domTree,pdomTree
+ ,dom,pdom
+ ,pddfs,rpddfs
+ ,fromAdj,fromEdges
+ ,toAdj,toEdges
+ ,asTree,asGraph
+ ,parents,ancestors
+) where
+
+import GhcPrelude
+
+import Data.Bifunctor
+import Data.Tuple (swap)
+
+import Data.Tree
+import Data.IntMap(IntMap)
+import Data.IntSet(IntSet)
+import qualified Data.IntMap.Strict as IM
+import qualified Data.IntSet as IS
+
+import Control.Monad
+import Control.Monad.ST.Strict
+
+import Data.Array.ST
+import Data.Array.Base hiding ((!))
+ -- (unsafeNewArray_
+ -- ,unsafeWrite,unsafeRead
+ -- ,readArray,writeArray)
+
+import Util (debugIsOn)
+
+-----------------------------------------------------------------------------
+
+type Node = Int
+type Path = [Node]
+type Edge = (Node,Node)
+type Graph = IntMap IntSet
+type Rooted = (Node, Graph)
+
+-----------------------------------------------------------------------------
+
+-- | /Dominators/.
+-- Complexity as for @idom@
+dom :: Rooted -> [(Node, Path)]
+dom = ancestors . domTree
+
+-- | /Post-dominators/.
+-- Complexity as for @idom@.
+pdom :: Rooted -> [(Node, Path)]
+pdom = ancestors . pdomTree
+
+-- | /Dominator tree/.
+-- Complexity as for @idom@.
+domTree :: Rooted -> Tree Node
+domTree a@(r,_) =
+ let is = filter ((/=r).fst) (idom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Post-dominator tree/.
+-- Complexity as for @idom@.
+pdomTree :: Rooted -> Tree Node
+pdomTree a@(r,_) =
+ let is = filter ((/=r).fst) (ipdom a)
+ tg = fromEdges (fmap swap is)
+ in asTree (r,tg)
+
+-- | /Immediate dominators/.
+-- /O(|E|*alpha(|E|,|V|))/, where /alpha(m,n)/ is
+-- \"a functional inverse of Ackermann's function\".
+--
+-- This complexity bound assumes /O(1)/ indexing. Since we're
+-- using @IntMap@, it has an additional /lg |V|/ factor
+-- somewhere in there. I'm not sure where.
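+--
+-- A small usage sketch (diamond-shaped graph; result order may differ):
+--
+-- > idom (0, fromAdj [(0,[1,2]), (1,[3]), (2,[3]), (3,[])])
+--
+-- should map each of the nodes 1, 2 and 3 to the immediate dominator 0,
+-- since neither arm of the diamond dominates the join point.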
+idom :: Rooted -> [(Node,Node)]
+idom rg = runST (evalS idomM =<< initEnv (pruneReach rg))
+
+-- | /Immediate post-dominators/.
+-- Complexity as for @idom@.
+ipdom :: Rooted -> [(Node,Node)]
+ipdom rg = runST (evalS idomM =<< initEnv (pruneReach (second predG rg)))
+
+-----------------------------------------------------------------------------
+
+-- | /Post-dominated depth-first search/.
+pddfs :: Rooted -> [Node]
+pddfs = reverse . rpddfs
+
+-- | /Reverse post-dominated depth-first search/.
+rpddfs :: Rooted -> [Node]
+rpddfs = concat . levels . pdomTree
+
+-----------------------------------------------------------------------------
+
+type Dom s a = S s (Env s) a
+type NodeSet = IntSet
+type NodeMap a = IntMap a
+data Env s = Env
+ {succE :: !Graph
+ ,predE :: !Graph
+ ,bucketE :: !Graph
+ ,dfsE :: {-# UNPACK #-}!Int
+ ,zeroE :: {-# UNPACK #-}!Node
+ ,rootE :: {-# UNPACK #-}!Node
+ ,labelE :: {-# UNPACK #-}!(Arr s Node)
+ ,parentE :: {-# UNPACK #-}!(Arr s Node)
+ ,ancestorE :: {-# UNPACK #-}!(Arr s Node)
+ ,childE :: {-# UNPACK #-}!(Arr s Node)
+ ,ndfsE :: {-# UNPACK #-}!(Arr s Node)
+ ,dfnE :: {-# UNPACK #-}!(Arr s Int)
+ ,sdnoE :: {-# UNPACK #-}!(Arr s Int)
+ ,sizeE :: {-# UNPACK #-}!(Arr s Int)
+ ,domE :: {-# UNPACK #-}!(Arr s Node)
+ ,rnE :: {-# UNPACK #-}!(Arr s Node)}
+
+-----------------------------------------------------------------------------
+
+idomM :: Dom s [(Node,Node)]
+idomM = do
+ dfsDom =<< rootM
+ n <- gets dfsE
+ forM_ [n,n-1..1] (\i-> do
+ w <- ndfsM i
+ sw <- sdnoM w
+ ps <- predsM w
+ forM_ ps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ when (su < sw)
+ (store sdnoE w su))
+ z <- ndfsM =<< sdnoM w
+ modify(\e->e{bucketE=IM.adjust
+ (w`IS.insert`)
+ z (bucketE e)})
+ pw <- parentM w
+ link pw w
+ bps <- bucketM pw
+ forM_ bps (\v-> do
+ u <- eval v
+ su <- sdnoM u
+ sv <- sdnoM v
+ let dv = case su < sv of
+ True-> u
+ False-> pw
+ store domE v dv))
+ forM_ [1..n] (\i-> do
+ w <- ndfsM i
+ j <- sdnoM w
+ z <- ndfsM j
+ dw <- domM w
+ when (dw /= z)
+ (do ddw <- domM dw
+ store domE w ddw))
+ fromEnv
+
+-----------------------------------------------------------------------------
+
+eval :: Node -> Dom s Node
+eval v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ case a==n0 of
+ True-> labelM v
+ False-> do
+ compress v
+ a <- ancestorM v
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ case sl <= sla of
+ True-> return l
+ False-> return la
+
+compress :: Node -> Dom s ()
+compress v = do
+ n0 <- zeroM
+ a <- ancestorM v
+ aa <- ancestorM a
+ when (aa /= n0) (do
+ compress a
+ a <- ancestorM v
+ aa <- ancestorM a
+ l <- labelM v
+ la <- labelM a
+ sl <- sdnoM l
+ sla <- sdnoM la
+ when (sla < sl)
+ (store labelE v la)
+ store ancestorE v aa)
+
+-----------------------------------------------------------------------------
+
+link :: Node -> Node -> Dom s ()
+link v w = do
+ n0 <- zeroM
+ lw <- labelM w
+ slw <- sdnoM lw
+ let balance s = do
+ c <- childM s
+ lc <- labelM c
+ slc <- sdnoM lc
+ case slw < slc of
+ False-> return s
+ True-> do
+ zs <- sizeM s
+ zc <- sizeM c
+ cc <- childM c
+ zcc <- sizeM cc
+ case 2*zc <= zs+zcc of
+ True-> do
+ store ancestorE c s
+ store childE s cc
+ balance s
+ False-> do
+ store sizeE c zs
+ store ancestorE s c
+ balance c
+ s <- balance w
+ lw <- labelM w
+ zw <- sizeM w
+ store labelE s lw
+ store sizeE v . (+zw) =<< sizeM v
+ let follow s = do
+ when (s /= n0) (do
+ store ancestorE s v
+ follow =<< childM s)
+ zv <- sizeM v
+ follow =<< case zv < 2*zw of
+ False-> return s
+ True-> do
+ cv <- childM v
+ store childE v s
+ return cv
+
+-----------------------------------------------------------------------------
+
+dfsDom :: Node -> Dom s ()
+dfsDom i = do
+ _ <- go i
+ n0 <- zeroM
+ r <- rootM
+ store parentE r n0
+ where go i = do
+ n <- nextM
+ store dfnE i n
+ store sdnoE i n
+ store ndfsE n i
+ store labelE i i
+ ss <- succsM i
+ forM_ ss (\j-> do
+ s <- sdnoM j
+ case s==0 of
+ False-> return()
+ True-> do
+ store parentE j i
+ go j)
+
+-----------------------------------------------------------------------------
+
+initEnv :: Rooted -> ST s (Env s)
+initEnv (r0,g0) = do
+ let (g,rnmap) = renum 1 g0
+ pred = predG g
+ r = rnmap IM.! r0
+ n = IM.size g
+ ns = [0..n]
+ m = n+1
+
+ let bucket = IM.fromList
+ (zip ns (repeat mempty))
+
+ rna <- newI m
+ writes rna (fmap swap
+ (IM.toList rnmap))
+
+ doms <- newI m
+ sdno <- newI m
+ size <- newI m
+ parent <- newI m
+ ancestor <- newI m
+ child <- newI m
+ label <- newI m
+ ndfs <- newI m
+ dfn <- newI m
+
+ forM_ [0..n] (doms.=0)
+ forM_ [0..n] (sdno.=0)
+ forM_ [1..n] (size.=1)
+ forM_ [0..n] (ancestor.=0)
+ forM_ [0..n] (child.=0)
+
+ (doms.=r) r
+ (size.=0) 0
+ (label.=0) 0
+
+ return (Env
+ {rnE = rna
+ ,dfsE = 0
+ ,zeroE = 0
+ ,rootE = r
+ ,labelE = label
+ ,parentE = parent
+ ,ancestorE = ancestor
+ ,childE = child
+ ,ndfsE = ndfs
+ ,dfnE = dfn
+ ,sdnoE = sdno
+ ,sizeE = size
+ ,succE = g
+ ,predE = pred
+ ,bucketE = bucket
+ ,domE = doms})
+
+fromEnv :: Dom s [(Node,Node)]
+fromEnv = do
+ dom <- gets domE
+ rn <- gets rnE
+ -- r <- gets rootE
+ (_,n) <- st (getBounds dom)
+ forM [1..n] (\i-> do
+ j <- st(rn!:i)
+ d <- st(dom!:i)
+ k <- st(rn!:d)
+ return (j,k))
+
+-----------------------------------------------------------------------------
+
+zeroM :: Dom s Node
+zeroM = gets zeroE
+domM :: Node -> Dom s Node
+domM = fetch domE
+rootM :: Dom s Node
+rootM = gets rootE
+succsM :: Node -> Dom s [Node]
+succsM i = gets (IS.toList . (! i) . succE)
+predsM :: Node -> Dom s [Node]
+predsM i = gets (IS.toList . (! i) . predE)
+bucketM :: Node -> Dom s [Node]
+bucketM i = gets (IS.toList . (! i) . bucketE)
+sizeM :: Node -> Dom s Int
+sizeM = fetch sizeE
+sdnoM :: Node -> Dom s Int
+sdnoM = fetch sdnoE
+-- dfnM :: Node -> Dom s Int
+-- dfnM = fetch dfnE
+ndfsM :: Int -> Dom s Node
+ndfsM = fetch ndfsE
+childM :: Node -> Dom s Node
+childM = fetch childE
+ancestorM :: Node -> Dom s Node
+ancestorM = fetch ancestorE
+parentM :: Node -> Dom s Node
+parentM = fetch parentE
+labelM :: Node -> Dom s Node
+labelM = fetch labelE
+nextM :: Dom s Int
+nextM = do
+ n <- gets dfsE
+ let n' = n+1
+ modify(\e->e{dfsE=n'})
+ return n'
+
+-----------------------------------------------------------------------------
+
+type A = STUArray
+type Arr s a = A s Int a
+
+infixl 9 !:
+infixr 2 .=
+
+(.=) :: (MArray (A s) a (ST s))
+ => Arr s a -> a -> Int -> ST s ()
+(v .= x) i
+ | debugIsOn = writeArray v i x
+ | otherwise = unsafeWrite v i x
+
+(!:) :: (MArray (A s) a (ST s))
+ => A s Int a -> Int -> ST s a
+a !: i
+ | debugIsOn = do
+ o <- readArray a i
+ return $! o
+ | otherwise = do
+ o <- unsafeRead a i
+ return $! o
+
+new :: (MArray (A s) a (ST s))
+ => Int -> ST s (Arr s a)
+new n = unsafeNewArray_ (0,n-1)
+
+newI :: Int -> ST s (Arr s Int)
+newI = new
+
+-- newD :: Int -> ST s (Arr s Double)
+-- newD = new
+
+-- dump :: (MArray (A s) a (ST s)) => Arr s a -> ST s [a]
+-- dump a = do
+-- (m,n) <- getBounds a
+-- forM [m..n] (\i -> a!:i)
+
+writes :: (MArray (A s) a (ST s))
+ => Arr s a -> [(Int,a)] -> ST s ()
+writes a xs = forM_ xs (\(i,x) -> (a.=x) i)
+
+-- arr :: (MArray (A s) a (ST s)) => [a] -> ST s (Arr s a)
+-- arr xs = do
+-- let n = length xs
+-- a <- new n
+-- go a n 0 xs
+-- return a
+-- where go _ _ _ [] = return ()
+-- go a n i (x:xs)
+-- | i <= n = (a.=x) i >> go a n (i+1) xs
+-- | otherwise = return ()
+
+-----------------------------------------------------------------------------
+
+(!) :: Monoid a => IntMap a -> Int -> a
+(!) g n = maybe mempty id (IM.lookup n g)
+
+fromAdj :: [(Node, [Node])] -> Graph
+fromAdj = IM.fromList . fmap (second IS.fromList)
+
+fromEdges :: [Edge] -> Graph
+fromEdges = collectI IS.union fst (IS.singleton . snd)
+
+toAdj :: Graph -> [(Node, [Node])]
+toAdj = fmap (second IS.toList) . IM.toList
+
+toEdges :: Graph -> [Edge]
+toEdges = concatMap (uncurry (fmap . (,))) . toAdj
+
+predG :: Graph -> Graph
+predG g = IM.unionWith IS.union (go g) g0
+ where g0 = fmap (const mempty) g
+ f :: IntMap IntSet -> Int -> IntSet -> IntMap IntSet
+ f m i a = foldl' (\m p -> IM.insertWith mappend p
+ (IS.singleton i) m)
+ m
+ (IS.toList a)
+ go :: IntMap IntSet -> IntMap IntSet
+ go = flip IM.foldlWithKey' mempty f
+
+pruneReach :: Rooted -> Rooted
+pruneReach (r,g) = (r,g2)
+ where is = reachable
+ (maybe mempty id
+ . flip IM.lookup g) $ r
+ g2 = IM.fromList
+ . fmap (second (IS.filter (`IS.member`is)))
+ . filter ((`IS.member`is) . fst)
+ . IM.toList $ g
+
+tip :: Tree a -> (a, [Tree a])
+tip (Node a ts) = (a, ts)
+
+parents :: Tree a -> [(a, a)]
+parents (Node i xs) = p i xs
+ ++ concatMap parents xs
+ where p i = fmap (flip (,) i . rootLabel)
+
+ancestors :: Tree a -> [(a, [a])]
+ancestors = go []
+ where go acc (Node i xs)
+ = let acc' = i:acc
+ in p acc' xs ++ concatMap (go acc') xs
+ p is = fmap (flip (,) is . rootLabel)
+
+asGraph :: Tree Node -> Rooted
+asGraph t@(Node a _) = let g = go t in (a, fromAdj g)
+ where go (Node a ts) = let as = (fst . unzip . fmap tip) ts
+ in (a, as) : concatMap go ts
+
+asTree :: Rooted -> Tree Node
+asTree (r,g) = let go a = Node a (fmap go ((IS.toList . f) a))
+ f = (g !)
+ in go r
+
+reachable :: (Node -> NodeSet) -> (Node -> NodeSet)
+reachable f a = go (IS.singleton a) a
+ where go seen a = let s = f a
+ as = IS.toList (s `IS.difference` seen)
+ in foldl' go (s `IS.union` seen) as
+
+collectI :: (c -> c -> c)
+ -> (a -> Int) -> (a -> c) -> [a] -> IntMap c
+collectI (<>) f g
+ = foldl' (\m a -> IM.insertWith (<>)
+ (f a)
+ (g a) m) mempty
+
+-- collect :: (Ord b) => (c -> c -> c)
+-- -> (a -> b) -> (a -> c) -> [a] -> Map b c
+-- collect (<>) f g
+-- = foldl' (\m a -> SM.insertWith (<>)
+-- (f a)
+-- (g a) m) mempty
+
+-- (renamed, old -> new)
+renum :: Int -> Graph -> (Graph, NodeMap Node)
+renum from = (\(_,m,g)->(g,m))
+ . IM.foldlWithKey'
+ f (from,mempty,mempty)
+ where
+ f :: (Int, NodeMap Node, IntMap IntSet) -> Node -> IntSet
+ -> (Int, NodeMap Node, IntMap IntSet)
+ f (!n,!env,!new) i ss =
+ let (j,n2,env2) = go n env i
+ (n3,env3,ss2) = IS.fold
+ (\k (!n,!env,!new)->
+ case go n env k of
+ (l,n2,env2)-> (n2,env2,l `IS.insert` new))
+ (n2,env2,mempty) ss
+ new2 = IM.insertWith IS.union j ss2 new
+ in (n3,env3,new2)
+ go :: Int
+ -> NodeMap Node
+ -> Node
+ -> (Node,Int,NodeMap Node)
+ go !n !env i =
+ case IM.lookup i env of
+ Just j -> (j,n,env)
+ Nothing -> (n,n+1,IM.insert i n env)
+
+-----------------------------------------------------------------------------
+
+newtype S z s a = S {unS :: forall o. (a -> s -> ST z o) -> s -> ST z o}
+instance Functor (S z s) where
+ fmap f (S g) = S (\k -> g (k . f))
+instance Monad (S z s) where
+ return = pure
+ S g >>= f = S (\k -> g (\a -> unS (f a) k))
+instance Applicative (S z s) where
+ pure a = S (\k -> k a)
+ (<*>) = ap
+-- get :: S z s s
+-- get = S (\k s -> k s s)
+gets :: (s -> a) -> S z s a
+gets f = S (\k s -> k (f s) s)
+-- set :: s -> S z s ()
+-- set s = S (\k _ -> k () s)
+modify :: (s -> s) -> S z s ()
+modify f = S (\k -> k () . f)
+-- runS :: S z s a -> s -> ST z (a, s)
+-- runS (S g) = g (\a s -> return (a,s))
+evalS :: S z s a -> s -> ST z a
+evalS (S g) = g ((return .) . const)
+-- execS :: S z s a -> s -> ST z s
+-- execS (S g) = g ((return .) . flip const)
+st :: ST z a -> S z s a
+st m = S (\k s-> do
+ a <- m
+ k a s)
+store :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> a -> S z s ()
+store f i x = do
+ a <- gets f
+ st ((a.=x) i)
+fetch :: (MArray (A z) a (ST z))
+ => (s -> Arr z a) -> Int -> S z s a
+fetch f i = do
+ a <- gets f
+ st (a!:i)
+
diff --git a/compiler/GHC/CmmToAsm/CPrim.hs b/compiler/GHC/CmmToAsm/CPrim.hs
new file mode 100644
index 0000000000..34c3a7ff6a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/CPrim.hs
@@ -0,0 +1,133 @@
+-- | Generating C symbol names emitted by the compiler.
+module GHC.CmmToAsm.CPrim
+ ( atomicReadLabel
+ , atomicWriteLabel
+ , atomicRMWLabel
+ , cmpxchgLabel
+ , popCntLabel
+ , pdepLabel
+ , pextLabel
+ , bSwapLabel
+ , bRevLabel
+ , clzLabel
+ , ctzLabel
+ , word2FloatLabel
+ ) where
+
+import GhcPrelude
+
+import GHC.Cmm.Type
+import GHC.Cmm.MachOp
+import Outputable
+
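+-- Each helper simply appends the operation width to a fixed prefix; as a
+-- quick orientation, popCntLabel W32 produces "hs_popcnt32" and
+-- atomicRMWLabel W64 AMO_Add produces "hs_atomic_add64".
+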
+popCntLabel :: Width -> String
+popCntLabel w = "hs_popcnt" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "popCntLabel: Unsupported word width " (ppr w)
+
+pdepLabel :: Width -> String
+pdepLabel w = "hs_pdep" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pdepLabel: Unsupported word width " (ppr w)
+
+pextLabel :: Width -> String
+pextLabel w = "hs_pext" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "pextLabel: Unsupported word width " (ppr w)
+
+bSwapLabel :: Width -> String
+bSwapLabel w = "hs_bswap" ++ pprWidth w
+ where
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w)
+
+bRevLabel :: Width -> String
+bRevLabel w = "hs_bitrev" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "bRevLabel: Unsupported word width " (ppr w)
+
+clzLabel :: Width -> String
+clzLabel w = "hs_clz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w)
+
+ctzLabel :: Width -> String
+ctzLabel w = "hs_ctz" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w)
+
+word2FloatLabel :: Width -> String
+word2FloatLabel w = "hs_word2float" ++ pprWidth w
+ where
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w)
+
+atomicRMWLabel :: Width -> AtomicMachOp -> String
+atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w)
+
+ pprFunName AMO_Add = "add"
+ pprFunName AMO_Sub = "sub"
+ pprFunName AMO_And = "and"
+ pprFunName AMO_Nand = "nand"
+ pprFunName AMO_Or = "or"
+ pprFunName AMO_Xor = "xor"
+
+cmpxchgLabel :: Width -> String
+cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w)
+
+atomicReadLabel :: Width -> String
+atomicReadLabel w = "hs_atomicread" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w)
+
+atomicWriteLabel :: Width -> String
+atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w
+ where
+ pprWidth W8 = "8"
+ pprWidth W16 = "16"
+ pprWidth W32 = "32"
+ pprWidth W64 = "64"
+ pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w)
diff --git a/compiler/GHC/CmmToAsm/Dwarf.hs b/compiler/GHC/CmmToAsm/Dwarf.hs
new file mode 100644
index 0000000000..8cacd19023
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf.hs
@@ -0,0 +1,269 @@
+module GHC.CmmToAsm.Dwarf (
+ dwarfGen
+ ) where
+
+import GhcPrelude
+
+import GHC.Cmm.CLabel
+import GHC.Cmm.Expr ( GlobalReg(..) )
+import Config ( cProjectName, cProjectVersion )
+import CoreSyn ( Tickish(..) )
+import GHC.Cmm.DebugBlock
+import GHC.Driver.Session
+import Module
+import Outputable
+import GHC.Platform
+import Unique
+import UniqSupply
+
+import GHC.CmmToAsm.Dwarf.Constants
+import GHC.CmmToAsm.Dwarf.Types
+
+import Control.Arrow ( first )
+import Control.Monad ( mfilter )
+import Data.Maybe
+import Data.List ( sortBy )
+import Data.Ord ( comparing )
+import qualified Data.Map as Map
+import System.FilePath
+import System.Directory ( getCurrentDirectory )
+
+import qualified GHC.Cmm.Dataflow.Label as H
+import qualified GHC.Cmm.Dataflow.Collections as H
+
+-- | Generate DWARF/debug information
+dwarfGen :: DynFlags -> ModLocation -> UniqSupply -> [DebugBlock]
+ -> IO (SDoc, UniqSupply)
+dwarfGen _ _ us [] = return (empty, us)
+dwarfGen df modLoc us blocks = do
+
+ -- Convert debug data structures to DWARF info records
+ -- We strip out block information when running with -g0 or -g1.
+ let procs = debugSplitProcs blocks
+ stripBlocks dbg
+ | debugLevel df < 2 = dbg { dblBlocks = [] }
+ | otherwise = dbg
+ compPath <- getCurrentDirectory
+ let lowLabel = dblCLabel $ head procs
+ highLabel = mkAsmTempEndLabel $ dblCLabel $ last procs
+ dwarfUnit = DwarfCompileUnit
+ { dwChildren = map (procToDwarf df) (map stripBlocks procs)
+ , dwName = fromMaybe "" (ml_hs_file modLoc)
+ , dwCompDir = addTrailingPathSeparator compPath
+ , dwProducer = cProjectName ++ " " ++ cProjectVersion
+ , dwLowLabel = lowLabel
+ , dwHighLabel = highLabel
+ , dwLineLabel = dwarfLineLabel
+ }
+
+ -- Check whether we have any source code information, so we do not
+ -- end up writing a pointer to an empty .debug_line section
+ -- (dsymutil on Mac OS gets confused by this).
+ let haveSrcIn blk = isJust (dblSourceTick blk) && isJust (dblPosition blk)
+ || any haveSrcIn (dblBlocks blk)
+ haveSrc = any haveSrcIn procs
+
+ -- .debug_abbrev section: Declare the format we're using
+ let abbrevSct = pprAbbrevDecls haveSrc
+
+ -- .debug_info section: Information records on procedures and blocks
+ let -- unique to identify start and end compilation unit .debug_inf
+ (unitU, us') = takeUniqFromSupply us
+ infoSct = vcat [ ptext dwarfInfoLabel <> colon
+ , dwarfInfoSection
+ , compileUnitHeader unitU
+ , pprDwarfInfo haveSrc dwarfUnit
+ , compileUnitFooter unitU
+ ]
+
+ -- .debug_line section: Generated mainly by the assembler, but we
+ -- need to label it
+ let lineSct = dwarfLineSection $$
+ ptext dwarfLineLabel <> colon
+
+ -- .debug_frame section: Information about the layout of the GHC stack
+ let (framesU, us'') = takeUniqFromSupply us'
+ frameSct = dwarfFrameSection $$
+ ptext dwarfFrameLabel <> colon $$
+ pprDwarfFrame (debugFrame framesU procs)
+
+ -- .aranges section: Information about the bounds of compilation units
+ let aranges' | gopt Opt_SplitSections df = map mkDwarfARange procs
+ | otherwise = [DwarfARange lowLabel highLabel]
+ let aranges = dwarfARangesSection $$ pprDwarfARanges aranges' unitU
+
+ return (infoSct $$ abbrevSct $$ lineSct $$ frameSct $$ aranges, us'')
+
+-- | Build an address range entry for one proc.
+-- With split sections, each proc needs its own entry, since they may get
+-- scattered in the final binary. Without split sections, we could make a
+-- single arange based on the first/last proc.
+mkDwarfARange :: DebugBlock -> DwarfARange
+mkDwarfARange proc = DwarfARange start end
+ where
+ start = dblCLabel proc
+ end = mkAsmTempEndLabel start
+
+-- | Header for a compilation unit, establishing global format
+-- parameters
+compileUnitHeader :: Unique -> SDoc
+compileUnitHeader unitU = sdocWithPlatform $ \plat ->
+ let cuLabel = mkAsmTempLabel unitU -- sits right before initialLength field
+ length = ppr (mkAsmTempEndLabel cuLabel) <> char '-' <> ppr cuLabel
+ <> text "-4" -- length of initialLength field
+ in vcat [ ppr cuLabel <> colon
+ , text "\t.long " <> length -- compilation unit size
+ , pprHalf 3 -- DWARF version
+ , sectionOffset (ptext dwarfAbbrevLabel) (ptext dwarfAbbrevLabel)
+ -- abbrevs offset
+ , text "\t.byte " <> ppr (platformWordSizeInBytes plat) -- word size
+ ]
+
+-- | Compilation unit footer, mainly establishing size of debug sections
+compileUnitFooter :: Unique -> SDoc
+compileUnitFooter unitU =
+ let cuEndLabel = mkAsmTempEndLabel $ mkAsmTempLabel unitU
+ in ppr cuEndLabel <> colon
+
+-- | Splits the blocks by procedures. In the result all nested blocks
+-- will come from the same procedure as the top-level block. See
+-- Note [Splitting DebugBlocks] for details.
+debugSplitProcs :: [DebugBlock] -> [DebugBlock]
+debugSplitProcs b = concat $ H.mapElems $ mergeMaps $ map (split Nothing) b
+ where mergeMaps = foldr (H.mapUnionWithKey (const (++))) H.mapEmpty
+ split :: Maybe DebugBlock -> DebugBlock -> H.LabelMap [DebugBlock]
+ split parent blk = H.mapInsert prc [blk'] nested
+ where prc = dblProcedure blk
+ blk' = blk { dblBlocks = own_blks
+ , dblParent = parent
+ }
+ own_blks = fromMaybe [] $ H.mapLookup prc nested
+ nested = mergeMaps $ map (split parent') $ dblBlocks blk
+ -- Figure out who should be the parent of nested blocks.
+ -- If @blk@ is optimized out then it isn't a good choice
+ -- and we just use its parent.
+ parent'
+ | Nothing <- dblPosition blk = parent
+ | otherwise = Just blk
+
+{-
+Note [Splitting DebugBlocks]
+
+DWARF requires that we break up the nested DebugBlocks produced from
+the C-- AST. For instance, we begin with tick trees containing nested procs.
+For example,
+
+ proc A [tick1, tick2]
+ block B [tick3]
+ proc C [tick4]
+
+when producing DWARF we need the procs (which are represented in DWARF as
+TAG_subprogram DIEs) to be top-level DIEs. debugSplitProcs is responsible for
+this transform, pulling out the nested procs into top-level procs.
+
+However, in doing this we need to be careful to preserve the parentage of the
+nested procs. This is the reason DebugBlocks carry the dblParent field, allowing
+us to reorganize the above tree as,
+
+ proc A [tick1, tick2]
+ block B [tick3]
+ proc C [tick4] parent=B
+
+Here we have annotated the new proc C with an attribute giving its original
+parent, B.
+-}
+
+-- | Generate DWARF info for a procedure debug block
+procToDwarf :: DynFlags -> DebugBlock -> DwarfInfo
+procToDwarf df prc
+ = DwarfSubprogram { dwChildren = map (blockToDwarf df) (dblBlocks prc)
+ , dwName = case dblSourceTick prc of
+ Just s@SourceNote{} -> sourceName s
+ _otherwise -> showSDocDump df $ ppr $ dblLabel prc
+ , dwLabel = dblCLabel prc
+ , dwParent = fmap mkAsmTempDieLabel
+ $ mfilter goodParent
+ $ fmap dblCLabel (dblParent prc)
+ }
+ where
+ goodParent a | a == dblCLabel prc = False
+ -- Omit parent if it would be self-referential
+ goodParent a | not (externallyVisibleCLabel a)
+ , debugLevel df < 2 = False
+ -- We strip block information when running -g0 or -g1, don't
+ -- refer to blocks in that case. Fixes #14894.
+ goodParent _ = True
+
+-- | Generate DWARF info for a block
+blockToDwarf :: DynFlags -> DebugBlock -> DwarfInfo
+blockToDwarf df blk
+ = DwarfBlock { dwChildren = concatMap (tickToDwarf df) (dblTicks blk)
+ ++ map (blockToDwarf df) (dblBlocks blk)
+ , dwLabel = dblCLabel blk
+ , dwMarker = marker
+ }
+ where
+ marker
+ | Just _ <- dblPosition blk = Just $ mkAsmTempLabel $ dblLabel blk
+ | otherwise = Nothing -- block was optimized out
+
+tickToDwarf :: DynFlags -> Tickish () -> [DwarfInfo]
+tickToDwarf _ (SourceNote ss _) = [DwarfSrcNote ss]
+tickToDwarf _ _ = []
+
+-- | Generates the data for the debug frame section, which encodes the
+-- desired stack unwind behaviour for the debugger
+debugFrame :: Unique -> [DebugBlock] -> DwarfFrame
+debugFrame u procs
+ = DwarfFrame { dwCieLabel = mkAsmTempLabel u
+ , dwCieInit = initUws
+ , dwCieProcs = map (procToFrame initUws) procs
+ }
+ where
+ initUws :: UnwindTable
+ initUws = Map.fromList [(Sp, Just (UwReg Sp 0))]
+
+-- | Generates unwind information for a procedure debug block
+procToFrame :: UnwindTable -> DebugBlock -> DwarfFrameProc
+procToFrame initUws blk
+ = DwarfFrameProc { dwFdeProc = dblCLabel blk
+ , dwFdeHasInfo = dblHasInfoTbl blk
+ , dwFdeBlocks = map (uncurry blockToFrame)
+ (setHasInfo blockUws)
+ }
+ where blockUws :: [(DebugBlock, [UnwindPoint])]
+ blockUws = map snd $ sortBy (comparing fst) $ flatten blk
+
+ flatten :: DebugBlock
+ -> [(Int, (DebugBlock, [UnwindPoint]))]
+ flatten b@DebugBlock{ dblPosition=pos, dblUnwind=uws, dblBlocks=blocks }
+ | Just p <- pos = (p, (b, uws')):nested
+ | otherwise = nested -- block was optimized out
+ where uws' = addDefaultUnwindings initUws uws
+ nested = concatMap flatten blocks
+
+ -- | If the current procedure has an info table, then we also say that
+ -- its first block has one to ensure that it gets the necessary -1
+ -- offset applied to its start address.
+ -- See Note [Info Offset] in Dwarf.Types.
+ setHasInfo :: [(DebugBlock, [UnwindPoint])]
+ -> [(DebugBlock, [UnwindPoint])]
+ setHasInfo [] = []
+ setHasInfo (c0:cs) = first setIt c0 : cs
+ where
+ setIt child =
+ child { dblHasInfoTbl = dblHasInfoTbl child
+ || dblHasInfoTbl blk }
+
+blockToFrame :: DebugBlock -> [UnwindPoint] -> DwarfFrameBlock
+blockToFrame blk uws
+ = DwarfFrameBlock { dwFdeBlkHasInfo = dblHasInfoTbl blk
+ , dwFdeUnwind = uws
+ }
+
+addDefaultUnwindings :: UnwindTable -> [UnwindPoint] -> [UnwindPoint]
+addDefaultUnwindings tbl pts =
+ [ UnwindPoint lbl (tbl' `mappend` tbl)
+ -- mappend is left-biased
+ | UnwindPoint lbl tbl' <- pts
+ ]
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Constants.hs b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
new file mode 100644
index 0000000000..4ab54b6629
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf/Constants.hs
@@ -0,0 +1,229 @@
+-- | Constants describing the DWARF format. Most of this simply
+-- mirrors /usr/include/dwarf.h.
+
+module GHC.CmmToAsm.Dwarf.Constants where
+
+import GhcPrelude
+
+import AsmUtils
+import FastString
+import GHC.Platform
+import Outputable
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.X86.Regs
+
+import Data.Word
+
+-- | Language ID used for Haskell.
+dW_LANG_Haskell :: Word
+dW_LANG_Haskell = 0x18
+ -- Thanks to Nathan Howell for getting us our very own language ID!
+
+-- * Dwarf tags
+dW_TAG_compile_unit, dW_TAG_subroutine_type,
+ dW_TAG_file_type, dW_TAG_subprogram, dW_TAG_lexical_block,
+ dW_TAG_base_type, dW_TAG_structure_type, dW_TAG_pointer_type,
+ dW_TAG_array_type, dW_TAG_subrange_type, dW_TAG_typedef,
+ dW_TAG_variable, dW_TAG_arg_variable, dW_TAG_auto_variable,
+ dW_TAG_ghc_src_note :: Word
+dW_TAG_array_type = 1
+dW_TAG_lexical_block = 11
+dW_TAG_pointer_type = 15
+dW_TAG_compile_unit = 17
+dW_TAG_structure_type = 19
+dW_TAG_typedef = 22
+dW_TAG_subroutine_type = 32
+dW_TAG_subrange_type = 33
+dW_TAG_base_type = 36
+dW_TAG_file_type = 41
+dW_TAG_subprogram = 46
+dW_TAG_variable = 52
+dW_TAG_auto_variable = 256
+dW_TAG_arg_variable = 257
+
+dW_TAG_ghc_src_note = 0x5b00
+
+-- * Dwarf attributes
+dW_AT_name, dW_AT_stmt_list, dW_AT_low_pc, dW_AT_high_pc, dW_AT_language,
+ dW_AT_comp_dir, dW_AT_producer, dW_AT_external, dW_AT_frame_base,
+ dW_AT_use_UTF8, dW_AT_MIPS_linkage_name :: Word
+dW_AT_name = 0x03
+dW_AT_stmt_list = 0x10
+dW_AT_low_pc = 0x11
+dW_AT_high_pc = 0x12
+dW_AT_language = 0x13
+dW_AT_comp_dir = 0x1b
+dW_AT_producer = 0x25
+dW_AT_external = 0x3f
+dW_AT_frame_base = 0x40
+dW_AT_use_UTF8 = 0x53
+dW_AT_MIPS_linkage_name = 0x2007
+
+-- * Custom DWARF attributes
+-- Chosen from a more or less random section of the vendor-extensible region
+
+-- ** Describing C-- blocks
+-- These appear in DW_TAG_lexical_scope DIEs corresponding to C-- blocks
+dW_AT_ghc_tick_parent :: Word
+dW_AT_ghc_tick_parent = 0x2b20
+
+-- ** Describing source notes
+-- These appear in DW_TAG_ghc_src_note DIEs
+dW_AT_ghc_span_file, dW_AT_ghc_span_start_line,
+ dW_AT_ghc_span_start_col, dW_AT_ghc_span_end_line,
+ dW_AT_ghc_span_end_col :: Word
+dW_AT_ghc_span_file = 0x2b00
+dW_AT_ghc_span_start_line = 0x2b01
+dW_AT_ghc_span_start_col = 0x2b02
+dW_AT_ghc_span_end_line = 0x2b03
+dW_AT_ghc_span_end_col = 0x2b04
+
+
+-- * Abbrev declarations
+dW_CHILDREN_no, dW_CHILDREN_yes :: Word8
+dW_CHILDREN_no = 0
+dW_CHILDREN_yes = 1
+
+dW_FORM_addr, dW_FORM_data2, dW_FORM_data4, dW_FORM_string, dW_FORM_flag,
+ dW_FORM_block1, dW_FORM_ref4, dW_FORM_ref_addr, dW_FORM_flag_present :: Word
+dW_FORM_addr = 0x01
+dW_FORM_data2 = 0x05
+dW_FORM_data4 = 0x06
+dW_FORM_string = 0x08
+dW_FORM_flag = 0x0c
+dW_FORM_block1 = 0x0a
+dW_FORM_ref_addr = 0x10
+dW_FORM_ref4 = 0x13
+dW_FORM_flag_present = 0x19
+
+-- * Dwarf native types
+dW_ATE_address, dW_ATE_boolean, dW_ATE_float, dW_ATE_signed,
+ dW_ATE_signed_char, dW_ATE_unsigned, dW_ATE_unsigned_char :: Word
+dW_ATE_address = 1
+dW_ATE_boolean = 2
+dW_ATE_float = 4
+dW_ATE_signed = 5
+dW_ATE_signed_char = 6
+dW_ATE_unsigned = 7
+dW_ATE_unsigned_char = 8
+
+-- * Call frame information
+dW_CFA_set_loc, dW_CFA_undefined, dW_CFA_same_value,
+ dW_CFA_def_cfa, dW_CFA_def_cfa_offset, dW_CFA_def_cfa_expression,
+ dW_CFA_expression, dW_CFA_offset_extended_sf, dW_CFA_def_cfa_offset_sf,
+ dW_CFA_def_cfa_sf, dW_CFA_val_offset, dW_CFA_val_expression,
+ dW_CFA_offset :: Word8
+dW_CFA_set_loc = 0x01
+dW_CFA_undefined = 0x07
+dW_CFA_same_value = 0x08
+dW_CFA_def_cfa = 0x0c
+dW_CFA_def_cfa_offset = 0x0e
+dW_CFA_def_cfa_expression = 0x0f
+dW_CFA_expression = 0x10
+dW_CFA_offset_extended_sf = 0x11
+dW_CFA_def_cfa_sf = 0x12
+dW_CFA_def_cfa_offset_sf = 0x13
+dW_CFA_val_offset = 0x14
+dW_CFA_val_expression = 0x16
+dW_CFA_offset = 0x80
+
+-- * Operations
+dW_OP_addr, dW_OP_deref, dW_OP_consts,
+ dW_OP_minus, dW_OP_mul, dW_OP_plus,
+ dW_OP_lit0, dW_OP_breg0, dW_OP_call_frame_cfa :: Word8
+dW_OP_addr = 0x03
+dW_OP_deref = 0x06
+dW_OP_consts = 0x11
+dW_OP_minus = 0x1c
+dW_OP_mul = 0x1e
+dW_OP_plus = 0x22
+dW_OP_lit0 = 0x30
+dW_OP_breg0 = 0x70
+dW_OP_call_frame_cfa = 0x9c
+
+-- * Dwarf section declarations
+dwarfInfoSection, dwarfAbbrevSection, dwarfLineSection,
+ dwarfFrameSection, dwarfGhcSection, dwarfARangesSection :: SDoc
+dwarfInfoSection = dwarfSection "info"
+dwarfAbbrevSection = dwarfSection "abbrev"
+dwarfLineSection = dwarfSection "line"
+dwarfFrameSection = dwarfSection "frame"
+dwarfGhcSection = dwarfSection "ghc"
+dwarfARangesSection = dwarfSection "aranges"
+
+dwarfSection :: String -> SDoc
+dwarfSection name = sdocWithPlatform $ \plat ->
+ case platformOS plat of
+ os | osElfTarget os
+ -> text "\t.section .debug_" <> text name <> text ",\"\","
+ <> sectionType "progbits"
+ | osMachOTarget os
+ -> text "\t.section __DWARF,__debug_" <> text name <> text ",regular,debug"
+ | otherwise
+ -> text "\t.section .debug_" <> text name <> text ",\"dr\""
+
+-- * Dwarf section labels
+dwarfInfoLabel, dwarfAbbrevLabel, dwarfLineLabel, dwarfFrameLabel :: PtrString
+dwarfInfoLabel = sLit ".Lsection_info"
+dwarfAbbrevLabel = sLit ".Lsection_abbrev"
+dwarfLineLabel = sLit ".Lsection_line"
+dwarfFrameLabel = sLit ".Lsection_frame"
+
+-- | Mapping of registers to DWARF register numbers
+dwarfRegNo :: Platform -> Reg -> Word8
+dwarfRegNo p r = case platformArch p of
+ ArchX86
+ | r == eax -> 0
+ | r == ecx -> 1 -- yes, no typo
+ | r == edx -> 2
+ | r == ebx -> 3
+ | r == esp -> 4
+ | r == ebp -> 5
+ | r == esi -> 6
+ | r == edi -> 7
+ ArchX86_64
+ | r == rax -> 0
+ | r == rdx -> 1 -- this neither. The order GCC allocates registers in?
+ | r == rcx -> 2
+ | r == rbx -> 3
+ | r == rsi -> 4
+ | r == rdi -> 5
+ | r == rbp -> 6
+ | r == rsp -> 7
+ | r == r8 -> 8
+ | r == r9 -> 9
+ | r == r10 -> 10
+ | r == r11 -> 11
+ | r == r12 -> 12
+ | r == r13 -> 13
+ | r == r14 -> 14
+ | r == r15 -> 15
+ | r == xmm0 -> 17
+ | r == xmm1 -> 18
+ | r == xmm2 -> 19
+ | r == xmm3 -> 20
+ | r == xmm4 -> 21
+ | r == xmm5 -> 22
+ | r == xmm6 -> 23
+ | r == xmm7 -> 24
+ | r == xmm8 -> 25
+ | r == xmm9 -> 26
+ | r == xmm10 -> 27
+ | r == xmm11 -> 28
+ | r == xmm12 -> 29
+ | r == xmm13 -> 30
+ | r == xmm14 -> 31
+ | r == xmm15 -> 32
+ _other -> error "dwarfRegNo: Unsupported platform or unknown register!"
+
+-- | Virtual register number to use for return address.
+dwarfReturnRegNo :: Platform -> Word8
+dwarfReturnRegNo p
+ -- We "overwrite" IP with our pseudo register - that makes sense, as
+ -- when using this mechanism gdb already knows the IP anyway. Clang
+ -- does this too, so it must be safe.
+ = case platformArch p of
+ ArchX86 -> 8 -- eip
+ ArchX86_64 -> 16 -- rip
+ _other -> error "dwarfReturnRegNo: Unsupported platform!"
diff --git a/compiler/GHC/CmmToAsm/Dwarf/Types.hs b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
new file mode 100644
index 0000000000..5eda37a653
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Dwarf/Types.hs
@@ -0,0 +1,612 @@
+module GHC.CmmToAsm.Dwarf.Types
+ ( -- * Dwarf information
+ DwarfInfo(..)
+ , pprDwarfInfo
+ , pprAbbrevDecls
+ -- * Dwarf address range table
+ , DwarfARange(..)
+ , pprDwarfARanges
+ -- * Dwarf frame
+ , DwarfFrame(..), DwarfFrameProc(..), DwarfFrameBlock(..)
+ , pprDwarfFrame
+ -- * Utilities
+ , pprByte
+ , pprHalf
+ , pprData4'
+ , pprDwWord
+ , pprWord
+ , pprLEBWord
+ , pprLEBInt
+ , wordAlign
+ , sectionOffset
+ )
+ where
+
+import GhcPrelude
+
+import GHC.Cmm.DebugBlock
+import GHC.Cmm.CLabel
+import GHC.Cmm.Expr ( GlobalReg(..) )
+import Encoding
+import FastString
+import Outputable
+import GHC.Platform
+import Unique
+import GHC.Platform.Reg
+import SrcLoc
+import Util
+
+import GHC.CmmToAsm.Dwarf.Constants
+
+import qualified Data.ByteString as BS
+import qualified Control.Monad.Trans.State.Strict as S
+import Control.Monad (zipWithM, join)
+import Data.Bits
+import qualified Data.Map as Map
+import Data.Word
+import Data.Char
+
+import GHC.Platform.Regs
+
+-- | Individual dwarf records. Each one will be encoded as an entry in
+-- the @.debug_info@ section.
+data DwarfInfo
+ = DwarfCompileUnit { dwChildren :: [DwarfInfo]
+ , dwName :: String
+ , dwProducer :: String
+ , dwCompDir :: String
+ , dwLowLabel :: CLabel
+ , dwHighLabel :: CLabel
+ , dwLineLabel :: PtrString }
+ | DwarfSubprogram { dwChildren :: [DwarfInfo]
+ , dwName :: String
+ , dwLabel :: CLabel
+ , dwParent :: Maybe CLabel
+ -- ^ label of DIE belonging to the parent tick
+ }
+ | DwarfBlock { dwChildren :: [DwarfInfo]
+ , dwLabel :: CLabel
+ , dwMarker :: Maybe CLabel
+ }
+ | DwarfSrcNote { dwSrcSpan :: RealSrcSpan
+ }
+
+-- | Abbreviation codes used for encoding above records in the
+-- @.debug_info@ section.
+data DwarfAbbrev
+ = DwAbbrNull -- ^ Pseudo, used for marking the end of lists
+ | DwAbbrCompileUnit
+ | DwAbbrSubprogram
+ | DwAbbrSubprogramWithParent
+ | DwAbbrBlockWithoutCode
+ | DwAbbrBlock
+ | DwAbbrGhcSrcNote
+ deriving (Eq, Enum)
+
+-- | Generate assembly for the given abbreviation code
+pprAbbrev :: DwarfAbbrev -> SDoc
+pprAbbrev = pprLEBWord . fromIntegral . fromEnum
+
+-- | Abbreviation declaration. This explains the binary encoding we
+-- use for representing 'DwarfInfo'. Be aware that this must be updated
+-- along with 'pprDwarfInfo'.
+pprAbbrevDecls :: Bool -> SDoc
+pprAbbrevDecls haveDebugLine =
+ let mkAbbrev abbr tag chld flds =
+ let fld (tag, form) = pprLEBWord tag $$ pprLEBWord form
+ in pprAbbrev abbr $$ pprLEBWord tag $$ pprByte chld $$
+ vcat (map fld flds) $$ pprByte 0 $$ pprByte 0
+ -- These are shared between DwAbbrSubprogram and
+ -- DwAbbrSubprogramWithParent
+ subprogramAttrs =
+ [ (dW_AT_name, dW_FORM_string)
+ , (dW_AT_MIPS_linkage_name, dW_FORM_string)
+ , (dW_AT_external, dW_FORM_flag)
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ , (dW_AT_frame_base, dW_FORM_block1)
+ ]
+ in dwarfAbbrevSection $$
+ ptext dwarfAbbrevLabel <> colon $$
+ mkAbbrev DwAbbrCompileUnit dW_TAG_compile_unit dW_CHILDREN_yes
+ ([(dW_AT_name, dW_FORM_string)
+ , (dW_AT_producer, dW_FORM_string)
+ , (dW_AT_language, dW_FORM_data4)
+ , (dW_AT_comp_dir, dW_FORM_string)
+ , (dW_AT_use_UTF8, dW_FORM_flag_present) -- not represented in body
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ ] ++
+ (if haveDebugLine
+ then [ (dW_AT_stmt_list, dW_FORM_data4) ]
+ else [])) $$
+ mkAbbrev DwAbbrSubprogram dW_TAG_subprogram dW_CHILDREN_yes
+ subprogramAttrs $$
+ mkAbbrev DwAbbrSubprogramWithParent dW_TAG_subprogram dW_CHILDREN_yes
+ (subprogramAttrs ++ [(dW_AT_ghc_tick_parent, dW_FORM_ref_addr)]) $$
+ mkAbbrev DwAbbrBlockWithoutCode dW_TAG_lexical_block dW_CHILDREN_yes
+ [ (dW_AT_name, dW_FORM_string)
+ ] $$
+ mkAbbrev DwAbbrBlock dW_TAG_lexical_block dW_CHILDREN_yes
+ [ (dW_AT_name, dW_FORM_string)
+ , (dW_AT_low_pc, dW_FORM_addr)
+ , (dW_AT_high_pc, dW_FORM_addr)
+ ] $$
+ mkAbbrev DwAbbrGhcSrcNote dW_TAG_ghc_src_note dW_CHILDREN_no
+ [ (dW_AT_ghc_span_file, dW_FORM_string)
+ , (dW_AT_ghc_span_start_line, dW_FORM_data4)
+ , (dW_AT_ghc_span_start_col, dW_FORM_data2)
+ , (dW_AT_ghc_span_end_line, dW_FORM_data4)
+ , (dW_AT_ghc_span_end_col, dW_FORM_data2)
+ ] $$
+ pprByte 0
+
+-- | Generate assembly for DWARF data
+pprDwarfInfo :: Bool -> DwarfInfo -> SDoc
+pprDwarfInfo haveSrc d
+ = case d of
+ DwarfCompileUnit {} -> hasChildren
+ DwarfSubprogram {} -> hasChildren
+ DwarfBlock {} -> hasChildren
+ DwarfSrcNote {} -> noChildren
+ where
+ hasChildren =
+ pprDwarfInfoOpen haveSrc d $$
+ vcat (map (pprDwarfInfo haveSrc) (dwChildren d)) $$
+ pprDwarfInfoClose
+ noChildren = pprDwarfInfoOpen haveSrc d
+
+-- | Prints assembler data corresponding to DWARF info records. Note
+-- that the binary format of this is parameterized in @abbrevDecls@ and
+-- has to be kept in synch.
+pprDwarfInfoOpen :: Bool -> DwarfInfo -> SDoc
+pprDwarfInfoOpen haveSrc (DwarfCompileUnit _ name producer compDir lowLabel
+ highLabel lineLbl) =
+ pprAbbrev DwAbbrCompileUnit
+ $$ pprString name
+ $$ pprString producer
+ $$ pprData4 dW_LANG_Haskell
+ $$ pprString compDir
+ $$ pprWord (ppr lowLabel)
+ $$ pprWord (ppr highLabel)
+ $$ if haveSrc
+ then sectionOffset (ptext lineLbl) (ptext dwarfLineLabel)
+ else empty
+pprDwarfInfoOpen _ (DwarfSubprogram _ name label
+ parent) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev abbrev
+ $$ pprString name
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+ $$ pprFlag (externallyVisibleCLabel label)
+ $$ pprWord (ppr label)
+ $$ pprWord (ppr $ mkAsmTempEndLabel label)
+ $$ pprByte 1
+ $$ pprByte dW_OP_call_frame_cfa
+ $$ parentValue
+ where
+ abbrev = case parent of Nothing -> DwAbbrSubprogram
+ Just _ -> DwAbbrSubprogramWithParent
+ parentValue = maybe empty pprParentDie parent
+ pprParentDie sym = sectionOffset (ppr sym) (ptext dwarfInfoLabel)
+pprDwarfInfoOpen _ (DwarfBlock _ label Nothing) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev DwAbbrBlockWithoutCode
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+pprDwarfInfoOpen _ (DwarfBlock _ label (Just marker)) = sdocWithDynFlags $ \df ->
+ ppr (mkAsmTempDieLabel label) <> colon
+ $$ pprAbbrev DwAbbrBlock
+ $$ pprString (renderWithStyle (initSDocContext df (mkCodeStyle CStyle)) (ppr label))
+ $$ pprWord (ppr marker)
+ $$ pprWord (ppr $ mkAsmTempEndLabel marker)
+pprDwarfInfoOpen _ (DwarfSrcNote ss) =
+ pprAbbrev DwAbbrGhcSrcNote
+ $$ pprString' (ftext $ srcSpanFile ss)
+ $$ pprData4 (fromIntegral $ srcSpanStartLine ss)
+ $$ pprHalf (fromIntegral $ srcSpanStartCol ss)
+ $$ pprData4 (fromIntegral $ srcSpanEndLine ss)
+ $$ pprHalf (fromIntegral $ srcSpanEndCol ss)
+
+-- | Close a DWARF info record with children
+pprDwarfInfoClose :: SDoc
+pprDwarfInfoClose = pprAbbrev DwAbbrNull
+
+-- | A DWARF address range. This is used by the debugger to quickly locate
+-- which compilation unit a given address belongs to. This type assumes
+-- a non-segmented address-space.
+data DwarfARange
+ = DwarfARange
+ { dwArngStartLabel :: CLabel
+ , dwArngEndLabel :: CLabel
+ }
+
+-- | Print assembler directives corresponding to a DWARF @.debug_aranges@
+-- address table entry.
+pprDwarfARanges :: [DwarfARange] -> Unique -> SDoc
+pprDwarfARanges arngs unitU = sdocWithPlatform $ \plat ->
+ let wordSize = platformWordSizeInBytes plat
+ paddingSize = 4 :: Int
+ -- header is 12 bytes long.
+ -- entry is 8 bytes (32-bit platform) or 16 bytes (64-bit platform).
+ -- pad such that first entry begins at multiple of entry size.
+ pad n = vcat $ replicate n $ pprByte 0
+ -- Fix for #17428
+ initialLength = 8 + paddingSize + (1 + length arngs) * 2 * wordSize
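+ -- Worked example (illustrative): on a 64-bit target with 3 procs this is
+ -- 8 + 4 + (1 + 3) * 2 * 8 = 76 bytes, where the extra entry accounts for
+ -- the terminating zero pair emitted below.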
+ in pprDwWord (ppr initialLength)
+ $$ pprHalf 2
+ $$ sectionOffset (ppr $ mkAsmTempLabel $ unitU)
+ (ptext dwarfInfoLabel)
+ $$ pprByte (fromIntegral wordSize)
+ $$ pprByte 0
+ $$ pad paddingSize
+ -- body
+ $$ vcat (map pprDwarfARange arngs)
+ -- terminus
+ $$ pprWord (char '0')
+ $$ pprWord (char '0')
+
+pprDwarfARange :: DwarfARange -> SDoc
+pprDwarfARange arng = pprWord (ppr $ dwArngStartLabel arng) $$ pprWord length
+ where
+ length = ppr (dwArngEndLabel arng)
+ <> char '-' <> ppr (dwArngStartLabel arng)
+
+-- | Information about unwind instructions for a procedure. This
+-- corresponds to a "Common Information Entry" (CIE) in DWARF.
+data DwarfFrame
+ = DwarfFrame
+ { dwCieLabel :: CLabel
+ , dwCieInit :: UnwindTable
+ , dwCieProcs :: [DwarfFrameProc]
+ }
+
+-- | Unwind instructions for an individual procedure. Corresponds to a
+-- "Frame Description Entry" (FDE) in DWARF.
+data DwarfFrameProc
+ = DwarfFrameProc
+ { dwFdeProc :: CLabel
+ , dwFdeHasInfo :: Bool
+ , dwFdeBlocks :: [DwarfFrameBlock]
+ -- ^ List of blocks. Order must match asm!
+ }
+
+-- | Unwind instructions for a block. Will become part of the
+-- containing FDE.
+data DwarfFrameBlock
+ = DwarfFrameBlock
+ { dwFdeBlkHasInfo :: Bool
+ , dwFdeUnwind :: [UnwindPoint]
+ -- ^ these unwind points must occur in the same order as they occur
+ -- in the block
+ }
+
+instance Outputable DwarfFrameBlock where
+ ppr (DwarfFrameBlock hasInfo unwinds) = braces $ ppr hasInfo <+> ppr unwinds
+
+-- | Header for the @.debug_frame@ section. Here we emit the "Common
+-- Information Entry" record that establishes general call frame
+-- parameters and the default stack layout.
+pprDwarfFrame :: DwarfFrame -> SDoc
+pprDwarfFrame DwarfFrame{dwCieLabel=cieLabel,dwCieInit=cieInit,dwCieProcs=procs}
+ = sdocWithPlatform $ \plat ->
+ let cieStartLabel= mkAsmTempDerivedLabel cieLabel (fsLit "_start")
+ cieEndLabel = mkAsmTempEndLabel cieLabel
+ length = ppr cieEndLabel <> char '-' <> ppr cieStartLabel
+ spReg = dwarfGlobalRegNo plat Sp
+ retReg = dwarfReturnRegNo plat
+ wordSize = platformWordSizeInBytes plat
+ pprInit :: (GlobalReg, Maybe UnwindExpr) -> SDoc
+ pprInit (g, uw) = pprSetUnwind plat g (Nothing, uw)
+
+ -- Preserve C stack pointer: This is necessary to override the default
+ -- unwinding behavior of setting $sp = CFA.
+ preserveSp = case platformArch plat of
+ ArchX86 -> pprByte dW_CFA_same_value $$ pprLEBWord 4
+ ArchX86_64 -> pprByte dW_CFA_same_value $$ pprLEBWord 7
+ _ -> empty
+ in vcat [ ppr cieLabel <> colon
+ , pprData4' length -- Length of CIE
+ , ppr cieStartLabel <> colon
+ , pprData4' (text "-1")
+ -- Common Information Entry marker (-1 = 0xf..f)
+ , pprByte 3 -- CIE version (we require DWARF 3)
+ , pprByte 0 -- Augmentation (none)
+ , pprByte 1 -- Code offset multiplicator
+ , pprByte (128-fromIntegral wordSize)
+ -- Data offset multiplicator
+ -- (stacks grow down => "-w" in signed LEB128)
+ , pprByte retReg -- virtual register holding return address
+ ] $$
+ -- Initial unwind table
+ vcat (map pprInit $ Map.toList cieInit) $$
+ vcat [ -- RET = *CFA
+ pprByte (dW_CFA_offset+retReg)
+ , pprByte 0
+
+ -- Preserve C stack pointer
+ , preserveSp
+
+ -- Sp' = CFA
+ -- (we need to set this manually as our (STG) Sp register is
+ -- often not the architecture's default stack register)
+ , pprByte dW_CFA_val_offset
+ , pprLEBWord (fromIntegral spReg)
+ , pprLEBWord 0
+ ] $$
+ wordAlign $$
+ ppr cieEndLabel <> colon $$
+ -- Procedure unwind tables
+ vcat (map (pprFrameProc cieLabel cieInit) procs)
+
+-- | Writes a "Frame Description Entry" for a procedure. This consists
+-- mainly of referencing the CIE and writing state machine
+-- instructions to describe how the frame base (CFA) changes.
+pprFrameProc :: CLabel -> UnwindTable -> DwarfFrameProc -> SDoc
+pprFrameProc frameLbl initUw (DwarfFrameProc procLbl hasInfo blocks)
+ = let fdeLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde")
+ fdeEndLabel = mkAsmTempDerivedLabel procLbl (fsLit "_fde_end")
+ procEnd = mkAsmTempEndLabel procLbl
+ ifInfo str = if hasInfo then text str else empty
+ -- see [Note: Info Offset]
+ in vcat [ whenPprDebug $ text "# Unwinding for" <+> ppr procLbl <> colon
+ , pprData4' (ppr fdeEndLabel <> char '-' <> ppr fdeLabel)
+ , ppr fdeLabel <> colon
+ , pprData4' (ppr frameLbl <> char '-' <>
+ ptext dwarfFrameLabel) -- Reference to CIE
+ , pprWord (ppr procLbl <> ifInfo "-1") -- Code pointer
+ , pprWord (ppr procEnd <> char '-' <>
+ ppr procLbl <> ifInfo "+1") -- Block byte length
+ ] $$
+ vcat (S.evalState (mapM pprFrameBlock blocks) initUw) $$
+ wordAlign $$
+ ppr fdeEndLabel <> colon
+
+-- | Generates unwind information for a block. We only generate
+-- instructions where unwind information actually changes. This small
+-- optimisation saves a lot of space, as subsequent blocks often have
+-- the same unwind information.
+pprFrameBlock :: DwarfFrameBlock -> S.State UnwindTable SDoc
+pprFrameBlock (DwarfFrameBlock hasInfo uws0) =
+ vcat <$> zipWithM pprFrameDecl (True : repeat False) uws0
+ where
+ pprFrameDecl :: Bool -> UnwindPoint -> S.State UnwindTable SDoc
+ pprFrameDecl firstDecl (UnwindPoint lbl uws) = S.state $ \oldUws ->
+ let -- Did a register's unwind expression change?
+ isChanged :: GlobalReg -> Maybe UnwindExpr
+ -> Maybe (Maybe UnwindExpr, Maybe UnwindExpr)
+ isChanged g new
+ -- the value didn't change
+ | Just new == old = Nothing
+ -- the value was and still is undefined
+ | Nothing <- old
+ , Nothing <- new = Nothing
+ -- the value changed
+ | otherwise = Just (join old, new)
+ where
+ old = Map.lookup g oldUws
+
+ changed = Map.toList $ Map.mapMaybeWithKey isChanged uws
+
+ in if oldUws == uws
+ then (empty, oldUws)
+ else let -- see [Note: Info Offset]
+ needsOffset = firstDecl && hasInfo
+ lblDoc = ppr lbl <>
+ if needsOffset then text "-1" else empty
+ doc = sdocWithPlatform $ \plat ->
+ pprByte dW_CFA_set_loc $$ pprWord lblDoc $$
+ vcat (map (uncurry $ pprSetUnwind plat) changed)
+ in (doc, uws)
+
+-- Note [Info Offset]
+--
+-- GDB was pretty much written with C-like programs in mind, and as a
+-- result it assumes that once you have a return address, it is a
+-- good idea to look at (PC-1) to unwind further - as that's where the
+-- "call" instruction is supposed to be.
+--
+-- Now on one hand, code generated by GHC looks nothing like what GDB
+-- expects, and in fact going up from a return pointer is guaranteed
+-- to land us inside an info table! On the other hand, that actually
+-- gives us some wiggle room, as we expect IP to never *actually* end
+-- up inside the info table, so we can "cheat" by putting whatever GDB
+-- expects to see there. This is probably pretty safe, as GDB cannot
+-- assume (PC-1) to be a valid code pointer in the first place - and I
+-- have seen no code trying to correct this.
+--
+-- Note that this will not prevent GDB from failing to look up the
+-- correct function name for the frame, as that uses the symbol table,
+-- which we cannot manipulate as easily.
+--
+-- There's a GDB patch to address this at [1]. At the moment of writing
+-- it's not merged, so I recommend building GDB with the patch if you
+-- care about unwinding. The hack above doesn't cover every case.
+--
+-- [1] https://sourceware.org/ml/gdb-patches/2018-02/msg00055.html
+
+-- | Get DWARF register ID for a given GlobalReg
+dwarfGlobalRegNo :: Platform -> GlobalReg -> Word8
+dwarfGlobalRegNo p UnwindReturnReg = dwarfReturnRegNo p
+dwarfGlobalRegNo p reg = maybe 0 (dwarfRegNo p . RegReal) $ globalRegMaybe p reg
+
+-- | Generate code for setting the unwind information for a register,
+-- optimized using its known old value in the table. Note that "Sp" is
+-- special: We see it as a synonym for the CFA.
+pprSetUnwind :: Platform
+ -> GlobalReg
+ -- ^ the register to produce an unwinding table entry for
+ -> (Maybe UnwindExpr, Maybe UnwindExpr)
+ -- ^ the old and new values of the register
+ -> SDoc
+pprSetUnwind plat g (_, Nothing)
+ = pprUndefUnwind plat g
+pprSetUnwind _ Sp (Just (UwReg s _), Just (UwReg s' o')) | s == s'
+ = if o' >= 0
+ then pprByte dW_CFA_def_cfa_offset $$ pprLEBWord (fromIntegral o')
+ else pprByte dW_CFA_def_cfa_offset_sf $$ pprLEBInt o'
+pprSetUnwind plat Sp (_, Just (UwReg s' o'))
+ = if o' >= 0
+ then pprByte dW_CFA_def_cfa $$
+ pprLEBRegNo plat s' $$
+ pprLEBWord (fromIntegral o')
+ else pprByte dW_CFA_def_cfa_sf $$
+ pprLEBRegNo plat s' $$
+ pprLEBInt o'
+pprSetUnwind _ Sp (_, Just uw)
+ = pprByte dW_CFA_def_cfa_expression $$ pprUnwindExpr False uw
+pprSetUnwind plat g (_, Just (UwDeref (UwReg Sp o)))
+ | o < 0 && ((-o) `mod` platformWordSizeInBytes plat) == 0 -- expected case
+ = pprByte (dW_CFA_offset + dwarfGlobalRegNo plat g) $$
+ pprLEBWord (fromIntegral ((-o) `div` platformWordSizeInBytes plat))
+ | otherwise
+ = pprByte dW_CFA_offset_extended_sf $$
+ pprLEBRegNo plat g $$
+ pprLEBInt o
+pprSetUnwind plat g (_, Just (UwDeref uw))
+ = pprByte dW_CFA_expression $$
+ pprLEBRegNo plat g $$
+ pprUnwindExpr True uw
+pprSetUnwind plat g (_, Just (UwReg g' 0))
+ | g == g'
+ = pprByte dW_CFA_same_value $$
+ pprLEBRegNo plat g
+pprSetUnwind plat g (_, Just uw)
+ = pprByte dW_CFA_val_expression $$
+ pprLEBRegNo plat g $$
+ pprUnwindExpr True uw
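+-- For instance, when Sp stays based on the same register and only its
+-- offset changes (say from UwReg Sp 0 to UwReg Sp 16), the first Sp
+-- equation above fires and all we emit is DW_CFA_def_cfa_offset followed
+-- by the ULEB128-encoded new offset 16.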
+
+-- | Print the register number of the given 'GlobalReg' as an unsigned LEB128
+-- encoded number.
+pprLEBRegNo :: Platform -> GlobalReg -> SDoc
+pprLEBRegNo plat = pprLEBWord . fromIntegral . dwarfGlobalRegNo plat
+
+-- | Generates a DWARF expression for the given unwind expression. If
+-- @spIsCFA@ is true, we see @Sp@ as the frame base CFA where it gets
+-- mentioned.
+pprUnwindExpr :: Bool -> UnwindExpr -> SDoc
+pprUnwindExpr spIsCFA expr
+ = sdocWithPlatform $ \plat ->
+ let pprE (UwConst i)
+ | i >= 0 && i < 32 = pprByte (dW_OP_lit0 + fromIntegral i)
+ | otherwise = pprByte dW_OP_consts $$ pprLEBInt i -- lazy...
+ pprE (UwReg Sp i) | spIsCFA
+ = if i == 0
+ then pprByte dW_OP_call_frame_cfa
+ else pprE (UwPlus (UwReg Sp 0) (UwConst i))
+ pprE (UwReg g i) = pprByte (dW_OP_breg0+dwarfGlobalRegNo plat g) $$
+ pprLEBInt i
+ pprE (UwDeref u) = pprE u $$ pprByte dW_OP_deref
+ pprE (UwLabel l) = pprByte dW_OP_addr $$ pprWord (ppr l)
+ pprE (UwPlus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_plus
+ pprE (UwMinus u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_minus
+ pprE (UwTimes u1 u2) = pprE u1 $$ pprE u2 $$ pprByte dW_OP_mul
+ in text "\t.uleb128 2f-1f" $$ -- DW_FORM_block length
+ -- computed as the difference of the following local labels 2: and 1:
+ text "1:" $$
+ pprE expr $$
+ text "2:"
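+-- For example, with @spIsCFA@ set, @UwReg Sp 8@ expands via the UwPlus
+-- case into DW_OP_call_frame_cfa, DW_OP_lit8, DW_OP_plus, and the whole
+-- expression is wrapped between the local labels 1: and 2: whose
+-- difference supplies the DW_FORM_block length.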
+
+-- | Generate code for re-setting the unwind information for a
+-- register to @undefined@
+pprUndefUnwind :: Platform -> GlobalReg -> SDoc
+pprUndefUnwind plat g = pprByte dW_CFA_undefined $$
+ pprLEBRegNo plat g
+
+
+-- | Align assembly at (machine) word boundary
+wordAlign :: SDoc
+wordAlign = sdocWithPlatform $ \plat ->
+ text "\t.align " <> case platformOS plat of
+ OSDarwin -> case platformWordSize plat of
+ PW8 -> char '3'
+ PW4 -> char '2'
+ _other -> ppr (platformWordSizeInBytes plat)
+
+-- | Assembly for a single byte of constant DWARF data
+pprByte :: Word8 -> SDoc
+pprByte x = text "\t.byte " <> ppr (fromIntegral x :: Word)
+
+-- | Assembly for a two-byte constant integer
+pprHalf :: Word16 -> SDoc
+pprHalf x = text "\t.short" <+> ppr (fromIntegral x :: Word)
+
+-- | Assembly for a constant DWARF flag
+pprFlag :: Bool -> SDoc
+pprFlag f = pprByte (if f then 0xff else 0x00)
+
+-- | Assembly for 4 bytes of dynamic DWARF data
+pprData4' :: SDoc -> SDoc
+pprData4' x = text "\t.long " <> x
+
+-- | Assembly for 4 bytes of constant DWARF data
+pprData4 :: Word -> SDoc
+pprData4 = pprData4' . ppr
+
+-- | Assembly for a DWARF word of dynamic data. This means 32 bit, as
+-- we are generating 32 bit DWARF.
+pprDwWord :: SDoc -> SDoc
+pprDwWord = pprData4'
+
+-- | Assembly for a machine word of dynamic data. Depends on the
+-- architecture we are currently generating code for.
+pprWord :: SDoc -> SDoc
+pprWord s = (<> s) . sdocWithPlatform $ \plat ->
+ case platformWordSize plat of
+ PW4 -> text "\t.long "
+ PW8 -> text "\t.quad "
+
+-- | Prints a number in "little endian base 128" format. The idea is
+-- to optimize for small numbers by stopping once all further bytes
+-- would be 0. The highest bit in every byte signals whether there
+-- are further bytes to read.
+pprLEBWord :: Word -> SDoc
+pprLEBWord x | x < 128 = pprByte (fromIntegral x)
+ | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$
+ pprLEBWord (x `shiftR` 7)
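+-- A worked example: @pprLEBWord 624485@ stops after three bytes, emitting
+-- .byte 229, .byte 142 and .byte 38 (0xE5 0x8E 0x26); the first two bytes
+-- have their high bit set to signal that more bytes follow.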
+
+-- | Same as @pprLEBWord@, but for a signed number
+pprLEBInt :: Int -> SDoc
+pprLEBInt x | x >= -64 && x < 64
+ = pprByte (fromIntegral (x .&. 127))
+ | otherwise = pprByte (fromIntegral $ 128 .|. (x .&. 127)) $$
+ pprLEBInt (x `shiftR` 7)
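+-- In the signed variant, small negative numbers also fit in one byte:
+-- @pprLEBInt (-2)@ emits the single byte 126 (0x7E), whose sign bit within
+-- the 7-bit payload makes it decode back to -2.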
+
+-- | Generates a dynamic null-terminated string. If required the
+-- caller needs to make sure that the string is escaped properly.
+pprString' :: SDoc -> SDoc
+pprString' str = text "\t.asciz \"" <> str <> char '"'
+
+-- | Generate a string constant. We take care to escape the string.
+pprString :: String -> SDoc
+pprString str
+ = pprString' $ hcat $ map escapeChar $
+ if str `lengthIs` utf8EncodedLength str
+ then str
+ else map (chr . fromIntegral) $ BS.unpack $ bytesFS $ mkFastString str
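+-- For instance, a pure ASCII string is escaped character by character,
+-- while a string containing multi-byte characters falls back to its UTF-8
+-- bytes: "λ" (U+03BB, UTF-8 0xCE 0xBB) would come out roughly as
+-- .asciz "\316\273", each byte octal-escaped by escapeChar below.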
+
+-- | Escape a single non-unicode character
+escapeChar :: Char -> SDoc
+escapeChar '\\' = text "\\\\"
+escapeChar '\"' = text "\\\""
+escapeChar '\n' = text "\\n"
+escapeChar c
+ | isAscii c && isPrint c && c /= '?' -- prevents trigraph warnings
+ = char c
+ | otherwise
+ = char '\\' <> char (intToDigit (ch `div` 64)) <>
+ char (intToDigit ((ch `div` 8) `mod` 8)) <>
+ char (intToDigit (ch `mod` 8))
+ where ch = ord c
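+-- E.g. a tab character is neither printable ASCII nor one of the special
+-- cases above, so escapeChar '\t' renders it as the octal escape \011.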
+
+-- | Generate an offset into another section. This is tricky because
+-- this is handled differently depending on platform: Mac OS expects
+-- us to calculate the offset using assembler arithmetic. Linux expects
+-- us to just reference the target directly, and will figure out on
+-- its own that we actually need an offset. Finally, Windows has
+-- a special directive to refer to relative offsets. Fun.
+sectionOffset :: SDoc -> SDoc -> SDoc
+sectionOffset target section = sdocWithPlatform $ \plat ->
+ case platformOS plat of
+ OSDarwin -> pprDwWord (target <> char '-' <> section)
+ OSMinGW32 -> text "\t.secrel32 " <> target
+ _other -> pprDwWord target
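+-- Concretely, for a target label Lfoo and section label .debug_info this
+-- prints "\t.long Lfoo-.debug_info" on Darwin, "\t.secrel32 Lfoo" on
+-- Windows, and a plain "\t.long Lfoo" everywhere else.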
diff --git a/compiler/GHC/CmmToAsm/Format.hs b/compiler/GHC/CmmToAsm/Format.hs
new file mode 100644
index 0000000000..446c760939
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Format.hs
@@ -0,0 +1,105 @@
+-- | Formats on this architecture
+-- A Format is a combination of width and class
+--
+-- TODO: Signed vs unsigned?
+--
+-- TODO: This module is currently shared by all architectures because
+-- NCGMonad needs to know about it to make a VReg. It would be better
+-- to have architecture-specific formats, and do the overloading
+-- properly, e.g. SPARC doesn't care about FF80.
+--
+module GHC.CmmToAsm.Format (
+ Format(..),
+ intFormat,
+ floatFormat,
+ isFloatFormat,
+ cmmTypeFormat,
+ formatToWidth,
+ formatInBytes
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Cmm
+import Outputable
+
+-- It looks very like the old MachRep, but it's now of purely local
+-- significance, here in the native code generator. You can change it
+-- without global consequences.
+--
+-- A major use is as an opcode qualifier; thus the opcode
+-- mov.l a b
+-- might be encoded
+-- MOV II32 a b
+-- where the Format field encodes the ".l" part.
+
+-- ToDo: it's not clear to me that we need separate signed-vs-unsigned formats
+-- here. I've removed them from the x86 version, we'll see what happens --SDM
+
+-- ToDo: quite a few occurrences of Format could usefully be replaced by Width
+
+data Format
+ = II8
+ | II16
+ | II32
+ | II64
+ | FF32
+ | FF64
+ deriving (Show, Eq)
+
+
+-- | Get the integer format of this width.
+intFormat :: Width -> Format
+intFormat width
+ = case width of
+ W8 -> II8
+ W16 -> II16
+ W32 -> II32
+ W64 -> II64
+ other -> sorry $ "The native code generator cannot " ++
+ "produce code for Format.intFormat " ++ show other
+ ++ "\n\tConsider using the llvm backend with -fllvm"
+
+
+-- | Get the float format of this width.
+floatFormat :: Width -> Format
+floatFormat width
+ = case width of
+ W32 -> FF32
+ W64 -> FF64
+
+ other -> pprPanic "Format.floatFormat" (ppr other)
+
+
+-- | Check if a format represents a floating point value.
+isFloatFormat :: Format -> Bool
+isFloatFormat format
+ = case format of
+ FF32 -> True
+ FF64 -> True
+ _ -> False
+
+
+-- | Convert a Cmm type to a Format.
+cmmTypeFormat :: CmmType -> Format
+cmmTypeFormat ty
+ | isFloatType ty = floatFormat (typeWidth ty)
+ | otherwise = intFormat (typeWidth ty)
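+-- For example (assuming the usual cmmBits/cmmFloat constructors of the
+-- Cmm type module): cmmTypeFormat (cmmBits W64) is II64, while
+-- cmmTypeFormat (cmmFloat W32) is FF32.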
+
+
+-- | Get the Width of a Format.
+formatToWidth :: Format -> Width
+formatToWidth format
+ = case format of
+ II8 -> W8
+ II16 -> W16
+ II32 -> W32
+ II64 -> W64
+ FF32 -> W32
+ FF64 -> W64
+
+
+-- | Get the size of a Format in bytes.
+formatInBytes :: Format -> Int
+formatInBytes = widthInBytes . formatToWidth
diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs
new file mode 100644
index 0000000000..44fa9b7cc9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -0,0 +1,202 @@
+
+module GHC.CmmToAsm.Instr (
+ RegUsage(..),
+ noUsage,
+ GenBasicBlock(..), blockId,
+ ListGraph(..),
+ NatCmm,
+ NatCmmDecl,
+ NatBasicBlock,
+ topInfoTable,
+ entryBlocks,
+ Instruction(..)
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Driver.Session
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Platform
+
+-- | Holds a list of source and destination registers used by a
+-- particular instruction.
+--
+-- Machine registers that are pre-allocated to stgRegs are filtered
+-- out, because they are uninteresting from a register allocation
+-- standpoint. (We wouldn't want them to end up on the free list!)
+--
+-- As far as we are concerned, the fixed registers simply don't exist
+-- (for allocation purposes, anyway).
+--
+data RegUsage
+ = RU [Reg] [Reg]
+
+-- | No regs read or written to.
+noUsage :: RegUsage
+noUsage = RU [] []
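+-- For instance, an add instruction that reads r1 and r2 and writes r3
+-- would be described, roughly, as @RU [r1, r2] [r3]@.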
+
+-- Our flavours of the Cmm types
+-- Type synonyms for Cmm populated with native code
+type NatCmm instr
+ = GenCmmGroup
+ RawCmmStatics
+ (LabelMap RawCmmStatics)
+ (ListGraph instr)
+
+type NatCmmDecl statics instr
+ = GenCmmDecl
+ statics
+ (LabelMap RawCmmStatics)
+ (ListGraph instr)
+
+
+type NatBasicBlock instr
+ = GenBasicBlock instr
+
+
+-- | Returns the info table associated with the CmmDecl's entry point,
+-- if any.
+topInfoTable :: GenCmmDecl a (LabelMap i) (ListGraph b) -> Maybe i
+topInfoTable (CmmProc infos _ _ (ListGraph (b:_)))
+ = mapLookup (blockId b) infos
+topInfoTable _
+ = Nothing
+
+-- | Return the list of BlockIds in a CmmDecl that are entry points
+-- for this proc (i.e. they may be jumped to from outside this proc).
+entryBlocks :: GenCmmDecl a (LabelMap i) (ListGraph b) -> [BlockId]
+entryBlocks (CmmProc info _ _ (ListGraph code)) = entries
+ where
+ infos = mapKeys info
+ entries = case code of
+ [] -> infos
+ BasicBlock entry _ : _ -- first block is the entry point
+ | entry `elem` infos -> infos
+ | otherwise -> entry : infos
+entryBlocks _ = []
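+-- For instance, if the first block's id is B1 and the info-table map has
+-- the single key B2, the result is [B1, B2]; had B1 carried an info table
+-- itself, it would already be among the map keys and only those keys
+-- would be returned.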
+
+-- | Common things that we can do with instructions, on all architectures.
+-- These are used by the shared parts of the native code generator,
+-- specifically the register allocators.
+--
+class Instruction instr where
+
+ -- | Get the registers that are being used by this instruction.
+ -- regUsage doesn't need to do any trickery for jumps and such.
+ -- Just state precisely the regs read and written by that insn.
+ -- The consequences of control flow transfers, as far as register
+ -- allocation goes, are taken care of by the register allocator.
+ --
+ regUsageOfInstr
+ :: Platform
+ -> instr
+ -> RegUsage
+
+
+ -- | Apply a given mapping to all the register references in this
+ -- instruction.
+ patchRegsOfInstr
+ :: instr
+ -> (Reg -> Reg)
+ -> instr
+
+
+ -- | Checks whether this instruction is a jump/branch instruction.
+ -- One that can change the flow of control in a way that the
+ -- register allocator needs to worry about.
+ isJumpishInstr
+ :: instr -> Bool
+
+
+ -- | Give the possible destinations of this jump instruction.
+ -- Must be defined for all jumpish instructions.
+ jumpDestsOfInstr
+ :: instr -> [BlockId]
+
+
+ -- | Change the destination of this jump instruction.
+ -- Used in the linear allocator when adding fixup blocks for join
+ -- points.
+ patchJumpInstr
+ :: instr
+ -> (BlockId -> BlockId)
+ -> instr
+
+
+ -- | An instruction to spill a register into a spill slot.
+ mkSpillInstr
+ :: DynFlags
+ -> Reg -- ^ the reg to spill
+ -> Int -- ^ the current stack delta
+ -> Int -- ^ spill slot to use
+ -> instr
+
+
+ -- | An instruction to reload a register from a spill slot.
+ mkLoadInstr
+ :: DynFlags
+ -> Reg -- ^ the reg to reload.
+ -> Int -- ^ the current stack delta
+ -> Int -- ^ the spill slot to use
+ -> instr
+
+ -- | See if this instruction is telling us the current C stack delta
+ takeDeltaInstr
+ :: instr
+ -> Maybe Int
+
+ -- | Check whether this instruction is some meta thing inserted into
+ -- the instruction stream for other purposes.
+ --
+ -- Not something that has to be treated as a real machine instruction
+ -- and have its registers allocated.
+ --
+ -- eg, comments, delta, ldata, etc.
+ isMetaInstr
+ :: instr
+ -> Bool
+
+
+
+ -- | Copy the value in a register to another one.
+ -- Must work for all register classes.
+ mkRegRegMoveInstr
+ :: Platform
+ -> Reg -- ^ source register
+ -> Reg -- ^ destination register
+ -> instr
+
+ -- | Take the source and destination from this reg -> reg move instruction
+ -- or Nothing if it's not one
+ takeRegRegMoveInstr
+ :: instr
+ -> Maybe (Reg, Reg)
+
+ -- | Make an unconditional jump instruction.
+ -- For architectures with branch delay slots, its ok to put
+ -- a NOP after the jump. Don't fill the delay slot with an
+ -- instruction that references regs or you'll confuse the
+ -- linear allocator.
+ mkJumpInstr
+ :: BlockId
+ -> [instr]
+
+
+ -- | Subtract an amount from the C stack pointer
+ mkStackAllocInstr
+ :: Platform
+ -> Int
+ -> [instr]
+
+ -- | Add an amount to the C stack pointer
+ mkStackDeallocInstr
+ :: Platform
+ -> Int
+ -> [instr]
diff --git a/compiler/GHC/CmmToAsm/Monad.hs b/compiler/GHC/CmmToAsm/Monad.hs
new file mode 100644
index 0000000000..c9414a2eee
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Monad.hs
@@ -0,0 +1,294 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE BangPatterns #-}
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-- The native code generator's monad.
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Monad (
+ NcgImpl(..),
+ NatM_State(..), mkNatM_State,
+
+ NatM, -- instance Monad
+ initNat,
+ addImportNat,
+ addNodeBetweenNat,
+ addImmediateSuccessorNat,
+ updateCfgNat,
+ getUniqueNat,
+ mapAccumLNat,
+ setDeltaNat,
+ getDeltaNat,
+ getThisModuleNat,
+ getBlockIdNat,
+ getNewLabelNat,
+ getNewRegNat,
+ getNewRegPairNat,
+ getPicBaseMaybeNat,
+ getPicBaseNat,
+ getDynFlags,
+ getModLoc,
+ getFileId,
+ getDebugBlock,
+
+ DwarfFiles
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.CLabel ( CLabel )
+import GHC.Cmm.DebugBlock
+import FastString ( FastString )
+import UniqFM
+import UniqSupply
+import Unique ( Unique )
+import GHC.Driver.Session
+import Module
+
+import Control.Monad ( ap )
+
+import GHC.CmmToAsm.Instr
+import Outputable (SDoc, pprPanic, ppr)
+import GHC.Cmm (RawCmmDecl, RawCmmStatics)
+import GHC.CmmToAsm.CFG
+
+data NcgImpl statics instr jumpDest = NcgImpl {
+ cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
+ generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
+ getJumpDestBlockId :: jumpDest -> Maybe BlockId,
+ canShortcut :: instr -> Maybe jumpDest,
+ shortcutStatics :: (BlockId -> Maybe jumpDest) -> statics -> statics,
+ shortcutJump :: (BlockId -> Maybe jumpDest) -> instr -> instr,
+ pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc,
+ maxSpillSlots :: Int,
+ allocatableRegs :: [RealReg],
+ ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
+ ncgAllocMoreStack :: Int -> NatCmmDecl statics instr
+ -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
+ -- ^ The list of block ids records the redirected jumps to allow us to update
+ -- the CFG.
+ ncgMakeFarBranches :: LabelMap RawCmmStatics -> [NatBasicBlock instr] -> [NatBasicBlock instr],
+ extractUnwindPoints :: [instr] -> [UnwindPoint],
+ -- ^ given the instruction sequence of a block, produce a list of
+ -- the block's 'UnwindPoint's
+ -- See Note [What is this unwinding business?] in Debug
+ -- and Note [Unwinding information in the NCG] in this module.
+ invertCondBranches :: Maybe CFG -> LabelMap RawCmmStatics -> [NatBasicBlock instr]
+ -> [NatBasicBlock instr]
+ -- ^ Turn the sequence of `jcc l1; jmp l2` into `jncc l2; <block_l1>`
+ -- when possible.
+ }
+
+data NatM_State
+ = NatM_State {
+ natm_us :: UniqSupply,
+ natm_delta :: Int,
+ natm_imports :: [(CLabel)],
+ natm_pic :: Maybe Reg,
+ natm_dflags :: DynFlags,
+ natm_this_module :: Module,
+ natm_modloc :: ModLocation,
+ natm_fileid :: DwarfFiles,
+ natm_debug_map :: LabelMap DebugBlock,
+ natm_cfg :: CFG
+ -- ^ Having a CFG with additional information is essential for some
+ -- operations. However, we can't reconstruct all information once we
+ -- have generated instructions, so instead we update the CFG as we go.
+ }
+
+type DwarfFiles = UniqFM (FastString, Int)
+
+newtype NatM result = NatM (NatM_State -> (result, NatM_State))
+ deriving (Functor)
+
+unNat :: NatM a -> NatM_State -> (a, NatM_State)
+unNat (NatM a) = a
+
+mkNatM_State :: UniqSupply -> Int -> DynFlags -> Module -> ModLocation ->
+ DwarfFiles -> LabelMap DebugBlock -> CFG -> NatM_State
+mkNatM_State us delta dflags this_mod
+ = \loc dwf dbg cfg ->
+ NatM_State
+ { natm_us = us
+ , natm_delta = delta
+ , natm_imports = []
+ , natm_pic = Nothing
+ , natm_dflags = dflags
+ , natm_this_module = this_mod
+ , natm_modloc = loc
+ , natm_fileid = dwf
+ , natm_debug_map = dbg
+ , natm_cfg = cfg
+ }
+
+initNat :: NatM_State -> NatM a -> (a, NatM_State)
+initNat init_st m
+ = case unNat m init_st of { (r,st) -> (r,st) }
+
+instance Applicative NatM where
+ pure = returnNat
+ (<*>) = ap
+
+instance Monad NatM where
+ (>>=) = thenNat
+
+instance MonadUnique NatM where
+ getUniqueSupplyM = NatM $ \st ->
+ case splitUniqSupply (natm_us st) of
+ (us1, us2) -> (us1, st {natm_us = us2})
+
+ getUniqueM = NatM $ \st ->
+ case takeUniqFromSupply (natm_us st) of
+ (uniq, us') -> (uniq, st {natm_us = us'})
+
+thenNat :: NatM a -> (a -> NatM b) -> NatM b
+thenNat expr cont
+ = NatM $ \st -> case unNat expr st of
+ (result, st') -> unNat (cont result) st'
+
+returnNat :: a -> NatM a
+returnNat result
+ = NatM $ \st -> (result, st)
+
+mapAccumLNat :: (acc -> x -> NatM (acc, y))
+ -> acc
+ -> [x]
+ -> NatM (acc, [y])
+
+mapAccumLNat _ b []
+ = return (b, [])
+mapAccumLNat f b (x:xs)
+ = do (b__2, x__2) <- f b x
+ (b__3, xs__2) <- mapAccumLNat f b__2 xs
+ return (b__3, x__2:xs__2)
+
+getUniqueNat :: NatM Unique
+getUniqueNat = NatM $ \ st ->
+ case takeUniqFromSupply $ natm_us st of
+ (uniq, us') -> (uniq, st {natm_us = us'})
+
+instance HasDynFlags NatM where
+ getDynFlags = NatM $ \ st -> (natm_dflags st, st)
+
+
+getDeltaNat :: NatM Int
+getDeltaNat = NatM $ \ st -> (natm_delta st, st)
+
+
+setDeltaNat :: Int -> NatM ()
+setDeltaNat delta = NatM $ \ st -> ((), st {natm_delta = delta})
+
+
+getThisModuleNat :: NatM Module
+getThisModuleNat = NatM $ \ st -> (natm_this_module st, st)
+
+
+addImportNat :: CLabel -> NatM ()
+addImportNat imp
+ = NatM $ \ st -> ((), st {natm_imports = imp : natm_imports st})
+
+updateCfgNat :: (CFG -> CFG) -> NatM ()
+updateCfgNat f
+ = NatM $ \ st -> let !cfg' = f (natm_cfg st)
+ in ((), st { natm_cfg = cfg'})
+
+-- | Record that we added a block between `from` and `old`.
+addNodeBetweenNat :: BlockId -> BlockId -> BlockId -> NatM ()
+addNodeBetweenNat from between to
+ = do df <- getDynFlags
+ let jmpWeight = fromIntegral . uncondWeight .
+ cfgWeightInfo $ df
+ updateCfgNat (updateCfg jmpWeight from between to)
+ where
+ -- When transforming A -> B to A -> A' -> B
+ -- A -> A' keeps the old edge info while
+ -- A' -> B gets the info for an unconditional
+ -- jump.
+ updateCfg weight from between old m
+ | Just info <- getEdgeInfo from old m
+ = addEdge from between info .
+ addWeightEdge between old weight .
+ delEdge from old $ m
+ | otherwise
+ = pprPanic "Failed to update cfg: Untracked edge" (ppr (from,to))
+
+
+-- | Place `succ` after `block` and change any edges
+-- block -> X to `succ` -> X
+addImmediateSuccessorNat :: BlockId -> BlockId -> NatM ()
+addImmediateSuccessorNat block succ
+ = updateCfgNat (addImmediateSuccessor block succ)
+
+getBlockIdNat :: NatM BlockId
+getBlockIdNat
+ = do u <- getUniqueNat
+ return (mkBlockId u)
+
+
+getNewLabelNat :: NatM CLabel
+getNewLabelNat
+ = blockLbl <$> getBlockIdNat
+
+
+getNewRegNat :: Format -> NatM Reg
+getNewRegNat rep
+ = do u <- getUniqueNat
+ dflags <- getDynFlags
+ return (RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep)
+
+
+getNewRegPairNat :: Format -> NatM (Reg,Reg)
+getNewRegPairNat rep
+ = do u <- getUniqueNat
+ dflags <- getDynFlags
+ let vLo = targetMkVirtualReg (targetPlatform dflags) u rep
+ let lo = RegVirtual $ targetMkVirtualReg (targetPlatform dflags) u rep
+ let hi = RegVirtual $ getHiVirtualRegFromLo vLo
+ return (lo, hi)
+
+
+getPicBaseMaybeNat :: NatM (Maybe Reg)
+getPicBaseMaybeNat
+ = NatM (\state -> (natm_pic state, state))
+
+
+getPicBaseNat :: Format -> NatM Reg
+getPicBaseNat rep
+ = do mbPicBase <- getPicBaseMaybeNat
+ case mbPicBase of
+ Just picBase -> return picBase
+ Nothing
+ -> do
+ reg <- getNewRegNat rep
+ NatM (\state -> (reg, state { natm_pic = Just reg }))
+
+getModLoc :: NatM ModLocation
+getModLoc
+ = NatM $ \ st -> (natm_modloc st, st)
+
+getFileId :: FastString -> NatM Int
+getFileId f = NatM $ \st ->
+ case lookupUFM (natm_fileid st) f of
+ Just (_,n) -> (n, st)
+ Nothing -> let n = 1 + sizeUFM (natm_fileid st)
+ fids = addToUFM (natm_fileid st) f (f,n)
+ in n `seq` fids `seq` (n, st { natm_fileid = fids })
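+-- File ids are stable: querying the same FastString twice yields the same
+-- id, while a new file gets the next consecutive id (1, 2, ... when the
+-- table starts out empty).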
+
+getDebugBlock :: Label -> NatM (Maybe DebugBlock)
+getDebugBlock l = NatM $ \st -> (mapLookup l (natm_debug_map st), st)
diff --git a/compiler/GHC/CmmToAsm/PIC.hs b/compiler/GHC/CmmToAsm/PIC.hs
new file mode 100644
index 0000000000..323d93d173
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PIC.hs
@@ -0,0 +1,837 @@
+{-
+ This module handles generation of position independent code and
+ dynamic-linking related issues for the native code generator.
+
+ This depends both the architecture and OS, so we define it here
+ instead of in one of the architecture specific modules.
+
+ Things outside this module which are related to this:
+
+ + module CLabel
+ - PIC base label (pretty printed as local label 1)
+ - DynamicLinkerLabels - several kinds:
+ CodeStub, SymbolPtr, GotSymbolPtr, GotSymbolOffset
+ - labelDynamic predicate
+ + module Cmm
+ - The GlobalReg datatype has a PicBaseReg constructor
+ - The CmmLit datatype has a CmmLabelDiffOff constructor
+ + codeGen & RTS
+ - When tablesNextToCode, no absolute addresses are stored in info tables
+ any more. Instead, offsets from the info label are used.
+ - For Win32 only, SRTs might contain addresses of __imp_ symbol pointers
+ because Win32 doesn't support external references in data sections.
+ TODO: make sure this still works, it might be bitrotted
+ + NCG
+ - The cmmToCmm pass in AsmCodeGen calls cmmMakeDynamicReference for all
+ labels.
+ - nativeCodeGen calls pprImportedSymbol and pprGotDeclaration to output
+ all the necessary stuff for imported symbols.
+ - The NCG monad keeps track of a list of imported symbols.
+ - MachCodeGen invokes initializePicBase to generate code to initialize
+ the PIC base register when needed.
+ - MachCodeGen calls cmmMakeDynamicReference whenever it uses a CLabel
+ that wasn't in the original Cmm code (e.g. floating point literals).
+-}
+
+module GHC.CmmToAsm.PIC (
+ cmmMakeDynamicReference,
+ CmmMakeDynamicReferenceM(..),
+ ReferenceKind(..),
+ needImportedSymbols,
+ pprImportedSymbol,
+ pprGotDeclaration,
+
+ initializePicBase_ppc,
+ initializePicBase_x86
+)
+
+where
+
+import GhcPrelude
+
+import qualified GHC.CmmToAsm.PPC.Instr as PPC
+import qualified GHC.CmmToAsm.PPC.Regs as PPC
+import qualified GHC.CmmToAsm.X86.Instr as X86
+
+import GHC.Platform
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Monad
+
+
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel, ForeignLabelSource(..), pprCLabel,
+ mkDynamicLinkerLabel, DynamicLinkerLabelInfo(..),
+ dynamicLinkerLabelInfo, mkPicBaseLabel,
+ labelDynamic, externallyVisibleCLabel )
+
+import GHC.Cmm.CLabel ( mkForeignLabel )
+
+
+import BasicTypes
+import Module
+
+import Outputable
+
+import GHC.Driver.Session
+import FastString
+
+
+
+--------------------------------------------------------------------------------
+-- cmmMakeDynamicReference gets called by the cmmToCmm pass for every
+-- CmmLabel in the Cmm code. It does The Right Thing(tm) to convert the
+-- CmmLabel into a position-independent, dynamic-linking-aware reference
+-- to the thing in question.
+-- Note that this also has to be called from MachCodeGen in order to
+-- access static data like floating point literals (labels that were
+-- created after the cmmToCmm pass).
+-- The function must run in a monad that can keep track of imported symbols
+-- A function for recording an imported symbol must be passed in:
+-- - addImportCmmOpt for the CmmOptM monad
+-- - addImportNat for the NatM monad.
+
+data ReferenceKind
+ = DataReference
+ | CallReference
+ | JumpReference
+ deriving(Eq)
+
+class Monad m => CmmMakeDynamicReferenceM m where
+ addImport :: CLabel -> m ()
+ getThisModule :: m Module
+
+instance CmmMakeDynamicReferenceM NatM where
+ addImport = addImportNat
+ getThisModule = getThisModuleNat
+
+cmmMakeDynamicReference
+ :: CmmMakeDynamicReferenceM m
+ => DynFlags
+ -> ReferenceKind -- what kind of reference this is: data, call or jump
+ -> CLabel -- the label
+ -> m CmmExpr
+
+cmmMakeDynamicReference dflags referenceKind lbl
+ | Just _ <- dynamicLinkerLabelInfo lbl
+ = return $ CmmLit $ CmmLabel lbl -- already processed it, pass through
+
+ | otherwise
+ = do this_mod <- getThisModule
+ case howToAccessLabel
+ dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ this_mod
+ referenceKind lbl of
+
+ AccessViaStub -> do
+ let stub = mkDynamicLinkerLabel CodeStub lbl
+ addImport stub
+ return $ CmmLit $ CmmLabel stub
+
+ AccessViaSymbolPtr -> do
+ let symbolPtr = mkDynamicLinkerLabel SymbolPtr lbl
+ addImport symbolPtr
+ return $ CmmLoad (cmmMakePicReference dflags symbolPtr) (bWord dflags)
+
+ AccessDirectly -> case referenceKind of
+ -- for data, we might have to make some calculations:
+ DataReference -> return $ cmmMakePicReference dflags lbl
+ -- all currently supported processors support
+ -- PC-relative branch and call instructions,
+ -- so just jump there if it's a call or a jump
+ _ -> return $ CmmLit $ CmmLabel lbl
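+-- As an illustration: on an ELF target built with -dynamic, a
+-- DataReference to a label from another package is classified as
+-- AccessViaSymbolPtr, so the result is a CmmLoad of the (PIC-relative)
+-- SymbolPtr label, whereas a CallReference to the same label compiled
+-- without -fPIC is still accessed directly and comes back as a plain
+-- CmmLabel literal.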
+
+
+-- -----------------------------------------------------------------------------
+-- Create a position independent reference to a label.
+-- (but do not bother with dynamic linking).
+-- We calculate the label's address by adding some (platform-dependent)
+-- offset to our base register; this offset is calculated by
+-- the function picRelative in the platform-dependent part below.
+
+cmmMakePicReference :: DynFlags -> CLabel -> CmmExpr
+cmmMakePicReference dflags lbl
+
+ -- Windows doesn't need PIC,
+ -- everything gets relocated at runtime
+ | OSMinGW32 <- platformOS $ targetPlatform dflags
+ = CmmLit $ CmmLabel lbl
+
+ | OSAIX <- platformOS $ targetPlatform dflags
+ = CmmMachOp (MO_Add W32)
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ -- both ABI versions default to medium code model
+ | ArchPPC_64 _ <- platformArch $ targetPlatform dflags
+ = CmmMachOp (MO_Add W32) -- code model medium
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ | (positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags)
+ && absoluteLabel lbl
+ = CmmMachOp (MO_Add (wordWidth dflags))
+ [ CmmReg (CmmGlobal PicBaseReg)
+ , CmmLit $ picRelative dflags
+ (platformArch $ targetPlatform dflags)
+ (platformOS $ targetPlatform dflags)
+ lbl ]
+
+ | otherwise
+ = CmmLit $ CmmLabel lbl
+
+
+absoluteLabel :: CLabel -> Bool
+absoluteLabel lbl
+ = case dynamicLinkerLabelInfo lbl of
+ Just (GotSymbolPtr, _) -> False
+ Just (GotSymbolOffset, _) -> False
+ _ -> True
+
+
+--------------------------------------------------------------------------------
+-- Knowledge about how special dynamic linker labels like symbol
+-- pointers, code stubs and GOT offsets look like is located in the
+-- module CLabel.
+
+-- We have to decide which labels need to be accessed
+-- indirectly or via a piece of stub code.
+data LabelAccessStyle
+ = AccessViaStub
+ | AccessViaSymbolPtr
+ | AccessDirectly
+
+howToAccessLabel
+ :: DynFlags -> Arch -> OS -> Module -> ReferenceKind -> CLabel -> LabelAccessStyle
+
+
+-- Windows
+-- In Windows speak, a "module" is a set of objects linked into the
+-- same Portable Executable (PE) file. (both .exe and .dll files are PEs).
+--
+-- If we're compiling a multi-module program then symbols from other modules
+-- are accessed by a symbol pointer named __imp_SYMBOL. At runtime we have the
+-- following.
+--
+-- (in the local module)
+-- __imp_SYMBOL: addr of SYMBOL
+--
+-- (in the other module)
+-- SYMBOL: the real function / data.
+--
+-- To access the function at SYMBOL from our local module, we just need to
+-- dereference the local __imp_SYMBOL.
+--
+-- If not compiling with -dynamic we assume that all our code will be linked
+-- into the same .exe file. In this case we always access symbols directly,
+-- and never use __imp_SYMBOL.
+--
+howToAccessLabel dflags _ OSMinGW32 this_mod _ lbl
+
+ -- Assume all symbols will be in the same PE, so just access them directly.
+ | not (gopt Opt_ExternalDynamicRefs dflags)
+ = AccessDirectly
+
+ -- If the target symbol is in another PE we need to access it via the
+ -- appropriate __imp_SYMBOL pointer.
+ | labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+ -- Target symbol is in the same PE as the caller, so just access it directly.
+ | otherwise
+ = AccessDirectly
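+-- For example, with -dynamic a reference to a symbol foo defined in
+-- another PE is compiled as an indirection through the local __imp_foo
+-- pointer, while without -dynamic every symbol is assumed to live in the
+-- same PE and is referenced directly.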
+
+
+-- Mach-O (Darwin, Mac OS X)
+--
+-- Indirect access is required in the following cases:
+-- * things imported from a dynamic library
+-- * (not on x86_64) data from a different module, if we're generating PIC code
+-- It is always possible to access something indirectly,
+-- even when it's not necessary.
+--
+howToAccessLabel dflags arch OSDarwin this_mod DataReference lbl
+ -- data access to a dynamic library goes via a symbol pointer
+ | labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+ -- when generating PIC code, all cross-module data references
+ -- must go via a symbol pointer, too, because the assembler
+ -- cannot generate code for a label difference where one
+ -- label is undefined. Doesn't apply to x86_64.
+ -- Unfortunately, we don't know whether it's cross-module,
+ -- so we do it for all externally visible labels.
+ -- This is a slight waste of time and space, but otherwise
+ -- we'd need to pass the current Module all the way in to
+ -- this function.
+ | arch /= ArchX86_64
+ , positionIndependent dflags && externallyVisibleCLabel lbl
+ = AccessViaSymbolPtr
+
+ | otherwise
+ = AccessDirectly
+
+howToAccessLabel dflags arch OSDarwin this_mod JumpReference lbl
+ -- dyld code stubs don't work for tailcalls because the
+ -- stack alignment is only right for regular calls.
+ -- Therefore, we have to go via a symbol pointer:
+ | arch == ArchX86 || arch == ArchX86_64
+ , labelDynamic dflags this_mod lbl
+ = AccessViaSymbolPtr
+
+
+howToAccessLabel dflags arch OSDarwin this_mod _ lbl
+ -- Code stubs are the usual method of choice for imported code;
+ -- not needed on x86_64 because Apple's new linker, ld64, generates
+ -- them automatically.
+ | arch /= ArchX86_64
+ , labelDynamic dflags this_mod lbl
+ = AccessViaStub
+
+ | otherwise
+ = AccessDirectly
+
+
+----------------------------------------------------------------------------
+-- AIX
+
+-- quite simple (for now)
+howToAccessLabel _dflags _arch OSAIX _this_mod kind _lbl
+ = case kind of
+ DataReference -> AccessViaSymbolPtr
+ CallReference -> AccessDirectly
+ JumpReference -> AccessDirectly
+
+-- ELF (Linux)
+--
+-- ELF tries to pretend to the main application code that dynamic linking does
+-- not exist. While this may sound convenient, it tends to mess things up in
+-- very bad ways, so we have to be careful when we generate code for a non-PIE
+-- main program (-dynamic but no -fPIC).
+--
+-- Indirect access is required for references to imported symbols
+-- from position independent code. It is also required from the main program
+-- when dynamic libraries containing Haskell code are used.
+
+howToAccessLabel _ (ArchPPC_64 _) os _ kind _
+ | osElfTarget os
+ = case kind of
+ -- ELF PPC64 (powerpc64-linux), AIX, MacOS 9, BeOS/PPC
+ DataReference -> AccessViaSymbolPtr
+ -- RTLD does not generate stubs for function descriptors
+ -- in tail calls. Create a symbol pointer and generate
+ -- the code to load the function descriptor at the call site.
+ JumpReference -> AccessViaSymbolPtr
+ -- regular calls are handled by the runtime linker
+ _ -> AccessDirectly
+
+howToAccessLabel dflags _ os _ _ _
+ -- no PIC -> the dynamic linker does everything for us;
+ -- if we don't dynamically link to Haskell code,
+ -- it actually manages to do so without messing things up.
+ | osElfTarget os
+ , not (positionIndependent dflags) &&
+ not (gopt Opt_ExternalDynamicRefs dflags)
+ = AccessDirectly
+
+howToAccessLabel dflags arch os this_mod DataReference lbl
+ | osElfTarget os
+ = case () of
+ -- A dynamic label needs to be accessed via a symbol pointer.
+ _ | labelDynamic dflags this_mod lbl
+ -> AccessViaSymbolPtr
+
+ -- For PowerPC32 -fPIC, we have to access even static data
+ -- via a symbol pointer (see below for an explanation why
+ -- PowerPC32 Linux is especially broken).
+ | arch == ArchPPC
+ , positionIndependent dflags
+ -> AccessViaSymbolPtr
+
+ | otherwise
+ -> AccessDirectly
+
+
+ -- In most cases, we have to avoid symbol stubs on ELF, for the following reasons:
+ -- on i386, the position-independent symbol stubs in the Procedure Linkage Table
+ -- require the address of the GOT to be loaded into register %ebx on entry.
+ -- The linker will take any reference to the symbol stub as a hint that
+ -- the label in question is a code label. When linking executables, this
+ -- will cause the linker to replace even data references to the label with
+ -- references to the symbol stub.
+
+ -- This leaves calling a (foreign) function from non-PIC code
+ -- (AccessDirectly, because we get an implicit symbol stub)
+ -- and calling functions from PIC code on non-i386 platforms (via a symbol stub)
+
+howToAccessLabel dflags arch os this_mod CallReference lbl
+ | osElfTarget os
+ , labelDynamic dflags this_mod lbl && not (positionIndependent dflags)
+ = AccessDirectly
+
+ | osElfTarget os
+ , arch /= ArchX86
+ , labelDynamic dflags this_mod lbl
+ , positionIndependent dflags
+ = AccessViaStub
+
+howToAccessLabel dflags _ os this_mod _ lbl
+ | osElfTarget os
+ = if labelDynamic dflags this_mod lbl
+ then AccessViaSymbolPtr
+ else AccessDirectly
+
+-- all other platforms
+howToAccessLabel dflags _ _ _ _ _
+ | not (positionIndependent dflags)
+ = AccessDirectly
+
+ | otherwise
+ = panic "howToAccessLabel: PIC not defined for this platform"
+
+
+
+-- -------------------------------------------------------------------
+-- | Says what we have to add to our 'PIC base register' in order to
+-- get the address of a label.
+
+picRelative :: DynFlags -> Arch -> OS -> CLabel -> CmmLit
+
+-- Darwin, but not x86_64:
+-- The PIC base register points to the PIC base label at the beginning
+-- of the current CmmDecl. We just have to use a label difference to
+-- get the offset.
+-- We have already made sure that all labels that are not from the current
+-- module are accessed indirectly ('as' can't calculate differences between
+-- undefined labels).
+picRelative dflags arch OSDarwin lbl
+ | arch /= ArchX86_64
+ = CmmLabelDiffOff lbl mkPicBaseLabel 0 (wordWidth dflags)
+
+-- On AIX we use an indirect local TOC anchored by 'gotLabel'.
+-- This way we use up only one global TOC entry per compilation-unit
+-- (this is quite similar to GCC's @-mminimal-toc@ compilation mode)
+picRelative dflags _ OSAIX lbl
+ = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags)
+
+-- PowerPC Linux:
+-- The PIC base register points to our fake GOT. Use a label difference
+-- to get the offset.
+-- We have made sure that *everything* is accessed indirectly, so this
+-- is only used for offsets from the GOT to symbol pointers inside the
+-- GOT.
+picRelative dflags ArchPPC os lbl
+ | osElfTarget os
+ = CmmLabelDiffOff lbl gotLabel 0 (wordWidth dflags)
+
+
+-- Most Linux versions:
+-- The PIC base register points to the GOT. Use foo@got for symbol
+-- pointers, and foo@gotoff for everything else.
+-- Linux and Darwin on x86_64:
+-- The PIC base register is %rip, we use foo@gotpcrel for symbol pointers,
+-- and a GotSymbolOffset label for other things.
+-- For reasons of tradition, the symbol offset label is written as a plain label.
+picRelative _ arch os lbl
+ | osElfTarget os || (os == OSDarwin && arch == ArchX86_64)
+ = let result
+ | Just (SymbolPtr, lbl') <- dynamicLinkerLabelInfo lbl
+ = CmmLabel $ mkDynamicLinkerLabel GotSymbolPtr lbl'
+
+ | otherwise
+ = CmmLabel $ mkDynamicLinkerLabel GotSymbolOffset lbl
+
+ in result
+
+picRelative _ _ _ _
+ = panic "PositionIndependentCode.picRelative undefined for this platform"
+
+
+
+--------------------------------------------------------------------------------
+
+needImportedSymbols :: DynFlags -> Arch -> OS -> Bool
+needImportedSymbols dflags arch os
+ | os == OSDarwin
+ , arch /= ArchX86_64
+ = True
+
+ | os == OSAIX
+ = True
+
+ -- PowerPC Linux: -fPIC or -dynamic
+ | osElfTarget os
+ , arch == ArchPPC
+ = positionIndependent dflags || gopt Opt_ExternalDynamicRefs dflags
+
+ -- PowerPC 64 Linux: always
+ | osElfTarget os
+ , arch == ArchPPC_64 ELF_V1 || arch == ArchPPC_64 ELF_V2
+ = True
+
+ -- i386 (and others?): -dynamic but not -fPIC
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ = gopt Opt_ExternalDynamicRefs dflags &&
+ not (positionIndependent dflags)
+
+ | otherwise
+ = False
+
+-- gotLabel
+-- The label used to refer to our "fake GOT" from
+-- position-independent code.
+gotLabel :: CLabel
+gotLabel
+ -- HACK: this label isn't really foreign
+ = mkForeignLabel
+ (fsLit ".LCTOC1")
+ Nothing ForeignLabelInThisPackage IsData
+
+
+
+--------------------------------------------------------------------------------
+-- We don't need to declare any offset tables.
+-- However, for PIC on x86, we need a small helper function.
+pprGotDeclaration :: DynFlags -> Arch -> OS -> SDoc
+pprGotDeclaration dflags ArchX86 OSDarwin
+ | positionIndependent dflags
+ = vcat [
+ text ".section __TEXT,__textcoal_nt,coalesced,no_toc",
+ text ".weak_definition ___i686.get_pc_thunk.ax",
+ text ".private_extern ___i686.get_pc_thunk.ax",
+ text "___i686.get_pc_thunk.ax:",
+ text "\tmovl (%esp), %eax",
+ text "\tret" ]
+
+pprGotDeclaration _ _ OSDarwin
+ = empty
+
+-- Emit XCOFF TOC section
+pprGotDeclaration _ _ OSAIX
+ = vcat $ [ text ".toc"
+ , text ".tc ghc_toc_table[TC],.LCTOC1"
+ , text ".csect ghc_toc_table[RW]"
+ -- See Note [.LCTOC1 in PPC PIC code]
+ , text ".set .LCTOC1,$+0x8000"
+ ]
+
+
+-- PPC 64 ELF v1 needs a Table Of Contents (TOC)
+pprGotDeclaration _ (ArchPPC_64 ELF_V1) _
+ = text ".section \".toc\",\"aw\""
+-- In ELF v2 we also need to tell the assembler that we want ABI
+-- version 2. This would normally be done at the top of the file
+-- right after a file directive, but I could not figure out how
+-- to do that.
+pprGotDeclaration _ (ArchPPC_64 ELF_V2) _
+ = vcat [ text ".abiversion 2",
+ text ".section \".toc\",\"aw\""
+ ]
+
+-- Emit GOT declaration
+-- Output whatever needs to be output once per .s file.
+pprGotDeclaration dflags arch os
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ , not (positionIndependent dflags)
+ = empty
+
+ | osElfTarget os
+ , arch /= ArchPPC_64 ELF_V1 && arch /= ArchPPC_64 ELF_V2
+ = vcat [
+ -- See Note [.LCTOC1 in PPC PIC code]
+ text ".section \".got2\",\"aw\"",
+ text ".LCTOC1 = .+32768" ]
+
+pprGotDeclaration _ _ _
+ = panic "pprGotDeclaration: no match"
+
+
+--------------------------------------------------------------------------------
+-- On Darwin, we have to generate our own stub code for lazy binding.
+-- For each processor architecture, there are two versions, one for PIC
+-- and one for non-PIC.
+--
+
+pprImportedSymbol :: DynFlags -> Platform -> CLabel -> SDoc
+pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_arch = ArchX86, platformMini_os = OSDarwin } }) importedLbl
+ | Just (CodeStub, lbl) <- dynamicLinkerLabelInfo importedLbl
+ = case positionIndependent dflags of
+ False ->
+ vcat [
+ text ".symbol_stub",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\tjmp *L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr",
+ text "L" <> pprCLabel dflags lbl
+ <> text "$stub_binder:",
+ text "\tpushl $L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr",
+ text "\tjmp dyld_stub_binding_helper"
+ ]
+ True ->
+ vcat [
+ text ".section __TEXT,__picsymbolstub2,"
+ <> text "symbol_stubs,pure_instructions,25",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$stub:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\tcall ___i686.get_pc_thunk.ax",
+ text "1:",
+ text "\tmovl L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr-1b(%eax),%edx",
+ text "\tjmp *%edx",
+ text "L" <> pprCLabel dflags lbl
+ <> text "$stub_binder:",
+ text "\tlea L" <> pprCLabel dflags lbl
+ <> text "$lazy_ptr-1b(%eax),%eax",
+ text "\tpushl %eax",
+ text "\tjmp dyld_stub_binding_helper"
+ ]
+ $+$ vcat [ text ".section __DATA, __la_sym_ptr"
+ <> (if positionIndependent dflags then int 2 else int 3)
+ <> text ",lazy_symbol_pointers",
+ text "L" <> pprCLabel dflags lbl <> ptext (sLit "$lazy_ptr:"),
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\t.long L" <> pprCLabel dflags lbl
+ <> text "$stub_binder"]
+
+ | Just (SymbolPtr, lbl) <- dynamicLinkerLabelInfo importedLbl
+ = vcat [
+ text ".non_lazy_symbol_pointer",
+ char 'L' <> pprCLabel dflags lbl <> text "$non_lazy_ptr:",
+ text "\t.indirect_symbol" <+> pprCLabel dflags lbl,
+ text "\t.long\t0"]
+
+ | otherwise
+ = empty
+
+
+pprImportedSymbol _ (Platform { platformMini = PlatformMini { platformMini_os = OSDarwin } }) _
+ = empty
+
+-- XCOFF / AIX
+--
+-- Similar to PPC64 ELF v1, there's a dedicated TOC register (r2). To
+-- work around the limitation of a global TOC we use an indirect TOC
+-- with the label `ghc_toc_table`.
+--
+-- See also GCC's `-mminimal-toc` compilation mode or
+-- http://www.ibm.com/developerworks/rational/library/overview-toc-aix/
+--
+-- NB: No DSO-support yet
+
+pprImportedSymbol dflags (Platform { platformMini = PlatformMini { platformMini_os = OSAIX } }) importedLbl
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> vcat [
+ text "LC.." <> pprCLabel dflags lbl <> char ':',
+ text "\t.long" <+> pprCLabel dflags lbl ]
+ _ -> empty
+
+-- ELF / Linux
+--
+-- In theory, we don't need to generate any stubs or symbol pointers
+-- by hand for Linux.
+--
+-- Reality differs from this in two areas.
+--
+-- 1) If we just use a dynamically imported symbol directly in a read-only
+-- section of the main executable (as GCC does), ld generates R_*_COPY
+-- relocations, which are fundamentally incompatible with reversed info
+-- tables. Therefore, we need a table of imported addresses in a writable
+-- section.
+-- The "official" GOT mechanism (label@got) isn't intended to be used
+-- in position dependent code, so we have to create our own "fake GOT"
+-- when not Opt_PIC && WayDyn `elem` ways dflags.
+--
+-- 2) PowerPC Linux is just plain broken.
+-- While it's theoretically possible to use GOT offsets larger
+-- than 16 bit, the standard crt*.o files don't, which leads to
+-- linker errors as soon as the GOT size exceeds 16 bit.
+-- Also, the assembler doesn't support @gotoff labels.
+-- In order to be able to use a larger GOT, we have to circumvent the
+-- entire GOT mechanism and do it ourselves (this is also what GCC does).
+
+
+-- When needImportedSymbols is defined,
+-- the NCG will keep track of all DynamicLinkerLabels it uses
+-- and output each of them using pprImportedSymbol.
+
+pprImportedSymbol dflags platform@(Platform { platformMini = PlatformMini { platformMini_arch = ArchPPC_64 _ } })
+ importedLbl
+ | osElfTarget (platformOS platform)
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> vcat [
+ text ".section \".toc\", \"aw\"",
+ text ".LC_" <> pprCLabel dflags lbl <> char ':',
+ text "\t.quad" <+> pprCLabel dflags lbl ]
+ _ -> empty
+
+pprImportedSymbol dflags platform importedLbl
+ | osElfTarget (platformOS platform)
+ = case dynamicLinkerLabelInfo importedLbl of
+ Just (SymbolPtr, lbl)
+ -> let symbolSize = case wordWidth dflags of
+ W32 -> sLit "\t.long"
+ W64 -> sLit "\t.quad"
+ _ -> panic "Unknown wordRep in pprImportedSymbol"
+
+ in vcat [
+ text ".section \".got2\", \"aw\"",
+ text ".LC_" <> pprCLabel dflags lbl <> char ':',
+ ptext symbolSize <+> pprCLabel dflags lbl ]
+
+ -- PLT code stubs are generated automatically by the dynamic linker.
+ _ -> empty
+
+pprImportedSymbol _ _ _
+ = panic "PIC.pprImportedSymbol: no match"
+
+--------------------------------------------------------------------------------
+-- Generate code to calculate the address that should be put in the
+-- PIC base register.
+-- This is called by MachCodeGen for every CmmProc that accessed the
+-- PIC base register. It adds the appropriate instructions to the
+-- top of the CmmProc.
+
+-- It is assumed that the first NatCmmDecl in the input list is a Proc
+-- and the rest are CmmDatas.
+
+-- Darwin is simple: just fetch the address of a local label.
+-- The FETCHPC pseudo-instruction is expanded to multiple instructions
+-- during pretty-printing so that we don't have to deal with the
+-- local label:
+
+-- PowerPC version:
+-- bcl 20,31,1f.
+-- 1: mflr picReg
+
+-- i386 version:
+-- call 1f
+-- 1: popl %picReg
+
+
+
+-- Get a pointer to our own fake GOT, which is defined on a per-module basis.
+-- This is exactly how GCC does it on Linux.
+
+initializePicBase_ppc
+ :: Arch -> OS -> Reg
+ -> [NatCmmDecl RawCmmStatics PPC.Instr]
+ -> NatM [NatCmmDecl RawCmmStatics PPC.Instr]
+
+initializePicBase_ppc ArchPPC os picReg
+ (CmmProc info lab live (ListGraph blocks) : statics)
+ | osElfTarget os
+ = do
+ let
+ gotOffset = PPC.ImmConstantDiff
+ (PPC.ImmCLbl gotLabel)
+ (PPC.ImmCLbl mkPicBaseLabel)
+
+ blocks' = case blocks of
+ [] -> []
+ (b:bs) -> fetchPC b : map maybeFetchPC bs
+
+ maybeFetchPC b@(BasicBlock bID _)
+ | bID `mapMember` info = fetchPC b
+ | otherwise = b
+
+ -- GCC does PIC prologs thusly:
+ -- bcl 20,31,.L1
+ -- .L1:
+ -- mflr 30
+ -- addis 30,30,.LCTOC1-.L1@ha
+ -- addi 30,30,.LCTOC1-.L1@l
+ -- TODO: below we use it over a temporary register;
+ -- it can and should be optimised by picking
+ -- the correct PIC reg.
+ fetchPC (BasicBlock bID insns) =
+ BasicBlock bID (PPC.FETCHPC picReg
+ : PPC.ADDIS picReg picReg (PPC.HA gotOffset)
+ : PPC.ADD picReg picReg
+ (PPC.RIImm (PPC.LO gotOffset))
+ : PPC.MR PPC.r30 picReg
+ : insns)
+
+ return (CmmProc info lab live (ListGraph blocks') : statics)
+
+-------------------------------------------------------------------------
+-- Load TOC into register 2
+-- PowerPC 64-bit ELF ABI 2.0 requires the address of the callee
+-- in register 12.
+-- We pass the label to FETCHTOC and create a .localentry too.
+-- TODO: Explain this better and refer to ABI spec!
+{-
+We would like to do approximately this, but spill slot allocation
+might be added before the first BasicBlock. That violates the ABI.
+
+For now we will emit the prologue code in the pretty printer,
+which is also what we do for ELF v1.
+initializePicBase_ppc (ArchPPC_64 ELF_V2) OSLinux picReg
+ (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = do
+ bID <-getUniqueM
+ return (CmmProc info lab live (ListGraph (b':entry:blocks))
+ : statics)
+ where BasicBlock entryID _ = entry
+ b' = BasicBlock bID [PPC.FETCHTOC picReg lab,
+ PPC.BCC PPC.ALWAYS entryID]
+-}
+
+initializePicBase_ppc _ _ _ _
+ = panic "initializePicBase_ppc: not needed"
+
+
+-- We cheat a bit here by defining a pseudo-instruction named FETCHGOT
+-- which pretty-prints as:
+-- call 1f
+-- 1: popl %picReg
+-- addl __GLOBAL_OFFSET_TABLE__+.-1b, %picReg
+-- (See PprMach.hs)
+
+initializePicBase_x86
+ :: Arch -> OS -> Reg
+ -> [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr]
+ -> NatM [NatCmmDecl (Alignment, RawCmmStatics) X86.Instr]
+
+initializePicBase_x86 ArchX86 os picReg
+ (CmmProc info lab live (ListGraph blocks) : statics)
+ | osElfTarget os
+ = return (CmmProc info lab live (ListGraph blocks') : statics)
+ where blocks' = case blocks of
+ [] -> []
+ (b:bs) -> fetchGOT b : map maybeFetchGOT bs
+
+ -- we want to add a FETCHGOT instruction to the beginning of
+ -- every block that is an entry point, which corresponds to
+ -- the blocks that have entries in the info-table mapping.
+ maybeFetchGOT b@(BasicBlock bID _)
+ | bID `mapMember` info = fetchGOT b
+ | otherwise = b
+
+ fetchGOT (BasicBlock bID insns) =
+ BasicBlock bID (X86.FETCHGOT picReg : insns)
+
+initializePicBase_x86 ArchX86 OSDarwin picReg
+ (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = return (CmmProc info lab live (ListGraph (block':blocks)) : statics)
+
+ where BasicBlock bID insns = entry
+ block' = BasicBlock bID (X86.FETCHPC picReg : insns)
+
+initializePicBase_x86 _ _ _ _
+ = panic "initializePicBase_x86: not needed"
+
diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
new file mode 100644
index 0000000000..02319171dc
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
@@ -0,0 +1,2455 @@
+{-# LANGUAGE CPP, GADTs #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+
+-- This is a big module, but, if you pay attention to
+-- (a) the sectioning, and (b) the type signatures,
+-- the structure should not be too overwhelming.
+
+module GHC.CmmToAsm.PPC.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.PPC.Instr
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.CPrim
+import GHC.CmmToAsm.Monad
+ ( NatM, getNewRegNat, getNewLabelNat
+ , getBlockIdNat, getPicBaseNat, getNewRegPairNat
+ , getPicBaseMaybeNat
+ )
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm.Ppr ( pprExpr )
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+
+-- The rest:
+import OrdList
+import Outputable
+import GHC.Driver.Session
+
+import Control.Monad ( mapAndUnzipM, when )
+import Data.Bits
+import Data.Word
+
+import BasicTypes
+import FastString
+import Util
+
+-- -----------------------------------------------------------------------------
+-- Top-level of the instruction selector
+
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal (pre-order?) yields the insns in the correct
+-- order.
+
+cmmTopCodeGen
+ :: RawCmmDecl
+ -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph) = do
+ let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+ dflags <- getDynFlags
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ tops = proc : concat statics
+ os = platformOS $ targetPlatform dflags
+ arch = platformArch $ targetPlatform dflags
+ case arch of
+ ArchPPC | os == OSAIX -> return tops
+ | otherwise -> do
+ picBaseMb <- getPicBaseMaybeNat
+ case picBaseMb of
+ Just picBase -> initializePicBase_ppc arch os picBase tops
+ Nothing -> return tops
+ ArchPPC_64 ELF_V1 -> fixup_entry tops
+ -- generating function descriptor is handled in
+ -- pretty printer
+ ArchPPC_64 ELF_V2 -> fixup_entry tops
+ -- generating function prologue is handled in
+ -- pretty printer
+ _ -> panic "PPC.cmmTopCodeGen: unknown arch"
+ where
+ fixup_entry (CmmProc info lab live (ListGraph (entry:blocks)) : statics)
+ = do
+ let BasicBlock bID insns = entry
+ bID' <- if lab == (blockLbl bID)
+ then newBlockId
+ else return bID
+ let b' = BasicBlock bID' insns
+ return (CmmProc info lab live (ListGraph (b':blocks)) : statics)
+    fixup_entry _ = panic "cmmTopCodeGen: Broken CmmProc"
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+basicBlockCodeGen
+ :: Block CmmNode C C
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ mid_instrs <- stmtsToInstrs stmts
+ tail_instrs <- stmtToInstrs tail
+ let instrs = mid_instrs `appOL` tail_instrs
+ -- code generation may introduce new basic block boundaries, which
+ -- are indicated by the NEWBLOCK instruction. We must split up the
+ -- instruction stream into basic blocks again. Also, we extract
+ -- LDATAs here too.
+ let
+ (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+ return (BasicBlock id top : other_blocks, statics)
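+
+-- For illustration only (not part of the generator): a stream such as
+--   [i1, NEWBLOCK b2, i2, LDATA sec d, i3]
+-- folds (right to left) into top = [i1], other_blocks = [BasicBlock b2 [i2,i3]]
+-- and statics = [CmmData sec d]: the entry block keeps the instructions that
+-- precede the first NEWBLOCK, and each NEWBLOCK opens a fresh block.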
+
+stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock
+stmtsToInstrs stmts
+ = do instrss <- mapM stmtToInstrs stmts
+ return (concatOL instrss)
+
+stmtToInstrs :: CmmNode e x -> NatM InstrBlock
+stmtToInstrs stmt = do
+ dflags <- getDynFlags
+ case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+ CmmUnwind {} -> return nilOL
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | target32Bit (targetPlatform dflags) &&
+ isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | target32Bit (targetPlatform dflags) &&
+ isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall target result_regs args
+
+ CmmBranch id -> genBranch id
+ CmmCondBranch arg true false prediction -> do
+ b1 <- genCondJump true arg prediction
+ b2 <- genBranch false
+ return (b1 `appOL` b2)
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
+
+jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
+jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
+ where platform = targetPlatform dflags
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
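+
+-- A small example of how 'InstrBlock's combine (illustrative only):
+--
+--   unitOL i1 `appOL` toOL [i2, i3] `snocOL` i4
+--
+-- builds a tree whose left-to-right flattening is i1, i2, i3, i4; appending is
+-- O(1), and the list is only flattened when it is finally traversed.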
+
+
+-- | Registers passed up the tree. If the stix code forces the register
+-- to live in a pre-decided machine register, it comes out as @Fixed@;
+-- otherwise, it comes out as @Any@, and the parent can decide which
+-- register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
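+
+-- For illustration (a sketch only; useRegister is a hypothetical helper, not
+-- part of this module): a parent that has already chosen a destination
+-- register @dst@ consumes either variant roughly like this:
+--
+--   useRegister dst (Fixed _ reg code) = code `snocOL` MR dst reg
+--   useRegister dst (Any _ codefn)     = codefn dst
+--
+-- i.e. an @Any@ result is generated straight into @dst@, while a @Fixed@
+-- result needs an extra register-to-register move (compare assignReg_IntCode
+-- below).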
+
+
+swizzleRegisterRep :: Register -> Format -> Register
+swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
+swizzleRegisterRep (Any _ codefn) format = Any format codefn
+
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal reg
+ Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+ -- By this stage, the only MagicIds remaining should be the
+ -- ones which map to a real machine register on this
+ -- platform. Hence ...
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
+mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr
+mangleIndexTree dflags (CmmRegOff reg off)
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+mangleIndexTree _ _
+ = panic "PPC.CodeGen.mangleIndexTree: no match"
+
+-- -----------------------------------------------------------------------------
+-- Code gen for 64-bit arithmetic on 32-bit platforms
+
+{-
+Simple support for generating 64-bit code (ie, 64 bit values and 64
+bit assignments) on 32-bit platforms. Unlike the main code generator
+we merely shoot for generating working code as simply as possible, and
+pay little attention to code quality. Specifically, there is no
+attempt to deal cleverly with the fixed-vs-floating register
+distinction; all values are generated into (pairs of) floating
+registers, even if this would mean some redundant reg-reg moves as a
+result. Only one of the VRegUniques is returned, since it will be
+of the VRegUniqueLo form, and the upper-half VReg can be determined
+by applying getHiVRegFromLo to it.
+-}
+
+data ChildCode64 -- a.k.a "Register64"
+ = ChildCode64
+ InstrBlock -- code
+ Reg -- the lower 32-bit temporary which contains the
+ -- result; use getHiVRegFromLo to find the other
+ -- VRegUnique. Rules of this simplified insn
+ -- selection game are therefore that the returned
+ -- Reg may be modified
+
+
+-- | Compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+getI64Amodes :: CmmExpr -> NatM (AddrMode, AddrMode, InstrBlock)
+getI64Amodes addrTree = do
+ Amode hi_addr addr_code <- getAmode D addrTree
+ case addrOffset hi_addr 4 of
+ Just lo_addr -> return (hi_addr, lo_addr, addr_code)
+ Nothing -> do (hi_ptr, code) <- getSomeReg addrTree
+ return (AddrRegImm hi_ptr (ImmInt 0),
+ AddrRegImm hi_ptr (ImmInt 4),
+ code)
+
+
+assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_I64Code addrTree valueTree = do
+ (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Big-endian store
+ mov_hi = ST II32 rhi hi_addr
+ mov_lo = ST II32 rlo lo_addr
+ return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
+
+
+assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = MR r_dst_lo r_src_lo
+ mov_hi = MR r_dst_hi r_src_hi
+ return (
+ vcode `snocOL` mov_lo `snocOL` mov_hi
+ )
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(powerpc): invalid lvalue"
+
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
+ (hi_addr, lo_addr, addr_code) <- getI64Amodes addrTree
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = LD II32 rhi hi_addr
+ mov_lo = LD II32 rlo lo_addr
+ return $ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
+ = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
+
+iselExpr64 (CmmLit (CmmInt i _)) = do
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ half0 = fromIntegral (fromIntegral i :: Word16)
+ half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
+ half2 = fromIntegral (fromIntegral (i `shiftR` 32) :: Word16)
+ half3 = fromIntegral (fromIntegral (i `shiftR` 48) :: Word16)
+
+ code = toOL [
+ LIS rlo (ImmInt half1),
+ OR rlo rlo (RIImm $ ImmInt half0),
+ LIS rhi (ImmInt half3),
+ OR rhi rhi (RIImm $ ImmInt half2)
+ ]
+ return (ChildCode64 code rlo)
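+-- Worked example (illustrative): for the literal 0x0123456789ABCDEF the four
+-- halves are half3 = 0x0123, half2 = 0x4567, half1 = 0x89AB, half0 = 0xCDEF,
+-- so rlo ends up holding 0x89ABCDEF and rhi 0x01234567.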
+
+iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ ADDC rlo r1lo r2lo,
+ ADDE rhi r1hi r2hi ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ SUBFC rlo r2lo (RIReg r1lo),
+ SUBFE rhi r2hi r1hi ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_UU_Conv W32 W64) [expr]) = do
+ (expr_reg,expr_code) <- getSomeReg expr
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = LI rhi (ImmInt 0)
+ mov_lo = MR rlo expr_reg
+ return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ (expr_reg,expr_code) <- getSomeReg expr
+ (rlo, rhi) <- getNewRegPairNat II32
+ let mov_hi = SRA II32 rhi expr_reg (RIImm (ImmInt 31))
+ mov_lo = MR rlo expr_reg
+ return $ ChildCode64 (expr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+iselExpr64 expr
+ = pprPanic "iselExpr64(powerpc)" (pprExpr expr)
+
+
+
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do dflags <- getDynFlags
+ getRegister' dflags e
+
+getRegister' :: DynFlags -> CmmExpr -> NatM Register
+
+getRegister' dflags (CmmReg (CmmGlobal PicBaseReg))
+ | OSAIX <- platformOS (targetPlatform dflags) = do
+ let code dst = toOL [ LD II32 dst tocAddr ]
+ tocAddr = AddrRegImm toc (ImmLit (text "ghc_toc_table[TC]"))
+ return (Any II32 code)
+ | target32Bit (targetPlatform dflags) = do
+ reg <- getPicBaseNat $ archWordFormat (target32Bit (targetPlatform dflags))
+ return (Fixed (archWordFormat (target32Bit (targetPlatform dflags)))
+ reg nilOL)
+ | otherwise = return (Fixed II64 toc nilOL)
+
+getRegister' dflags (CmmReg reg)
+ = return (Fixed (cmmTypeFormat (cmmRegType dflags reg))
+ (getRegisterReg (targetPlatform dflags) reg) nilOL)
+
+getRegister' dflags tree@(CmmRegOff _ _)
+ = getRegister' dflags (mangleIndexTree dflags tree)
+
+ -- for 32-bit architectures, support some 64 -> 32 bit conversions:
+ -- TO_W_(x), TO_W_(x >> 32)
+
+getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' dflags (CmmMachOp (MO_UU_Conv W64 W32) [x])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' dflags (CmmMachOp (MO_SS_Conv W64 W32) [x])
+ | target32Bit (targetPlatform dflags) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' dflags (CmmLoad mem pk)
+ | not (isWord64 pk) = do
+ let platform = targetPlatform dflags
+ Amode addr addr_code <- getAmode D mem
+ let code dst = ASSERT((targetClassOfReg platform dst == RcDouble) == isFloatType pk)
+ addr_code `snocOL` LD format dst addr
+ return (Any format code)
+ | not (target32Bit (targetPlatform dflags)) = do
+ Amode addr addr_code <- getAmode DS mem
+ let code dst = addr_code `snocOL` LD II64 dst addr
+ return (Any II64 code)
+
+ where format = cmmTypeFormat pk
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_XX_Conv W8 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+getRegister' _ (CmmMachOp (MO_XX_Conv W8 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II8 dst addr))
+
+-- Note: there is no Load Byte Arithmetic instruction, so no signed case here
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LD II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II32 (\dst -> addr_code `snocOL` LA II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LA II16 dst addr))
+
+getRegister' _ (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad mem _]) = do
+ Amode addr addr_code <- getAmode D mem
+ return (Any II64 (\dst -> addr_code `snocOL` LD II32 dst addr))
+
+getRegister' _ (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad mem _]) = do
+ -- lwa is DS-form. See Note [Power instruction format]
+ Amode addr addr_code <- getAmode DS mem
+ return (Any II64 (\dst -> addr_code `snocOL` LA II32 dst addr))
+
+getRegister' dflags (CmmMachOp mop [x]) -- unary MachOps
+ = case mop of
+ MO_Not rep -> triv_ucode_int rep NOT
+
+ MO_F_Neg w -> triv_ucode_float w FNEG
+ MO_S_Neg w -> triv_ucode_int w NEG
+
+ MO_FF_Conv W64 W32 -> trivialUCode FF32 FRSP x
+ MO_FF_Conv W32 W64 -> conversionNop FF64 x
+
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+ MO_SS_Conv from to
+ | from >= to -> conversionNop (intFormat to) x
+ | otherwise -> triv_ucode_int to (EXTS (intFormat from))
+
+ MO_UU_Conv from to
+ | from >= to -> conversionNop (intFormat to) x
+ | otherwise -> clearLeft from to
+
+ MO_XX_Conv _ to -> conversionNop (intFormat to) x
+
+ _ -> panic "PPC.CodeGen.getRegister: no match"
+
+ where
+ triv_ucode_int width instr = trivialUCode (intFormat width) instr x
+ triv_ucode_float width instr = trivialUCode (floatFormat width) instr x
+
+ conversionNop new_format expr
+ = do e_code <- getRegister' dflags expr
+ return (swizzleRegisterRep e_code new_format)
+
+ clearLeft from to
+ = do (src1, code1) <- getSomeReg x
+ let arch_fmt = intFormat (wordWidth dflags)
+ arch_bits = widthInBits (wordWidth dflags)
+ size = widthInBits from
+ code dst = code1 `snocOL`
+ CLRLI arch_fmt dst src1 (arch_bits - size)
+ return (Any (intFormat to) code)
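+      -- For example (illustrative): zero-extending a W8 value on a 32-bit
+      -- target clears the leftmost 32 - 8 = 24 bits, keeping only the low byte.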
+
+getRegister' _ (CmmMachOp mop [x, y]) -- dyadic PrimOps
+ = case mop of
+ MO_F_Eq _ -> condFltReg EQQ x y
+ MO_F_Ne _ -> condFltReg NE x y
+ MO_F_Gt _ -> condFltReg GTT x y
+ MO_F_Ge _ -> condFltReg GE x y
+ MO_F_Lt _ -> condFltReg LTT x y
+ MO_F_Le _ -> condFltReg LE x y
+
+ MO_Eq rep -> condIntReg EQQ rep x y
+ MO_Ne rep -> condIntReg NE rep x y
+
+ MO_S_Gt rep -> condIntReg GTT rep x y
+ MO_S_Ge rep -> condIntReg GE rep x y
+ MO_S_Lt rep -> condIntReg LTT rep x y
+ MO_S_Le rep -> condIntReg LE rep x y
+
+ MO_U_Gt rep -> condIntReg GU rep x y
+ MO_U_Ge rep -> condIntReg GEU rep x y
+ MO_U_Lt rep -> condIntReg LU rep x y
+ MO_U_Le rep -> condIntReg LEU rep x y
+
+ MO_F_Add w -> triv_float w FADD
+ MO_F_Sub w -> triv_float w FSUB
+ MO_F_Mul w -> triv_float w FMUL
+ MO_F_Quot w -> triv_float w FDIV
+
+ -- optimize addition with 32-bit immediate
+ -- (needed for PIC)
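+        -- The immediate is rebuilt from its @ha/@l halves; for example
+        -- (illustrative) an offset 0x1234ABCD has LO = 0xABCD, which is
+        -- negative as a signed 16-bit value (-0x5433), and HA = 0x1235,
+        -- so 0x12350000 + (-0x5433) = 0x1234ABCD, which is what the
+        -- ADDIS (HA imm) / ADD (LO imm) pair below computes.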
+ MO_Add W32 ->
+ case y of
+ CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate W32 True imm
+ -> trivialCode W32 True ADD x (CmmLit $ CmmInt imm immrep)
+ CmmLit lit
+ -> do
+ (src, srcCode) <- getSomeReg x
+ let imm = litToImm lit
+ code dst = srcCode `appOL` toOL [
+ ADDIS dst src (HA imm),
+ ADD dst dst (RIImm (LO imm))
+ ]
+ return (Any II32 code)
+ _ -> trivialCode W32 True ADD x y
+
+ MO_Add rep -> trivialCode rep True ADD x y
+ MO_Sub rep ->
+ case y of
+ CmmLit (CmmInt imm immrep) | Just _ <- makeImmediate rep True (-imm)
+ -> trivialCode rep True ADD x (CmmLit $ CmmInt (-imm) immrep)
+ _ -> case x of
+ CmmLit (CmmInt imm _)
+ | Just _ <- makeImmediate rep True imm
+ -- subfi ('subtract from' with immediate) doesn't exist
+ -> trivialCode rep True SUBFC y x
+ _ -> trivialCodeNoImm' (intFormat rep) SUBF y x
+
+ MO_Mul rep -> shiftMulCode rep True MULL x y
+ MO_S_MulMayOflo rep -> do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ format = intFormat rep
+ code dst = code1 `appOL` code2
+ `appOL` toOL [ MULLO format dst src1 src2
+ , MFOV format dst
+ ]
+ return (Any format code)
+
+ MO_S_Quot rep -> divCode rep True x y
+ MO_U_Quot rep -> divCode rep False x y
+
+ MO_S_Rem rep -> remainder rep True x y
+ MO_U_Rem rep -> remainder rep False x y
+
+ MO_And rep -> case y of
+ (CmmLit (CmmInt imm _)) | imm == -8 || imm == -4
+ -> do
+ (src, srcCode) <- getSomeReg x
+ let clear_mask = if imm == -4 then 2 else 3
+ fmt = intFormat rep
+ code dst = srcCode
+ `appOL` unitOL (CLRRI fmt dst src clear_mask)
+ return (Any fmt code)
+ _ -> trivialCode rep False AND x y
+ MO_Or rep -> trivialCode rep False OR x y
+ MO_Xor rep -> trivialCode rep False XOR x y
+
+ MO_Shl rep -> shiftMulCode rep False SL x y
+ MO_S_Shr rep -> srCode rep True SRA x y
+ MO_U_Shr rep -> srCode rep False SR x y
+ _ -> panic "PPC.CodeGen.getRegister: no match"
+
+ where
+ triv_float :: Width -> (Format -> Reg -> Reg -> Reg -> Instr) -> NatM Register
+ triv_float width instr = trivialCodeNoImm (floatFormat width) instr x y
+
+ remainder :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+ remainder rep sgn x y = do
+ let fmt = intFormat rep
+ tmp <- getNewRegNat fmt
+ code <- remainderCode rep sgn tmp x y
+ return (Any fmt code)
+
+
+getRegister' _ (CmmLit (CmmInt i rep))
+ | Just imm <- makeImmediate rep True i
+ = let
+ code dst = unitOL (LI dst imm)
+ in
+ return (Any (intFormat rep) code)
+
+getRegister' _ (CmmLit (CmmFloat f frep)) = do
+ lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
+ let format = floatFormat frep
+ code dst =
+ LDATA (Section ReadOnlyData lbl)
+ (RawCmmStatics lbl [CmmStaticLit (CmmFloat f frep)])
+ `consOL` (addr_code `snocOL` LD format dst addr)
+ return (Any format code)
+
+getRegister' dflags (CmmLit lit)
+ | target32Bit (targetPlatform dflags)
+ = let rep = cmmLitType dflags lit
+ imm = litToImm lit
+ code dst = toOL [
+ LIS dst (HA imm),
+ ADD dst dst (RIImm (LO imm))
+ ]
+ in return (Any (cmmTypeFormat rep) code)
+ | otherwise
+ = do lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
+ let rep = cmmLitType dflags lit
+ format = cmmTypeFormat rep
+ code dst =
+ LDATA (Section ReadOnlyData lbl) (RawCmmStatics lbl [CmmStaticLit lit])
+ `consOL` (addr_code `snocOL` LD format dst addr)
+ return (Any format code)
+
+getRegister' _ other = pprPanic "getRegister(ppc)" (pprExpr other)
+
+ -- extend[SU]Expr: wrap an integer expression of width `from`
+ -- in a conversion to width `to`
+extendSExpr :: Width -> Width -> CmmExpr -> CmmExpr
+extendSExpr from to x = CmmMachOp (MO_SS_Conv from to) [x]
+
+extendUExpr :: Width -> Width -> CmmExpr -> CmmExpr
+extendUExpr from to x = CmmMachOp (MO_UU_Conv from to) [x]
+
+-- -----------------------------------------------------------------------------
+-- The 'Amode' type: Memory addressing modes passed up the tree.
+
+data Amode
+ = Amode AddrMode InstrBlock
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+{- Note [Power instruction format]
+In some instructions the 16 bit offset must be a multiple of 4, i.e.
+the two least significant bits must be zero. The "Power ISA" specification
+calls these instruction formats "DS-FORM" and the instructions with
+arbitrary 16 bit offsets are "D-FORM".
+
+The Power ISA specification document can be obtained from www.power.org.
+-}
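+
+-- For example (illustrative): @ld@ (64-bit load) and @lwa@ are DS-form, so
+-- their displacement must be a multiple of 4, whereas @lwz@ is D-form and
+-- accepts any 16-bit displacement; this is why 'getAmode' takes an
+-- 'InstrForm' argument and 64-bit loads below request DS amodes.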
+data InstrForm = D | DS
+
+getAmode :: InstrForm -> CmmExpr -> NatM Amode
+getAmode inf tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getAmode inf (mangleIndexTree dflags tree)
+
+getAmode _ (CmmMachOp (MO_Sub W32) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W32 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W32 True i
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode D (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode D (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True i
+ = do
+ (reg, code) <- getSomeReg x
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode DS (CmmMachOp (MO_Sub W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ (reg', off', code') <-
+ if i `mod` 4 == 0
+ then do return (reg, off, code)
+ else do
+ tmp <- getNewRegNat II64
+ return (tmp, ImmInt 0,
+ code `snocOL` ADD tmp reg (RIImm off))
+ return (Amode (AddrRegImm reg' off') code')
+
+getAmode DS (CmmMachOp (MO_Add W64) [x, CmmLit (CmmInt i _)])
+ | Just off <- makeImmediate W64 True i
+ = do
+ (reg, code) <- getSomeReg x
+ (reg', off', code') <-
+ if i `mod` 4 == 0
+ then do return (reg, off, code)
+ else do
+ tmp <- getNewRegNat II64
+ return (tmp, ImmInt 0,
+ code `snocOL` ADD tmp reg (RIImm off))
+ return (Amode (AddrRegImm reg' off') code')
+
+ -- optimize addition with 32-bit immediate
+ -- (needed for PIC)
+getAmode _ (CmmMachOp (MO_Add W32) [x, CmmLit lit])
+ = do
+ dflags <- getDynFlags
+ (src, srcCode) <- getSomeReg x
+ let imm = litToImm lit
+ case () of
+ _ | OSAIX <- platformOS (targetPlatform dflags)
+ , isCmmLabelType lit ->
+ -- HA16/LO16 relocations on labels not supported on AIX
+ return (Amode (AddrRegImm src imm) srcCode)
+ | otherwise -> do
+ tmp <- getNewRegNat II32
+ let code = srcCode `snocOL` ADDIS tmp src (HA imm)
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+ where
+ isCmmLabelType (CmmLabel {}) = True
+ isCmmLabelType (CmmLabelOff {}) = True
+ isCmmLabelType (CmmLabelDiffOff {}) = True
+ isCmmLabelType _ = False
+
+getAmode _ (CmmLit lit)
+ = do
+ dflags <- getDynFlags
+ case platformArch $ targetPlatform dflags of
+ ArchPPC -> do
+ tmp <- getNewRegNat II32
+ let imm = litToImm lit
+ code = unitOL (LIS tmp (HA imm))
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+ _ -> do -- TODO: Load from TOC,
+ -- see getRegister' _ (CmmLit lit)
+ tmp <- getNewRegNat II64
+ let imm = litToImm lit
+ code = toOL [
+ LIS tmp (HIGHESTA imm),
+ OR tmp tmp (RIImm (HIGHERA imm)),
+ SL II64 tmp tmp (RIImm (ImmInt 32)),
+ ORIS tmp tmp (HA imm)
+ ]
+ return (Amode (AddrRegImm tmp (LO imm)) code)
+
+getAmode _ (CmmMachOp (MO_Add W32) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+
+getAmode _ (CmmMachOp (MO_Add W64) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+
+getAmode _ other
+ = do
+ (reg, code) <- getSomeReg other
+ let
+ off = ImmInt 0
+ return (Amode (AddrRegImm reg off) code)
+
+
+-- The 'CondCode' type: Condition codes passed up the tree.
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+-- Set up a condition code for a conditional branch.
+
+getCondCode :: CmmExpr -> NatM CondCode
+
+-- almost the same as everywhere else - but we need to
+-- extend small integers to 32 bit or 64 bit first
+
+getCondCode (CmmMachOp mop [x, y])
+ = do
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ MO_F_Lt W32 -> condFltCode LTT x y
+ MO_F_Le W32 -> condFltCode LE x y
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode LTT x y
+ MO_F_Le W64 -> condFltCode LE x y
+
+ MO_Eq rep -> condIntCode EQQ rep x y
+ MO_Ne rep -> condIntCode NE rep x y
+
+ MO_S_Gt rep -> condIntCode GTT rep x y
+ MO_S_Ge rep -> condIntCode GE rep x y
+ MO_S_Lt rep -> condIntCode LTT rep x y
+ MO_S_Le rep -> condIntCode LE rep x y
+
+ MO_U_Gt rep -> condIntCode GU rep x y
+ MO_U_Ge rep -> condIntCode GEU rep x y
+ MO_U_Lt rep -> condIntCode LU rep x y
+ MO_U_Le rep -> condIntCode LEU rep x y
+
+ _ -> pprPanic "getCondCode(powerpc)" (pprMachOp mop)
+
+getCondCode _ = panic "getCondCode(2)(powerpc)"
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond width x y = do
+ dflags <- getDynFlags
+ condIntCode' (target32Bit (targetPlatform dflags)) cond width x y
+
+condIntCode' :: Bool -> Cond -> Width -> CmmExpr -> CmmExpr -> NatM CondCode
+
+-- simple code for 64-bit on 32-bit platforms
+condIntCode' True cond W64 x y
+ | condUnsigned cond
+ = do
+ ChildCode64 code_x x_lo <- iselExpr64 x
+ ChildCode64 code_y y_lo <- iselExpr64 y
+ let x_hi = getHiVRegFromLo x_lo
+ y_hi = getHiVRegFromLo y_lo
+ end_lbl <- getBlockIdNat
+ let code = code_x `appOL` code_y `appOL` toOL
+ [ CMPL II32 x_hi (RIReg y_hi)
+ , BCC NE end_lbl Nothing
+ , CMPL II32 x_lo (RIReg y_lo)
+ , BCC ALWAYS end_lbl Nothing
+
+ , NEWBLOCK end_lbl
+ ]
+ return (CondCode False cond code)
+ | otherwise
+ = do
+ ChildCode64 code_x x_lo <- iselExpr64 x
+ ChildCode64 code_y y_lo <- iselExpr64 y
+ let x_hi = getHiVRegFromLo x_lo
+ y_hi = getHiVRegFromLo y_lo
+ end_lbl <- getBlockIdNat
+ cmp_lo <- getBlockIdNat
+ let code = code_x `appOL` code_y `appOL` toOL
+ [ CMP II32 x_hi (RIReg y_hi)
+ , BCC NE end_lbl Nothing
+ , CMP II32 x_hi (RIImm (ImmInt 0))
+ , BCC LE cmp_lo Nothing
+ , CMPL II32 x_lo (RIReg y_lo)
+ , BCC ALWAYS end_lbl Nothing
+ , NEWBLOCK cmp_lo
+ , CMPL II32 y_lo (RIReg x_lo)
+ , BCC ALWAYS end_lbl Nothing
+
+ , NEWBLOCK end_lbl
+ ]
+ return (CondCode False cond code)
+
+-- optimize pointer tag checks. Operation andi. sets condition register
+-- so cmpi ..., 0 is redundant.
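+-- For example (illustrative): a pointer-tag test of the shape (x & 7) == 0
+-- needs only a single andi. r0,x,7; its record form already updates CR0 as if
+-- the result had been compared with zero, so no separate cmpi is emitted.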
+condIntCode' _ cond _ (CmmMachOp (MO_And _) [x, CmmLit (CmmInt imm rep)])
+ (CmmLit (CmmInt 0 _))
+ | not $ condUnsigned cond,
+ Just src2 <- makeImmediate rep False imm
+ = do
+ (src1, code) <- getSomeReg x
+ let code' = code `snocOL` AND r0 src1 (RIImm src2)
+ return (CondCode False cond code')
+
+condIntCode' _ cond width x (CmmLit (CmmInt y rep))
+ | Just src2 <- makeImmediate rep (not $ condUnsigned cond) y
+ = do
+ let op_len = max W32 width
+ let extend = extendSExpr width op_len
+ (src1, code) <- getSomeReg (extend x)
+ let format = intFormat op_len
+ code' = code `snocOL`
+ (if condUnsigned cond then CMPL else CMP) format src1 (RIImm src2)
+ return (CondCode False cond code')
+
+condIntCode' _ cond width x y = do
+ let op_len = max W32 width
+ let extend = if condUnsigned cond then extendUExpr width op_len
+ else extendSExpr width op_len
+ (src1, code1) <- getSomeReg (extend x)
+ (src2, code2) <- getSomeReg (extend y)
+ let format = intFormat op_len
+ code' = code1 `appOL` code2 `snocOL`
+ (if condUnsigned cond then CMPL else CMP) format src1 (RIReg src2)
+ return (CondCode False cond code')
+
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condFltCode cond x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code' = code1 `appOL` code2 `snocOL` FCMP src1 src2
+ code'' = case cond of -- twiddle CR to handle unordered case
+ GE -> code' `snocOL` CRNOR ltbit eqbit gtbit
+ LE -> code' `snocOL` CRNOR gtbit eqbit ltbit
+ _ -> code'
+ where
+ ltbit = 0 ; eqbit = 2 ; gtbit = 1
+ return (CondCode True cond code'')
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_IntCode pk addr src = do
+ (srcReg, code) <- getSomeReg src
+ Amode dstAddr addr_code <- case pk of
+ II64 -> getAmode DS addr
+ _ -> getAmode D addr
+ return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr
+
+-- dst is a reg, but src could be anything
+assignReg_IntCode _ reg src
+ = do
+ dflags <- getDynFlags
+ let dst = getRegisterReg (targetPlatform dflags) reg
+ r <- getRegister src
+ return $ case r of
+ Any _ code -> code dst
+ Fixed _ freg fcode -> fcode `snocOL` MR dst freg
+
+
+
+-- Easy, isn't it?
+assignMem_FltCode = assignMem_IntCode
+assignReg_FltCode = assignReg_IntCode
+
+
+
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
+
+genJump (CmmLit (CmmLabel lbl)) regs
+ = return (unitOL $ JMP lbl regs)
+
+genJump tree gregs
+ = do
+ dflags <- getDynFlags
+ genJump' tree (platformToGCP (targetPlatform dflags)) gregs
+
+genJump' :: CmmExpr -> GenCCallPlatform -> [Reg] -> NatM InstrBlock
+
+genJump' tree (GCP64ELF 1) regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code
+ `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 0))
+ `snocOL` LD II64 toc (AddrRegImm target (ImmInt 8))
+ `snocOL` MTCTR r11
+ `snocOL` LD II64 r11 (AddrRegImm target (ImmInt 16))
+ `snocOL` BCTR [] Nothing regs)
+
+genJump' tree (GCP64ELF 2) regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code
+ `snocOL` MR r12 target
+ `snocOL` MTCTR r12
+ `snocOL` BCTR [] Nothing regs)
+
+genJump' tree _ regs
+ = do
+ (target,code) <- getSomeReg tree
+ return (code `snocOL` MTCTR target `snocOL` BCTR [] Nothing regs)
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+-}
+
+
+genCondJump
+ :: BlockId -- the branch target
+ -> CmmExpr -- the condition on which to branch
+ -> Maybe Bool
+ -> NatM InstrBlock
+
+genCondJump id bool prediction = do
+ CondCode _ cond code <- getCondCode bool
+ return (code `snocOL` BCC cond id prediction)
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+-- Now the biggest nightmare---calls. Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations. Apart from that, the code is easy.
+
+genCCall :: ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ unitOL LWSYNC
+genCCall (PrimTarget MO_WriteBarrier) _ _
+ = return $ unitOL LWSYNC
+
+genCCall (PrimTarget MO_Touch) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_Prefetch_Data _)) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ fmt = intFormat width
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ (instr, n_code) <- case amop of
+ AMO_Add -> getSomeRegOrImm ADD True reg_dst
+ AMO_Sub -> case n of
+ CmmLit (CmmInt i _)
+ | Just imm <- makeImmediate width True (-i)
+ -> return (ADD reg_dst reg_dst (RIImm imm), nilOL)
+ _
+ -> do
+ (n_reg, n_code) <- getSomeReg n
+ return (SUBF reg_dst n_reg reg_dst, n_code)
+ AMO_And -> getSomeRegOrImm AND False reg_dst
+ AMO_Nand -> do (n_reg, n_code) <- getSomeReg n
+ return (NAND reg_dst reg_dst n_reg, n_code)
+ AMO_Or -> getSomeRegOrImm OR False reg_dst
+ AMO_Xor -> getSomeRegOrImm XOR False reg_dst
+ Amode addr_reg addr_code <- getAmodeIndex addr
+ lbl_retry <- getBlockIdNat
+ return $ n_code `appOL` addr_code
+ `appOL` toOL [ HWSYNC
+ , BCC ALWAYS lbl_retry Nothing
+
+ , NEWBLOCK lbl_retry
+ , LDR fmt reg_dst addr_reg
+ , instr
+ , STC fmt reg_dst addr_reg
+ , BCC NE lbl_retry (Just False)
+ , ISYNC
+ ]
+ where
+ getAmodeIndex (CmmMachOp (MO_Add _) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ return (Amode (AddrRegReg regX regY) (codeX `appOL` codeY))
+ getAmodeIndex other
+ = do
+ (reg, code) <- getSomeReg other
+ return (Amode (AddrRegReg r0 reg) code) -- NB: r0 is 0 here!
+ getSomeRegOrImm op sign dst
+ = case n of
+ CmmLit (CmmInt i _) | Just imm <- makeImmediate width sign i
+ -> return (op dst dst (RIImm imm), nilOL)
+ _
+ -> do
+ (n_reg, n_code) <- getSomeReg n
+ return (op dst dst (RIReg n_reg), n_code)
+
+genCCall (PrimTarget (MO_AtomicRead width)) [dst] [addr]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ fmt = intFormat width
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ form = if widthInBits width == 64 then DS else D
+ Amode addr_reg addr_code <- getAmode form addr
+ lbl_end <- getBlockIdNat
+ return $ addr_code `appOL` toOL [ HWSYNC
+ , LD fmt reg_dst addr_reg
+ , CMP fmt reg_dst (RIReg reg_dst)
+ , BCC NE lbl_end (Just False)
+ , BCC ALWAYS lbl_end Nothing
+ -- See Note [Seemingly useless cmp and bne]
+ , NEWBLOCK lbl_end
+ , ISYNC
+ ]
+
+-- Note [Seemingly useless cmp and bne]
+-- In Power ISA, Book II, Section 4.4.1, Instruction Synchronize Instruction
+-- the second paragraph says that isync may complete before storage accesses
+-- "associated" with a preceding instruction have been performed. The cmp
+-- operation and the following bne introduce a data and control dependency
+-- on the load instruction (See also Power ISA, Book II, Appendix B.2.3, Safe
+-- Fetch).
+-- This is also what gcc does.
+
+
+genCCall (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do
+ code <- assignMem_IntCode (intFormat width) addr val
+ return $ unitOL(HWSYNC) `appOL` code
+
+genCCall (PrimTarget (MO_Clz width)) [dst] [src]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ if target32Bit platform && width == W64
+ then do
+ ChildCode64 code vr_lo <- iselExpr64 src
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ lbl3 <- getBlockIdNat
+ let vr_hi = getHiVRegFromLo vr_lo
+ cntlz = toOL [ CMPL II32 vr_hi (RIImm (ImmInt 0))
+ , BCC NE lbl2 Nothing
+ , BCC ALWAYS lbl1 Nothing
+
+ , NEWBLOCK lbl1
+ , CNTLZ II32 reg_dst vr_lo
+ , ADD reg_dst reg_dst (RIImm (ImmInt 32))
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl2
+ , CNTLZ II32 reg_dst vr_hi
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl3
+ ]
+ return $ code `appOL` cntlz
+ else do
+ let format = if width == W64 then II64 else II32
+ (s_reg, s_code) <- getSomeReg src
+ (pre, reg , post) <-
+ case width of
+ W64 -> return (nilOL, s_reg, nilOL)
+ W32 -> return (nilOL, s_reg, nilOL)
+ W16 -> do
+ reg_tmp <- getNewRegNat format
+ return
+ ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 65535))
+ , reg_tmp
+ , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-16)))
+ )
+ W8 -> do
+ reg_tmp <- getNewRegNat format
+ return
+ ( unitOL $ AND reg_tmp s_reg (RIImm (ImmInt 255))
+ , reg_tmp
+ , unitOL $ ADD reg_dst reg_dst (RIImm (ImmInt (-24)))
+ )
+ _ -> panic "genCall: Clz wrong format"
+ let cntlz = unitOL (CNTLZ format reg_dst reg)
+ return $ s_code `appOL` pre `appOL` cntlz `appOL` post
+
+genCCall (PrimTarget (MO_Ctz width)) [dst] [src]
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ reg_dst = getRegisterReg platform (CmmLocal dst)
+ if target32Bit platform && width == W64
+ then do
+ let format = II32
+ ChildCode64 code vr_lo <- iselExpr64 src
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ lbl3 <- getBlockIdNat
+ x' <- getNewRegNat format
+ x'' <- getNewRegNat format
+ r' <- getNewRegNat format
+ cnttzlo <- cnttz format reg_dst vr_lo
+ let vr_hi = getHiVRegFromLo vr_lo
+ cnttz64 = toOL [ CMPL format vr_lo (RIImm (ImmInt 0))
+ , BCC NE lbl2 Nothing
+ , BCC ALWAYS lbl1 Nothing
+
+ , NEWBLOCK lbl1
+ , ADD x' vr_hi (RIImm (ImmInt (-1)))
+ , ANDC x'' x' vr_hi
+ , CNTLZ format r' x''
+ -- 32 + (32 - clz(x''))
+ , SUBFC reg_dst r' (RIImm (ImmInt 64))
+ , BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl2
+ ]
+ `appOL` cnttzlo `appOL`
+ toOL [ BCC ALWAYS lbl3 Nothing
+
+ , NEWBLOCK lbl3
+ ]
+ return $ code `appOL` cnttz64
+ else do
+ let format = if width == W64 then II64 else II32
+ (s_reg, s_code) <- getSomeReg src
+ (reg_ctz, pre_code) <-
+ case width of
+ W64 -> return (s_reg, nilOL)
+ W32 -> return (s_reg, nilOL)
+ W16 -> do
+ reg_tmp <- getNewRegNat format
+ return (reg_tmp, unitOL $ ORIS reg_tmp s_reg (ImmInt 1))
+ W8 -> do
+ reg_tmp <- getNewRegNat format
+ return (reg_tmp, unitOL $ OR reg_tmp s_reg (RIImm (ImmInt 256)))
+ _ -> panic "genCall: Ctz wrong format"
+ ctz_code <- cnttz format reg_dst reg_ctz
+ return $ s_code `appOL` pre_code `appOL` ctz_code
+ where
+ -- cnttz(x) = sizeof(x) - cntlz(~x & (x - 1))
+ -- see Henry S. Warren, Hacker's Delight, p 107
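+      -- Worked example (illustrative): for an 8-bit x = 0b00011000,
+      -- x - 1 = 0b00010111 and ~x & (x - 1) = 0b00000111, whose count of
+      -- leading zeros within 8 bits is 5, giving 8 - 5 = 3 trailing zeros.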
+ cnttz format dst src = do
+ let format_bits = 8 * formatInBytes format
+ x' <- getNewRegNat format
+ x'' <- getNewRegNat format
+ r' <- getNewRegNat format
+ return $ toOL [ ADD x' src (RIImm (ImmInt (-1)))
+ , ANDC x'' x' src
+ , CNTLZ format r' x''
+ , SUBFC dst r' (RIImm (ImmInt (format_bits)))
+ ]
+
+genCCall target dest_regs argsAndHints
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case target of
+ PrimTarget (MO_S_QuotRem width) -> divOp1 platform True width
+ dest_regs argsAndHints
+ PrimTarget (MO_U_QuotRem width) -> divOp1 platform False width
+ dest_regs argsAndHints
+ PrimTarget (MO_U_QuotRem2 width) -> divOp2 platform width dest_regs
+ argsAndHints
+ PrimTarget (MO_U_Mul2 width) -> multOp2 platform width dest_regs
+ argsAndHints
+ PrimTarget (MO_Add2 _) -> add2Op platform dest_regs argsAndHints
+ PrimTarget (MO_AddWordC _) -> addcOp platform dest_regs argsAndHints
+ PrimTarget (MO_SubWordC _) -> subcOp platform dest_regs argsAndHints
+ PrimTarget (MO_AddIntC width) -> addSubCOp ADDO platform width
+ dest_regs argsAndHints
+ PrimTarget (MO_SubIntC width) -> addSubCOp SUBFO platform width
+ dest_regs argsAndHints
+ PrimTarget MO_F64_Fabs -> fabs platform dest_regs argsAndHints
+ PrimTarget MO_F32_Fabs -> fabs platform dest_regs argsAndHints
+ _ -> genCCall' dflags (platformToGCP platform)
+ target dest_regs argsAndHints
+ where divOp1 platform signed width [res_q, res_r] [arg_x, arg_y]
+ = do let reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ remainderCode width signed reg_q arg_x arg_y
+ <*> pure reg_r
+
+ divOp1 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp1"
+ divOp2 platform width [res_q, res_r]
+ [arg_x_high, arg_x_low, arg_y]
+ = do let reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ fmt = intFormat width
+ half = 4 * (formatInBytes fmt)
+ (xh_reg, xh_code) <- getSomeReg arg_x_high
+ (xl_reg, xl_code) <- getSomeReg arg_x_low
+ (y_reg, y_code) <- getSomeReg arg_y
+ s <- getNewRegNat fmt
+ b <- getNewRegNat fmt
+ v <- getNewRegNat fmt
+ vn1 <- getNewRegNat fmt
+ vn0 <- getNewRegNat fmt
+ un32 <- getNewRegNat fmt
+ tmp <- getNewRegNat fmt
+ un10 <- getNewRegNat fmt
+ un1 <- getNewRegNat fmt
+ un0 <- getNewRegNat fmt
+ q1 <- getNewRegNat fmt
+ rhat <- getNewRegNat fmt
+ tmp1 <- getNewRegNat fmt
+ q0 <- getNewRegNat fmt
+ un21 <- getNewRegNat fmt
+ again1 <- getBlockIdNat
+ no1 <- getBlockIdNat
+ then1 <- getBlockIdNat
+ endif1 <- getBlockIdNat
+ again2 <- getBlockIdNat
+ no2 <- getBlockIdNat
+ then2 <- getBlockIdNat
+ endif2 <- getBlockIdNat
+ return $ y_code `appOL` xl_code `appOL` xh_code `appOL`
+ -- see Hacker's Delight p 196 Figure 9-3
+ toOL [ -- b = 2 ^ (bits_in_word / 2)
+ LI b (ImmInt 1)
+ , SL fmt b b (RIImm (ImmInt half))
+ -- s = clz(y)
+ , CNTLZ fmt s y_reg
+ -- v = y << s
+ , SL fmt v y_reg (RIReg s)
+ -- vn1 = upper half of v
+ , SR fmt vn1 v (RIImm (ImmInt half))
+ -- vn0 = lower half of v
+ , CLRLI fmt vn0 v half
+ -- un32 = (u1 << s)
+ -- | (u0 >> (bits_in_word - s))
+ , SL fmt un32 xh_reg (RIReg s)
+ , SUBFC tmp s
+ (RIImm (ImmInt (8 * formatInBytes fmt)))
+ , SR fmt tmp xl_reg (RIReg tmp)
+ , OR un32 un32 (RIReg tmp)
+ -- un10 = u0 << s
+ , SL fmt un10 xl_reg (RIReg s)
+ -- un1 = upper half of un10
+ , SR fmt un1 un10 (RIImm (ImmInt half))
+ -- un0 = lower half of un10
+ , CLRLI fmt un0 un10 half
+ -- q1 = un32/vn1
+ , DIV fmt False q1 un32 vn1
+ -- rhat = un32 - q1*vn1
+ , MULL fmt tmp q1 (RIReg vn1)
+ , SUBF rhat tmp un32
+ , BCC ALWAYS again1 Nothing
+
+ , NEWBLOCK again1
+ -- if (q1 >= b || q1*vn0 > b*rhat + un1)
+ , CMPL fmt q1 (RIReg b)
+ , BCC GEU then1 Nothing
+ , BCC ALWAYS no1 Nothing
+
+ , NEWBLOCK no1
+ , MULL fmt tmp q1 (RIReg vn0)
+ , SL fmt tmp1 rhat (RIImm (ImmInt half))
+ , ADD tmp1 tmp1 (RIReg un1)
+ , CMPL fmt tmp (RIReg tmp1)
+ , BCC LEU endif1 Nothing
+ , BCC ALWAYS then1 Nothing
+
+ , NEWBLOCK then1
+ -- q1 = q1 - 1
+ , ADD q1 q1 (RIImm (ImmInt (-1)))
+ -- rhat = rhat + vn1
+ , ADD rhat rhat (RIReg vn1)
+ -- if (rhat < b) goto again1
+ , CMPL fmt rhat (RIReg b)
+ , BCC LTT again1 Nothing
+ , BCC ALWAYS endif1 Nothing
+
+ , NEWBLOCK endif1
+ -- un21 = un32*b + un1 - q1*v
+ , SL fmt un21 un32 (RIImm (ImmInt half))
+ , ADD un21 un21 (RIReg un1)
+ , MULL fmt tmp q1 (RIReg v)
+ , SUBF un21 tmp un21
+ -- compute second quotient digit
+ -- q0 = un21/vn1
+ , DIV fmt False q0 un21 vn1
+ -- rhat = un21- q0*vn1
+ , MULL fmt tmp q0 (RIReg vn1)
+ , SUBF rhat tmp un21
+ , BCC ALWAYS again2 Nothing
+
+ , NEWBLOCK again2
+ -- if (q0>b || q0*vn0 > b*rhat + un0)
+ , CMPL fmt q0 (RIReg b)
+ , BCC GEU then2 Nothing
+ , BCC ALWAYS no2 Nothing
+
+ , NEWBLOCK no2
+ , MULL fmt tmp q0 (RIReg vn0)
+ , SL fmt tmp1 rhat (RIImm (ImmInt half))
+ , ADD tmp1 tmp1 (RIReg un0)
+ , CMPL fmt tmp (RIReg tmp1)
+ , BCC LEU endif2 Nothing
+ , BCC ALWAYS then2 Nothing
+
+ , NEWBLOCK then2
+ -- q0 = q0 - 1
+ , ADD q0 q0 (RIImm (ImmInt (-1)))
+ -- rhat = rhat + vn1
+ , ADD rhat rhat (RIReg vn1)
+ -- if (rhat<b) goto again2
+ , CMPL fmt rhat (RIReg b)
+ , BCC LTT again2 Nothing
+ , BCC ALWAYS endif2 Nothing
+
+ , NEWBLOCK endif2
+ -- compute remainder
+ -- r = (un21*b + un0 - q0*v) >> s
+ , SL fmt reg_r un21 (RIImm (ImmInt half))
+ , ADD reg_r reg_r (RIReg un0)
+ , MULL fmt tmp q0 (RIReg v)
+ , SUBF reg_r tmp reg_r
+ , SR fmt reg_r reg_r (RIReg s)
+ -- compute quotient
+ -- q = q1*b + q0
+ , SL fmt reg_q q1 (RIImm (ImmInt half))
+ , ADD reg_q reg_q (RIReg q0)
+ ]
+ divOp2 _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp2"
+ multOp2 platform width [res_h, res_l] [arg_x, arg_y]
+ = do let reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ fmt = intFormat width
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ MULL fmt reg_l x_reg (RIReg y_reg)
+ , MULHU fmt reg_h x_reg y_reg
+ ]
+ multOp2 _ _ _ _
+ = panic "genCall: Wrong number of arguments for multOp2"
+ add2Op platform [res_h, res_l] [arg_x, arg_y]
+ = do let reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ LI reg_h (ImmInt 0)
+ , ADDC reg_l x_reg y_reg
+ , ADDZE reg_h reg_h
+ ]
+ add2Op _ _ _
+ = panic "genCCall: Wrong number of arguments/results for add2"
+
+ addcOp platform [res_r, res_c] [arg_x, arg_y]
+ = add2Op platform [res_c {-hi-}, res_r {-lo-}] [arg_x, arg_y]
+ addcOp _ _ _
+ = panic "genCCall: Wrong number of arguments/results for addc"
+
+          -- PowerPC subfc sets the carry for rT = ~(rA) + rB + 1,
+          -- which is 0 on borrow and 1 otherwise; we want 1 on borrow
+          -- and 0 otherwise, so we xor the result with 1.
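+          -- Worked example (illustrative): for 5 - 7 the subtraction
+          -- borrows, so the carry bit is 0; ADDZE leaves reg_c at 0 and
+          -- the final xor turns it into 1, signalling the borrow.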
+ subcOp platform [res_r, res_c] [arg_x, arg_y]
+ = do let reg_r = getRegisterReg platform (CmmLocal res_r)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ LI reg_c (ImmInt 0)
+ , SUBFC reg_r y_reg (RIReg x_reg)
+ , ADDZE reg_c reg_c
+ , XOR reg_c reg_c (RIImm (ImmInt 1))
+ ]
+ subcOp _ _ _
+ = panic "genCCall: Wrong number of arguments/results for subc"
+ addSubCOp instr platform width [res_r, res_c] [arg_x, arg_y]
+ = do let reg_r = getRegisterReg platform (CmmLocal res_r)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ (x_reg, x_code) <- getSomeReg arg_x
+ (y_reg, y_code) <- getSomeReg arg_y
+ return $ y_code `appOL` x_code
+ `appOL` toOL [ instr reg_r y_reg x_reg,
+ -- SUBFO argument order reversed!
+ MFOV (intFormat width) reg_c
+ ]
+ addSubCOp _ _ _ _ _
+ = panic "genCall: Wrong number of arguments/results for addC"
+ fabs platform [res] [arg]
+ = do let res_r = getRegisterReg platform (CmmLocal res)
+ (arg_reg, arg_code) <- getSomeReg arg
+ return $ arg_code `snocOL` FABS res_r arg_reg
+ fabs _ _ _
+ = panic "genCall: Wrong number of arguments/results for fabs"
+
+-- TODO: replace 'Int' by an enum such as 'PPC_64ABI'
+data GenCCallPlatform = GCP32ELF | GCP64ELF !Int | GCPAIX
+
+platformToGCP :: Platform -> GenCCallPlatform
+platformToGCP platform
+ = case platformOS platform of
+ OSAIX -> GCPAIX
+ _ -> case platformArch platform of
+ ArchPPC -> GCP32ELF
+ ArchPPC_64 ELF_V1 -> GCP64ELF 1
+ ArchPPC_64 ELF_V2 -> GCP64ELF 2
+ _ -> panic "platformToGCP: Not PowerPC"
+
+
+genCCall'
+ :: DynFlags
+ -> GenCCallPlatform
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+
+{-
+ PowerPC Linux uses the System V Release 4 Calling Convention
+ for PowerPC. It is described in the
+ "System V Application Binary Interface PowerPC Processor Supplement".
+
+ PowerPC 64 Linux uses the System V Release 4 Calling Convention for
+ 64-bit PowerPC. It is specified in
+ "64-bit PowerPC ELF Application Binary Interface Supplement 1.9"
+ (PPC64 ELF v1.9).
+
+ PowerPC 64 Linux in little endian mode uses the "Power Architecture 64-Bit
+ ELF V2 ABI Specification -- OpenPOWER ABI for Linux Supplement"
+ (PPC64 ELF v2).
+
+ AIX follows the "PowerOpen ABI: Application Binary Interface Big-Endian
+ 32-Bit Hardware Implementation"
+
+ All four conventions are similar:
+ Parameters may be passed in general-purpose registers starting at r3, in
+ floating point registers starting at f1, or on the stack.
+
+ But there are substantial differences:
+ * The number of registers used for parameter passing and the exact set of
+ nonvolatile registers differs (see MachRegs.hs).
+ * On AIX and 64-bit ELF, stack space is always reserved for parameters,
+ even if they are passed in registers. The called routine may choose to
+ save parameters from registers to the corresponding space on the stack.
+ * On AIX and 64-bit ELF, a corresponding amount of GPRs is skipped when
+ a floating point parameter is passed in an FPR.
+ * SysV insists on either passing I64 arguments on the stack, or in two GPRs,
+ starting with an odd-numbered GPR. It may skip a GPR to achieve this.
+    AIX just treats an I64 like two separate I32s (high word first).
+ * I64 and FF64 arguments are 8-byte aligned on the stack for SysV, but only
+ 4-byte aligned like everything else on AIX.
+ * The SysV spec claims that FF32 is represented as FF64 on the stack. GCC on
+ PowerPC Linux does not agree, so neither do we.
+
+ According to all conventions, the parameter area should be part of the
+ caller's stack frame, allocated in the caller's prologue code (large enough
+ to hold the parameter lists for all called routines). The NCG already
+ uses the stack for register spilling, leaving 64 bytes free at the top.
+ If we need a larger parameter area than that, we increase the size
+ of the stack frame just before ccalling.
+-}
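+
+-- A small worked example (illustrative, following stackDelta below): on 64-bit
+-- ELF v2 a call passing three word-sized arguments reserves 32 bytes of
+-- linkage area plus max 64 bytes of parameter save area, i.e. 96 bytes,
+-- already a multiple of 16.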
+
+
+genCCall' dflags gcp target dest_regs args
+ = do
+ (finalStack,passArgumentsCode,usedRegs) <- passArguments
+ (zip3 args argReps argHints)
+ allArgRegs
+ (allFPArgRegs platform)
+ initialStackOffset
+ nilOL []
+
+ (labelOrExpr, reduceToFF32) <- case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) _ -> do
+ uses_pic_base_implicitly
+ return (Left lbl, False)
+ ForeignTarget expr _ -> do
+ uses_pic_base_implicitly
+ return (Right expr, False)
+ PrimTarget mop -> outOfLineMachOp mop
+
+ let codeBefore = move_sp_down finalStack `appOL` passArgumentsCode
+ codeAfter = move_sp_up finalStack `appOL` moveResult reduceToFF32
+
+ case labelOrExpr of
+ Left lbl -> do -- the linker does all the work for us
+ return ( codeBefore
+ `snocOL` BL lbl usedRegs
+ `appOL` maybeNOP -- some ABI require a NOP after BL
+ `appOL` codeAfter)
+ Right dyn -> do -- implement call through function pointer
+ (dynReg, dynCode) <- getSomeReg dyn
+ case gcp of
+ GCP64ELF 1 -> return ( dynCode
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 40))
+ `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 0))
+ `snocOL` LD II64 toc (AddrRegImm dynReg (ImmInt 8))
+ `snocOL` MTCTR r11
+ `snocOL` LD II64 r11 (AddrRegImm dynReg (ImmInt 16))
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 40))
+ `appOL` codeAfter)
+ GCP64ELF 2 -> return ( dynCode
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 24))
+ `snocOL` MR r12 dynReg
+ `snocOL` MTCTR r12
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 24))
+ `appOL` codeAfter)
+ GCPAIX -> return ( dynCode
+ -- AIX/XCOFF follows the PowerOPEN ABI
+ -- which is quite similar to LinuxPPC64/ELFv1
+ `appOL` codeBefore
+ `snocOL` ST spFormat toc (AddrRegImm sp (ImmInt 20))
+ `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 0))
+ `snocOL` LD II32 toc (AddrRegImm dynReg (ImmInt 4))
+ `snocOL` MTCTR r11
+ `snocOL` LD II32 r11 (AddrRegImm dynReg (ImmInt 8))
+ `snocOL` BCTRL usedRegs
+ `snocOL` LD spFormat toc (AddrRegImm sp (ImmInt 20))
+ `appOL` codeAfter)
+ _ -> return ( dynCode
+ `snocOL` MTCTR dynReg
+ `appOL` codeBefore
+ `snocOL` BCTRL usedRegs
+ `appOL` codeAfter)
+ where
+ platform = targetPlatform dflags
+
+ uses_pic_base_implicitly = do
+ -- See Note [implicit register in PPC PIC code]
+ -- on why we claim to use PIC register here
+ when (positionIndependent dflags && target32Bit platform) $ do
+ _ <- getPicBaseNat $ archWordFormat True
+ return ()
+
+ initialStackOffset = case gcp of
+ GCPAIX -> 24
+ GCP32ELF -> 8
+ GCP64ELF 1 -> 48
+ GCP64ELF 2 -> 32
+ _ -> panic "genCall': unknown calling convention"
+ -- size of linkage area + size of arguments, in bytes
+ stackDelta finalStack = case gcp of
+ GCPAIX ->
+ roundTo 16 $ (24 +) $ max 32 $ sum $
+ map (widthInBytes . typeWidth) argReps
+ GCP32ELF -> roundTo 16 finalStack
+ GCP64ELF 1 ->
+ roundTo 16 $ (48 +) $ max 64 $ sum $
+ map (roundTo 8 . widthInBytes . typeWidth)
+ argReps
+ GCP64ELF 2 ->
+ roundTo 16 $ (32 +) $ max 64 $ sum $
+ map (roundTo 8 . widthInBytes . typeWidth)
+ argReps
+ _ -> panic "genCall': unknown calling conv."
+
+ argReps = map (cmmExprType dflags) args
+ (argHints, _) = foreignTargetHints target
+
+ roundTo a x | x `mod` a == 0 = x
+ | otherwise = x + a - (x `mod` a)
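+        -- e.g. roundTo 16 40 == 48 and roundTo 16 48 == 48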
+
+ spFormat = if target32Bit platform then II32 else II64
+
+ -- TODO: Do not create a new stack frame if delta is too large.
+ move_sp_down finalStack
+ | delta > stackFrameHeaderSize dflags =
+ toOL [STU spFormat sp (AddrRegImm sp (ImmInt (-delta))),
+ DELTA (-delta)]
+ | otherwise = nilOL
+ where delta = stackDelta finalStack
+ move_sp_up finalStack
+ | delta > stackFrameHeaderSize dflags =
+ toOL [ADD sp sp (RIImm (ImmInt delta)),
+ DELTA 0]
+ | otherwise = nilOL
+ where delta = stackDelta finalStack
+
+ -- A NOP instruction is required after a call (bl instruction)
+        -- on AIX and 64-bit Linux.
+ -- If the call is to a function with a different TOC (r2) the
+ -- link editor replaces the NOP instruction with a load of the TOC
+ -- from the stack to restore the TOC.
+ maybeNOP = case gcp of
+ GCP32ELF -> nilOL
+ -- See Section 3.9.4 of OpenPower ABI
+ GCPAIX -> unitOL NOP
+ -- See Section 3.5.11 of PPC64 ELF v1.9
+ GCP64ELF 1 -> unitOL NOP
+ -- See Section 2.3.6 of PPC64 ELF v2
+ GCP64ELF 2 -> unitOL NOP
+ _ -> panic "maybeNOP: Unknown PowerPC 64-bit ABI"
+
+ passArguments [] _ _ stackOffset accumCode accumUsed = return (stackOffset, accumCode, accumUsed)
+ passArguments ((arg,arg_ty,_):args) gprs fprs stackOffset
+ accumCode accumUsed | isWord64 arg_ty
+ && target32Bit (targetPlatform dflags) =
+ do
+ ChildCode64 code vr_lo <- iselExpr64 arg
+ let vr_hi = getHiVRegFromLo vr_lo
+
+ case gcp of
+ GCPAIX ->
+ do let storeWord vr (gpr:_) _ = MR gpr vr
+ storeWord vr [] offset
+ = ST II32 vr (AddrRegImm sp (ImmInt offset))
+ passArguments args
+ (drop 2 gprs)
+ fprs
+ (stackOffset+8)
+ (accumCode `appOL` code
+ `snocOL` storeWord vr_hi gprs stackOffset
+ `snocOL` storeWord vr_lo (drop 1 gprs) (stackOffset+4))
+ ((take 2 gprs) ++ accumUsed)
+ GCP32ELF ->
+ do let stackOffset' = roundTo 8 stackOffset
+ stackCode = accumCode `appOL` code
+ `snocOL` ST II32 vr_hi (AddrRegImm sp (ImmInt stackOffset'))
+ `snocOL` ST II32 vr_lo (AddrRegImm sp (ImmInt (stackOffset'+4)))
+ regCode hireg loreg =
+ accumCode `appOL` code
+ `snocOL` MR hireg vr_hi
+ `snocOL` MR loreg vr_lo
+
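+                 -- SysV requires an I64 to start in an odd-numbered GPR.
+                 -- allArgRegs is r3..r10, so the next register is
+                 -- odd-numbered exactly when an even number of argument
+                 -- GPRs is still free; otherwise one GPR is skipped.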
+ case gprs of
+ hireg : loreg : regs | even (length gprs) ->
+ passArguments args regs fprs stackOffset
+ (regCode hireg loreg) (hireg : loreg : accumUsed)
+ _skipped : hireg : loreg : regs ->
+ passArguments args regs fprs stackOffset
+ (regCode hireg loreg) (hireg : loreg : accumUsed)
+ _ -> -- only one or no regs left
+ passArguments args [] fprs (stackOffset'+8)
+ stackCode accumUsed
+ GCP64ELF _ -> panic "passArguments: 32 bit code"
+
+ passArguments ((arg,rep,hint):args) gprs fprs stackOffset accumCode accumUsed
+ | reg : _ <- regs = do
+ register <- getRegister arg_pro
+ let code = case register of
+ Fixed _ freg fcode -> fcode `snocOL` MR reg freg
+ Any _ acode -> acode reg
+ stackOffsetRes = case gcp of
+ -- The PowerOpen ABI requires that we
+ -- reserve stack slots for register
+ -- parameters
+ GCPAIX -> stackOffset + stackBytes
+ -- ... the SysV ABI 32-bit doesn't.
+ GCP32ELF -> stackOffset
+ -- ... but SysV ABI 64-bit does.
+ GCP64ELF _ -> stackOffset + stackBytes
+ passArguments args
+ (drop nGprs gprs)
+ (drop nFprs fprs)
+ stackOffsetRes
+ (accumCode `appOL` code)
+ (reg : accumUsed)
+ | otherwise = do
+ (vr, code) <- getSomeReg arg_pro
+ passArguments args
+ (drop nGprs gprs)
+ (drop nFprs fprs)
+ (stackOffset' + stackBytes)
+ (accumCode `appOL` code
+ `snocOL` ST format_pro vr stackSlot)
+ accumUsed
+ where
+ arg_pro
+ | isBitsType rep = CmmMachOp (conv_op (typeWidth rep) (wordWidth dflags)) [arg]
+ | otherwise = arg
+ format_pro
+ | isBitsType rep = intFormat (wordWidth dflags)
+ | otherwise = cmmTypeFormat rep
+ conv_op = case hint of
+ SignedHint -> MO_SS_Conv
+ _ -> MO_UU_Conv
+
+ stackOffset' = case gcp of
+ GCPAIX ->
+                     -- The 32-bit PowerOpen ABI is happy with
+                     -- 32-bit alignment ...
+ stackOffset
+ GCP32ELF
+ -- ... the SysV ABI requires 8-byte
+ -- alignment for doubles.
+ | isFloatType rep && typeWidth rep == W64 ->
+ roundTo 8 stackOffset
+ | otherwise ->
+ stackOffset
+ GCP64ELF _ ->
+ -- Everything on the stack is mapped to
+ -- 8-byte aligned doublewords
+ stackOffset
+ stackOffset''
+ | isFloatType rep && typeWidth rep == W32 =
+ case gcp of
+ -- The ELF v1 ABI Section 3.2.3 requires:
+ -- "Single precision floating point values
+ -- are mapped to the second word in a single
+ -- doubleword"
+ GCP64ELF 1 -> stackOffset' + 4
+ _ -> stackOffset'
+ | otherwise = stackOffset'
+
+ stackSlot = AddrRegImm sp (ImmInt stackOffset'')
+ (nGprs, nFprs, stackBytes, regs)
+ = case gcp of
+ GCPAIX ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 4, gprs)
+ II16 -> (1, 0, 4, gprs)
+ II32 -> (1, 0, 4, gprs)
+ -- The PowerOpen ABI requires that we skip a
+ -- corresponding number of GPRs when we use
+ -- the FPRs.
+ --
+ -- E.g. for a `double` two GPRs are skipped,
+ -- whereas for a `float` one GPR is skipped
+ -- when parameters are assigned to
+ -- registers.
+ --
+ -- The PowerOpen ABI specification can be found at
+ -- ftp://www.sourceware.org/pub/binutils/ppc-docs/ppc-poweropen/
+ FF32 -> (1, 1, 4, fprs)
+ FF64 -> (2, 1, 8, fprs)
+ II64 -> panic "genCCall' passArguments II64"
+
+ GCP32ELF ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 4, gprs)
+ II16 -> (1, 0, 4, gprs)
+ II32 -> (1, 0, 4, gprs)
+ -- ... the SysV ABI doesn't.
+ FF32 -> (0, 1, 4, fprs)
+ FF64 -> (0, 1, 8, fprs)
+ II64 -> panic "genCCall' passArguments II64"
+ GCP64ELF _ ->
+ case cmmTypeFormat rep of
+ II8 -> (1, 0, 8, gprs)
+ II16 -> (1, 0, 8, gprs)
+ II32 -> (1, 0, 8, gprs)
+ II64 -> (1, 0, 8, gprs)
+ -- The ELFv1 ABI requires that we skip a
+ -- corresponding number of GPRs when we use
+ -- the FPRs.
+ FF32 -> (1, 1, 8, fprs)
+ FF64 -> (1, 1, 8, fprs)
+
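+    -- Results come back in r3 (or in r3/r4 for a 64-bit result on
+    -- 32-bit targets, high word in r3) or in f1 for floating point.
+    -- When a 32-bit float primop was lowered to the double-precision
+    -- libm routine (reduceToFF32), FRSP rounds the f1 result back to
+    -- single precision.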
+ moveResult reduceToFF32 =
+ case dest_regs of
+ [] -> nilOL
+ [dest]
+ | reduceToFF32 && isFloat32 rep -> unitOL (FRSP r_dest f1)
+ | isFloat32 rep || isFloat64 rep -> unitOL (MR r_dest f1)
+ | isWord64 rep && target32Bit (targetPlatform dflags)
+ -> toOL [MR (getHiVRegFromLo r_dest) r3,
+ MR r_dest r4]
+ | otherwise -> unitOL (MR r_dest r3)
+ where rep = cmmRegType dflags (CmmLocal dest)
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ _ -> panic "genCCall' moveResult: Bad dest_regs"
+
+ outOfLineMachOp mop =
+ do
+ dflags <- getDynFlags
+ mopExpr <- cmmMakeDynamicReference dflags CallReference $
+ mkForeignLabel functionName Nothing ForeignLabelInThisPackage IsFunction
+ let mopLabelOrExpr = case mopExpr of
+ CmmLit (CmmLabel lbl) -> Left lbl
+ _ -> Right mopExpr
+ return (mopLabelOrExpr, reduce)
+ where
+ (functionName, reduce) = case mop of
+ MO_F32_Exp -> (fsLit "exp", True)
+ MO_F32_ExpM1 -> (fsLit "expm1", True)
+ MO_F32_Log -> (fsLit "log", True)
+ MO_F32_Log1P -> (fsLit "log1p", True)
+ MO_F32_Sqrt -> (fsLit "sqrt", True)
+ MO_F32_Fabs -> unsupported
+
+ MO_F32_Sin -> (fsLit "sin", True)
+ MO_F32_Cos -> (fsLit "cos", True)
+ MO_F32_Tan -> (fsLit "tan", True)
+
+ MO_F32_Asin -> (fsLit "asin", True)
+ MO_F32_Acos -> (fsLit "acos", True)
+ MO_F32_Atan -> (fsLit "atan", True)
+
+ MO_F32_Sinh -> (fsLit "sinh", True)
+ MO_F32_Cosh -> (fsLit "cosh", True)
+ MO_F32_Tanh -> (fsLit "tanh", True)
+ MO_F32_Pwr -> (fsLit "pow", True)
+
+ MO_F32_Asinh -> (fsLit "asinh", True)
+ MO_F32_Acosh -> (fsLit "acosh", True)
+ MO_F32_Atanh -> (fsLit "atanh", True)
+
+ MO_F64_Exp -> (fsLit "exp", False)
+ MO_F64_ExpM1 -> (fsLit "expm1", False)
+ MO_F64_Log -> (fsLit "log", False)
+ MO_F64_Log1P -> (fsLit "log1p", False)
+ MO_F64_Sqrt -> (fsLit "sqrt", False)
+ MO_F64_Fabs -> unsupported
+
+ MO_F64_Sin -> (fsLit "sin", False)
+ MO_F64_Cos -> (fsLit "cos", False)
+ MO_F64_Tan -> (fsLit "tan", False)
+
+ MO_F64_Asin -> (fsLit "asin", False)
+ MO_F64_Acos -> (fsLit "acos", False)
+ MO_F64_Atan -> (fsLit "atan", False)
+
+ MO_F64_Sinh -> (fsLit "sinh", False)
+ MO_F64_Cosh -> (fsLit "cosh", False)
+ MO_F64_Tanh -> (fsLit "tanh", False)
+ MO_F64_Pwr -> (fsLit "pow", False)
+
+ MO_F64_Asinh -> (fsLit "asinh", False)
+ MO_F64_Acosh -> (fsLit "acosh", False)
+ MO_F64_Atanh -> (fsLit "atanh", False)
+
+ MO_UF_Conv w -> (fsLit $ word2FloatLabel w, False)
+
+ MO_Memcpy _ -> (fsLit "memcpy", False)
+ MO_Memset _ -> (fsLit "memset", False)
+ MO_Memmove _ -> (fsLit "memmove", False)
+ MO_Memcmp _ -> (fsLit "memcmp", False)
+
+ MO_BSwap w -> (fsLit $ bSwapLabel w, False)
+ MO_BRev w -> (fsLit $ bRevLabel w, False)
+ MO_PopCnt w -> (fsLit $ popCntLabel w, False)
+ MO_Pdep w -> (fsLit $ pdepLabel w, False)
+ MO_Pext w -> (fsLit $ pextLabel w, False)
+ MO_Clz _ -> unsupported
+ MO_Ctz _ -> unsupported
+ MO_AtomicRMW {} -> unsupported
+ MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
+ MO_AtomicRead _ -> unsupported
+ MO_AtomicWrite _ -> unsupported
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ MO_Prefetch_Data _ -> unsupported
+        unsupported = panic ("outOfLineMachOp: " ++ show mop
+ ++ " not supported")
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch dflags expr targets
+ | OSAIX <- platformOS (targetPlatform dflags)
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let code = e_code `appOL` t_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ LD fmt tmp (AddrRegReg tableReg tmp),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+
+ | (positionIndependent dflags) || (not $ target32Bit $ targetPlatform dflags)
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let code = e_code `appOL` t_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ LD fmt tmp (AddrRegReg tableReg tmp),
+ ADD tmp tmp (RIReg tableReg),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+ | otherwise
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ let fmt = archWordFormat $ target32Bit $ targetPlatform dflags
+ sha = if target32Bit $ targetPlatform dflags then 2 else 3
+ tmp <- getNewRegNat fmt
+ lbl <- getNewLabelNat
+ let code = e_code `appOL` toOL [
+ SL fmt tmp reg (RIImm (ImmInt sha)),
+ ADDIS tmp tmp (HA (ImmCLbl lbl)),
+ LD fmt tmp (AddrRegImm tmp (LO (ImmCLbl lbl))),
+ MTCTR tmp,
+ BCTR ids (Just lbl) []
+ ]
+ return code
+ where (offset, ids) = switchTargetsToTable targets
+
+generateJumpTableForInstr :: DynFlags -> Instr
+ -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr dflags (BCTR ids (Just lbl) _) =
+ let jumpTable
+ | (positionIndependent dflags)
+ || (not $ target32Bit $ targetPlatform dflags)
+ = map jumpTableEntryRel ids
+ | otherwise = map (jumpTableEntry dflags) ids
+ where jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+ jumpTableEntryRel (Just blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0
+ (wordWidth dflags))
+ where blockLabel = blockLbl blockid
+ in Just (CmmData (Section ReadOnlyData lbl) (RawCmmStatics lbl jumpTable))
+generateJumpTableForInstr _ _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- 'condIntReg' and 'condFltReg': condition codes into registers
+
+-- Turn those condition codes into integers now (when they appear on
+-- the right hand side of an assignment).
+
+
+
+condReg :: NatM CondCode -> NatM Register
+condReg getCond = do
+ CondCode _ cond cond_code <- getCond
+ dflags <- getDynFlags
+ let
+ code dst = cond_code
+ `appOL` negate_code
+ `appOL` toOL [
+ MFCR dst,
+ RLWINM dst dst (bit + 1) 31 31
+ ]
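+        -- MFCR copies the whole condition register into dst; RLWINM then
+        -- rotates it left by (bit+1) and masks bit 31, leaving CR0 bit
+        -- 'bit' (0 = LT, 1 = GT, 2 = EQ) as the low bit of dst.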
+
+ negate_code | do_negate = unitOL (CRNOR bit bit bit)
+ | otherwise = nilOL
+
+ (bit, do_negate) = case cond of
+ LTT -> (0, False)
+ LE -> (1, True)
+ EQQ -> (2, False)
+ GE -> (0, True)
+ GTT -> (1, False)
+
+ NE -> (2, True)
+
+ LU -> (0, False)
+ LEU -> (1, True)
+ GEU -> (0, True)
+ GU -> (1, False)
+ _ -> panic "PPC.CodeGen.codeReg: no match"
+
+ format = archWordFormat $ target32Bit $ targetPlatform dflags
+ return (Any format code)
+
+condIntReg :: Cond -> Width -> CmmExpr -> CmmExpr -> NatM Register
+condIntReg cond width x y = condReg (condIntCode cond width x y)
+condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg cond x y = condReg (condFltCode cond x y)
+
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching a literal (CmmInt) as the argument, because the generic
+-- optimizer will already have handled the constant-folding.
+
+
+
+{-
+Wolfgang's PowerPC version of The Rules:
+
+A slightly modified version of The Rules to take advantage of the fact
+that PowerPC instructions work on all registers and don't implicitly
+clobber any fixed registers.
+
+* The only expression for which getRegister returns Fixed is (CmmReg reg).
+
+* If getRegister returns Any, then the code it generates may modify only:
+ (a) fresh temporaries
+ (b) the destination register
+ It may *not* modify global registers, unless the global
+ register happens to be the destination register.
+ It may not clobber any other registers. In fact, only ccalls clobber any
+ fixed registers.
+ Also, it may not modify the counter register (used by genCCall).
+
+ Corollary: If a getRegister for a subexpression returns Fixed, you need
+ not move it to a fresh temporary before evaluating the next subexpression.
+ The Fixed register won't be modified.
+ Therefore, we don't need a counterpart for the x86's getStableReg on PPC.
+
+* SDM's First Rule is valid for PowerPC, too: subexpressions can depend on
+ the value of the destination register.
+-}
+
+trivialCode
+ :: Width
+ -> Bool
+ -> (Reg -> Reg -> RI -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialCode rep signed instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate rep signed y
+ = do
+ (src1, code1) <- getSomeReg x
+ let code dst = code1 `snocOL` instr dst src1 (RIImm imm)
+ return (Any (intFormat rep) code)
+
+trivialCode rep _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let code dst = code1 `appOL` code2 `snocOL` instr dst src1 (RIReg src2)
+ return (Any (intFormat rep) code)
+
+shiftMulCode
+ :: Width
+ -> Bool
+ -> (Format-> Reg -> Reg -> RI -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+shiftMulCode width sign instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate width sign y
+ = do
+ (src1, code1) <- getSomeReg x
+ let format = intFormat width
+ let ins_fmt = intFormat (max W32 width)
+ let code dst = code1 `snocOL` instr ins_fmt dst src1 (RIImm imm)
+ return (Any format code)
+
+shiftMulCode width _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let format = intFormat width
+ let ins_fmt = intFormat (max W32 width)
+ let code dst = code1 `appOL` code2
+ `snocOL` instr ins_fmt dst src1 (RIReg src2)
+ return (Any format code)
+
+trivialCodeNoImm' :: Format -> (Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCodeNoImm' format instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let code dst = code1 `appOL` code2 `snocOL` instr dst src1 src2
+ return (Any format code)
+
+trivialCodeNoImm :: Format -> (Format -> Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCodeNoImm format instr x y
+ = trivialCodeNoImm' format (instr format) x y
+
+srCode :: Width -> Bool -> (Format-> Reg -> Reg -> RI -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+srCode width sgn instr x (CmmLit (CmmInt y _))
+ | Just imm <- makeImmediate width sgn y
+ = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ let code dst = code1 `snocOL`
+ instr (intFormat op_len) dst src1 (RIImm imm)
+ return (Any (intFormat width) code)
+
+srCode width sgn instr x y = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ (src2, code2) <- getSomeReg (extendUExpr width op_len y)
+ -- Note: Shift amount `y` is unsigned
+ let code dst = code1 `appOL` code2 `snocOL`
+ instr (intFormat op_len) dst src1 (RIReg src2)
+ return (Any (intFormat width) code)
+
+divCode :: Width -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+divCode width sgn x y = do
+ let op_len = max W32 width
+ extend = if sgn then extendSExpr else extendUExpr
+ (src1, code1) <- getSomeReg (extend width op_len x)
+ (src2, code2) <- getSomeReg (extend width op_len y)
+ let code dst = code1 `appOL` code2 `snocOL`
+ DIV (intFormat op_len) sgn dst src1 src2
+ return (Any (intFormat width) code)
+
+
+trivialUCode :: Format
+ -> (Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+trivialUCode rep instr x = do
+ (src, code) <- getSomeReg x
+ let code' dst = code `snocOL` instr dst src
+ return (Any rep code')
+
+-- There is no "remainder" instruction on the PPC, so we have to do
+-- it the hard way.
+-- The "sgn" parameter is the signedness for the division instruction
+
+remainderCode :: Width -> Bool -> Reg -> CmmExpr -> CmmExpr
+ -> NatM (Reg -> InstrBlock)
+remainderCode rep sgn reg_q arg_x arg_y = do
+ let op_len = max W32 rep
+ fmt = intFormat op_len
+ extend = if sgn then extendSExpr else extendUExpr
+ (x_reg, x_code) <- getSomeReg (extend rep op_len arg_x)
+ (y_reg, y_code) <- getSomeReg (extend rep op_len arg_y)
+ return $ \reg_r -> y_code `appOL` x_code
+ `appOL` toOL [ DIV fmt sgn reg_q x_reg y_reg
+ , MULL fmt reg_r reg_q (RIReg y_reg)
+ , SUBF reg_r reg_r x_reg
+ ]
+
+
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP fromRep toRep x = do
+ dflags <- getDynFlags
+ let arch = platformArch $ targetPlatform dflags
+ coerceInt2FP' arch fromRep toRep x
+
+coerceInt2FP' :: Arch -> Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP' ArchPPC fromRep toRep x = do
+ (src, code) <- getSomeReg x
+ lbl <- getNewLabelNat
+ itmp <- getNewRegNat II32
+ ftmp <- getNewRegNat FF64
+ dflags <- getDynFlags
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ Amode addr addr_code <- getAmode D dynRef
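+  -- The code below uses the classic double-precision bias trick: the
+  -- constant 0x4330000080000000 is the double 2^52 + 2^31.  Storing
+  -- 0x43300000 as the high word and (src XOR 0x80000000) as the low
+  -- word builds 2^52 + 2^31 + src (signed) on the stack; loading that
+  -- as FF64 and subtracting the constant leaves src as a double in dst.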
+ let
+ code' dst = code `appOL` maybe_exts `appOL` toOL [
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmInt 0x43300000 W32),
+ CmmStaticLit (CmmInt 0x80000000 W32)],
+ XORIS itmp src (ImmInt 0x8000),
+ ST II32 itmp (spRel dflags 3),
+ LIS itmp (ImmInt 0x4330),
+ ST II32 itmp (spRel dflags 2),
+ LD FF64 ftmp (spRel dflags 2)
+ ] `appOL` addr_code `appOL` toOL [
+ LD FF64 dst addr,
+ FSUB FF64 dst ftmp dst
+ ] `appOL` maybe_frsp dst
+
+ maybe_exts = case fromRep of
+ W8 -> unitOL $ EXTS II8 src src
+ W16 -> unitOL $ EXTS II16 src src
+ W32 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ maybe_frsp dst
+ = case toRep of
+ W32 -> unitOL $ FRSP dst dst
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ return (Any (floatFormat toRep) code')
+
+-- On ELF v1 Linux we use the compiler doubleword in the stack frame as
+-- scratch space; on ELF v2 Linux the same slot is the TOC pointer
+-- doubleword. The latter is only set right before a call and restored
+-- right after return from the call, so using it here is fine.
+coerceInt2FP' (ArchPPC_64 _) fromRep toRep x = do
+ (src, code) <- getSomeReg x
+ dflags <- getDynFlags
+ let
+ code' dst = code `appOL` maybe_exts `appOL` toOL [
+ ST II64 src (spRel dflags 3),
+ LD FF64 dst (spRel dflags 3),
+ FCFID dst dst
+ ] `appOL` maybe_frsp dst
+
+ maybe_exts = case fromRep of
+ W8 -> unitOL $ EXTS II8 src src
+ W16 -> unitOL $ EXTS II16 src src
+ W32 -> unitOL $ EXTS II32 src src
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ maybe_frsp dst
+ = case toRep of
+ W32 -> unitOL $ FRSP dst dst
+ W64 -> nilOL
+ _ -> panic "PPC.CodeGen.coerceInt2FP: no match"
+
+ return (Any (floatFormat toRep) code')
+
+coerceInt2FP' _ _ _ _ = panic "PPC.CodeGen.coerceInt2FP: unknown arch"
+
+
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int fromRep toRep x = do
+ dflags <- getDynFlags
+ let arch = platformArch $ targetPlatform dflags
+ coerceFP2Int' arch fromRep toRep x
+
+coerceFP2Int' :: Arch -> Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int' ArchPPC _ toRep x = do
+ dflags <- getDynFlags
+ -- the reps don't really matter: F*->FF64 and II32->I* are no-ops
+ (src, code) <- getSomeReg x
+ tmp <- getNewRegNat FF64
+ let
+ code' dst = code `appOL` toOL [
+ -- convert to int in FP reg
+ FCTIWZ tmp src,
+ -- store value (64bit) from FP to stack
+ ST FF64 tmp (spRel dflags 2),
+ -- read low word of value (high word is undefined)
+ LD II32 dst (spRel dflags 3)]
+ return (Any (intFormat toRep) code')
+
+coerceFP2Int' (ArchPPC_64 _) _ toRep x = do
+ dflags <- getDynFlags
+ -- the reps don't really matter: F*->FF64 and II64->I* are no-ops
+ (src, code) <- getSomeReg x
+ tmp <- getNewRegNat FF64
+ let
+ code' dst = code `appOL` toOL [
+ -- convert to int in FP reg
+ FCTIDZ tmp src,
+ -- store value (64bit) from FP to compiler word on stack
+ ST FF64 tmp (spRel dflags 3),
+ LD II64 dst (spRel dflags 3)]
+ return (Any (intFormat toRep) code')
+
+coerceFP2Int' _ _ _ _ = panic "PPC.CodeGen.coerceFP2Int: unknown arch"
+
+-- Note [.LCTOC1 in PPC PIC code]
+-- The .LCTOC1 label is defined to point 32768 bytes into the GOT table
+-- to make the most of the PPC's 16-bit displacements.
+-- Since a 16-bit signed offset is used (usually via addi/lwz instructions),
+-- the first element ends up at offset '-32768' relative to .LCTOC1.
+
+-- Note [implicit register in PPC PIC code]
+-- On PPC, calls in PIC code are generated by label, in the form:
+--      bl puts+32768@plt
+-- In this form the GHC NCG cannot see directly that r30 (PicBaseReg)
+-- is used, yet r30 is a required part of the PLT stub:
+-- puts+32768@plt:
+-- lwz r11,-30484(r30) ; offset in .LCTOC1
+-- mtctr r11
+-- bctr
diff --git a/compiler/GHC/CmmToAsm/PPC/Cond.hs b/compiler/GHC/CmmToAsm/PPC/Cond.hs
new file mode 100644
index 0000000000..e8efa30064
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Cond.hs
@@ -0,0 +1,63 @@
+module GHC.CmmToAsm.PPC.Cond (
+ Cond(..),
+ condNegate,
+ condUnsigned,
+ condToSigned,
+ condToUnsigned,
+)
+
+where
+
+import GhcPrelude
+
+import Panic
+
+data Cond
+ = ALWAYS
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ deriving Eq
+
+
+condNegate :: Cond -> Cond
+condNegate ALWAYS = panic "condNegate: ALWAYS"
+condNegate EQQ = NE
+condNegate GE = LTT
+condNegate GEU = LU
+condNegate GTT = LE
+condNegate GU = LEU
+condNegate LE = GTT
+condNegate LEU = GU
+condNegate LTT = GE
+condNegate LU = GEU
+condNegate NE = EQQ
+
+-- Condition utils
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
diff --git a/compiler/GHC/CmmToAsm/PPC/Instr.hs b/compiler/GHC/CmmToAsm/PPC/Instr.hs
new file mode 100644
index 0000000000..b92a952340
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Instr.hs
@@ -0,0 +1,713 @@
+{-# LANGUAGE CPP #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+
+#include "HsVersions.h"
+
+module GHC.CmmToAsm.PPC.Instr (
+ archWordFormat,
+ RI(..),
+ Instr(..),
+ stackFrameHeaderSize,
+ maxSpillSlots,
+ allocMoreStack,
+ makeFarBranches
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Driver.Session
+import GHC.Cmm
+import GHC.Cmm.Info
+import FastString
+import GHC.Cmm.CLabel
+import Outputable
+import GHC.Platform
+import UniqFM (listToUFM, lookupUFM)
+import UniqSupply
+
+import Control.Monad (replicateM)
+import Data.Maybe (fromMaybe)
+
+--------------------------------------------------------------------------------
+-- Format of a PPC memory address.
+--
+archWordFormat :: Bool -> Format
+archWordFormat is32Bit
+ | is32Bit = II32
+ | otherwise = II64
+
+
+-- | Instruction instance for powerpc
+instance Instruction Instr where
+ regUsageOfInstr = ppc_regUsageOfInstr
+ patchRegsOfInstr = ppc_patchRegsOfInstr
+ isJumpishInstr = ppc_isJumpishInstr
+ jumpDestsOfInstr = ppc_jumpDestsOfInstr
+ patchJumpInstr = ppc_patchJumpInstr
+ mkSpillInstr = ppc_mkSpillInstr
+ mkLoadInstr = ppc_mkLoadInstr
+ takeDeltaInstr = ppc_takeDeltaInstr
+ isMetaInstr = ppc_isMetaInstr
+ mkRegRegMoveInstr _ = ppc_mkRegRegMoveInstr
+ takeRegRegMoveInstr = ppc_takeRegRegMoveInstr
+ mkJumpInstr = ppc_mkJumpInstr
+ mkStackAllocInstr = ppc_mkStackAllocInstr
+ mkStackDeallocInstr = ppc_mkStackDeallocInstr
+
+
+ppc_mkStackAllocInstr :: Platform -> Int -> [Instr]
+ppc_mkStackAllocInstr platform amount
+ = ppc_mkStackAllocInstr' platform (-amount)
+
+ppc_mkStackDeallocInstr :: Platform -> Int -> [Instr]
+ppc_mkStackDeallocInstr platform amount
+ = ppc_mkStackAllocInstr' platform amount
+
+ppc_mkStackAllocInstr' :: Platform -> Int -> [Instr]
+ppc_mkStackAllocInstr' platform amount
+ | fits16Bits amount
+ = [ LD fmt r0 (AddrRegImm sp zero)
+ , STU fmt r0 (AddrRegImm sp immAmount)
+ ]
+ | otherwise
+ = [ LD fmt r0 (AddrRegImm sp zero)
+ , ADDIS tmp sp (HA immAmount)
+ , ADD tmp tmp (RIImm (LO immAmount))
+ , STU fmt r0 (AddrRegReg sp tmp)
+ ]
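+    -- The back-chain word at 0(sp) is copied via r0 to the new top of
+    -- stack while STU updates sp in the same instruction, so the back
+    -- chain stays valid when the frame is grown or shrunk here.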
+ where
+ fmt = intFormat $ widthFromBytes (platformWordSizeInBytes platform)
+ zero = ImmInt 0
+ tmp = tmpReg platform
+ immAmount = ImmInt amount
+
+--
+-- See note [extra spill slots] in X86/Instr.hs
+--
+allocMoreStack
+ :: Platform
+ -> Int
+ -> NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr
+ -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.PPC.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots (CmmProc info lbl live (ListGraph code)) = do
+ let
+ infos = mapKeys info
+ entries = case code of
+ [] -> infos
+ BasicBlock entry _ : _ -- first block is the entry point
+ | entry `elem` infos -> infos
+ | otherwise -> entry : infos
+
+ uniqs <- replicateM (length entries) getUniqueM
+
+ let
+ delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+ where x = slots * spillSlotSize -- sp delta
+
+ alloc = mkStackAllocInstr platform delta
+ dealloc = mkStackDeallocInstr platform delta
+
+ retargetList = (zip entries (map mkBlockId uniqs))
+
+ new_blockmap :: LabelMap BlockId
+ new_blockmap = mapFromList retargetList
+
+ insert_stack_insns (BasicBlock id insns)
+ | Just new_blockid <- mapLookup id new_blockmap
+ = [ BasicBlock id $ alloc ++ [BCC ALWAYS new_blockid Nothing]
+ , BasicBlock new_blockid block'
+ ]
+ | otherwise
+ = [ BasicBlock id block' ]
+ where
+ block' = foldr insert_dealloc [] insns
+
+ insert_dealloc insn r
+ -- BCTR might or might not be a non-local jump. For
+ -- "labeled-goto" we use JMP, and for "computed-goto" we
+ -- use MTCTR followed by BCTR. See 'PPC.CodeGen.genJump'.
+ = case insn of
+ JMP _ _ -> dealloc ++ (insn : r)
+ BCTR [] Nothing _ -> dealloc ++ (insn : r)
+ BCTR ids label rs -> BCTR (map (fmap retarget) ids) label rs : r
+ BCCFAR cond b p -> BCCFAR cond (retarget b) p : r
+ BCC cond b p -> BCC cond (retarget b) p : r
+ _ -> insn : r
+ -- BL and BCTRL are call-like instructions rather than
+ -- jumps, and are used only for C calls.
+
+ retarget :: BlockId -> BlockId
+ retarget b
+ = fromMaybe b (mapLookup b new_blockmap)
+
+ new_code
+ = concatMap insert_stack_insns code
+
+ -- in
+ return (CmmProc info lbl live (ListGraph new_code),retargetList)
+
+
+-- -----------------------------------------------------------------------------
+-- Machine's assembly language
+
+-- We have a few common "instructions" (nearly all the pseudo-ops) but
+-- mostly all of 'Instr' is machine-specific.
+
+-- Register or immediate
+data RI
+ = RIReg Reg
+ | RIImm Imm
+
+data Instr
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- some static data spat out during code
+ -- generation. Will be extracted before
+ -- pretty-printing.
+ | LDATA Section RawCmmStatics
+
+ -- start a new basic block. Useful during
+ -- codegen, removed later. Preceding
+ -- instruction should be a jump, as per the
+ -- invariants for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- specify current stack offset for
+ -- benefit of subsequent passes
+ | DELTA Int
+
+ -- Loads and stores.
+ | LD Format Reg AddrMode -- Load format, dst, src
+ | LDFAR Format Reg AddrMode -- Load format, dst, src 32 bit offset
+ | LDR Format Reg AddrMode -- Load and reserve format, dst, src
+ | LA Format Reg AddrMode -- Load arithmetic format, dst, src
+ | ST Format Reg AddrMode -- Store format, src, dst
+ | STFAR Format Reg AddrMode -- Store format, src, dst 32 bit offset
+ | STU Format Reg AddrMode -- Store with Update format, src, dst
+ | STC Format Reg AddrMode -- Store conditional format, src, dst
+ | LIS Reg Imm -- Load Immediate Shifted dst, src
+ | LI Reg Imm -- Load Immediate dst, src
+ | MR Reg Reg -- Move Register dst, src -- also for fmr
+
+ | CMP Format Reg RI -- format, src1, src2
+ | CMPL Format Reg RI -- format, src1, src2
+
+ | BCC Cond BlockId (Maybe Bool) -- cond, block, hint
+ | BCCFAR Cond BlockId (Maybe Bool) -- cond, block, hint
+ -- hint:
+ -- Just True: branch likely taken
+ -- Just False: branch likely not taken
+ -- Nothing: no hint
+ | JMP CLabel [Reg] -- same as branch,
+ -- but with CLabel instead of block ID
+ -- and live global registers
+ | MTCTR Reg
+ | BCTR [Maybe BlockId] (Maybe CLabel) [Reg]
+ -- with list of local destinations, and
+ -- jump table location if necessary
+ | BL CLabel [Reg] -- with list of argument regs
+ | BCTRL [Reg]
+
+ | ADD Reg Reg RI -- dst, src1, src2
+ | ADDO Reg Reg Reg -- add and set overflow
+ | ADDC Reg Reg Reg -- (carrying) dst, src1, src2
+ | ADDE Reg Reg Reg -- (extended) dst, src1, src2
+ | ADDZE Reg Reg -- (to zero extended) dst, src
+ | ADDIS Reg Reg Imm -- Add Immediate Shifted dst, src1, src2
+ | SUBF Reg Reg Reg -- dst, src1, src2 ; dst = src2 - src1
+ | SUBFO Reg Reg Reg -- subtract from and set overflow
+ | SUBFC Reg Reg RI -- (carrying) dst, src1, src2 ;
+ -- dst = src2 - src1
+ | SUBFE Reg Reg Reg -- (extended) dst, src1, src2 ;
+ -- dst = src2 - src1
+ | MULL Format Reg Reg RI
+ | MULLO Format Reg Reg Reg -- multiply and set overflow
+ | MFOV Format Reg -- move overflow bit (1|33) to register
+ -- pseudo-instruction; pretty printed as
+ -- mfxer dst
+ -- extr[w|d]i dst, dst, 1, [1|33]
+ | MULHU Format Reg Reg Reg
+ | DIV Format Bool Reg Reg Reg
+ | AND Reg Reg RI -- dst, src1, src2
+ | ANDC Reg Reg Reg -- AND with complement, dst = src1 & ~ src2
+ | NAND Reg Reg Reg -- dst, src1, src2
+ | OR Reg Reg RI -- dst, src1, src2
+ | ORIS Reg Reg Imm -- OR Immediate Shifted dst, src1, src2
+ | XOR Reg Reg RI -- dst, src1, src2
+ | XORIS Reg Reg Imm -- XOR Immediate Shifted dst, src1, src2
+
+ | EXTS Format Reg Reg
+ | CNTLZ Format Reg Reg
+
+ | NEG Reg Reg
+ | NOT Reg Reg
+
+ | SL Format Reg Reg RI -- shift left
+ | SR Format Reg Reg RI -- shift right
+ | SRA Format Reg Reg RI -- shift right arithmetic
+
+ | RLWINM Reg Reg Int Int Int -- Rotate Left Word Immediate then AND with Mask
+ | CLRLI Format Reg Reg Int -- clear left immediate (extended mnemonic)
+ | CLRRI Format Reg Reg Int -- clear right immediate (extended mnemonic)
+
+ | FADD Format Reg Reg Reg
+ | FSUB Format Reg Reg Reg
+ | FMUL Format Reg Reg Reg
+ | FDIV Format Reg Reg Reg
+ | FABS Reg Reg -- abs is the same for single and double
+ | FNEG Reg Reg -- negate is the same for single and double prec.
+
+ | FCMP Reg Reg
+
+ | FCTIWZ Reg Reg -- convert to integer word
+ | FCTIDZ Reg Reg -- convert to integer double word
+ | FCFID Reg Reg -- convert from integer double word
+ | FRSP Reg Reg -- reduce to single precision
+ -- (but destination is a FP register)
+
+ | CRNOR Int Int Int -- condition register nor
+ | MFCR Reg -- move from condition register
+
+ | MFLR Reg -- move from link register
+ | FETCHPC Reg -- pseudo-instruction:
+ -- bcl to next insn, mflr reg
+ | HWSYNC -- heavy weight sync
+ | ISYNC -- instruction synchronize
+ | LWSYNC -- memory barrier
+ | NOP -- no operation, PowerPC 64 bit
+ -- needs this as place holder to
+ -- reload TOC pointer
+
+-- | Get the registers that are being used by this instruction.
+-- regUsage doesn't need to do any trickery for jumps and such.
+-- Just state precisely the regs read and written by that insn.
+-- The consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+ppc_regUsageOfInstr :: Platform -> Instr -> RegUsage
+ppc_regUsageOfInstr platform instr
+ = case instr of
+ LD _ reg addr -> usage (regAddr addr, [reg])
+ LDFAR _ reg addr -> usage (regAddr addr, [reg])
+ LDR _ reg addr -> usage (regAddr addr, [reg])
+ LA _ reg addr -> usage (regAddr addr, [reg])
+ ST _ reg addr -> usage (reg : regAddr addr, [])
+ STFAR _ reg addr -> usage (reg : regAddr addr, [])
+ STU _ reg addr -> usage (reg : regAddr addr, [])
+ STC _ reg addr -> usage (reg : regAddr addr, [])
+ LIS reg _ -> usage ([], [reg])
+ LI reg _ -> usage ([], [reg])
+ MR reg1 reg2 -> usage ([reg2], [reg1])
+ CMP _ reg ri -> usage (reg : regRI ri,[])
+ CMPL _ reg ri -> usage (reg : regRI ri,[])
+ BCC _ _ _ -> noUsage
+ BCCFAR _ _ _ -> noUsage
+ JMP _ regs -> usage (regs, [])
+ MTCTR reg -> usage ([reg],[])
+ BCTR _ _ regs -> usage (regs, [])
+ BL _ params -> usage (params, callClobberedRegs platform)
+ BCTRL params -> usage (params, callClobberedRegs platform)
+
+ ADD reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ADDO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ ADDZE reg1 reg2 -> usage ([reg2], [reg1])
+ ADDIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ SUBF reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ SUBFO reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ SUBFC reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SUBFE reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ MULL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ MULLO _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ MFOV _ reg -> usage ([], [reg])
+ MULHU _ reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ DIV _ _ reg1 reg2 reg3
+ -> usage ([reg2,reg3], [reg1])
+
+ AND reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ANDC reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ NAND reg1 reg2 reg3 -> usage ([reg2,reg3], [reg1])
+ OR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ ORIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ XOR reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ XORIS reg1 reg2 _ -> usage ([reg2], [reg1])
+ EXTS _ reg1 reg2 -> usage ([reg2], [reg1])
+ CNTLZ _ reg1 reg2 -> usage ([reg2], [reg1])
+ NEG reg1 reg2 -> usage ([reg2], [reg1])
+ NOT reg1 reg2 -> usage ([reg2], [reg1])
+ SL _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SR _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ SRA _ reg1 reg2 ri -> usage (reg2 : regRI ri, [reg1])
+ RLWINM reg1 reg2 _ _ _ -> usage ([reg2], [reg1])
+ CLRLI _ reg1 reg2 _ -> usage ([reg2], [reg1])
+ CLRRI _ reg1 reg2 _ -> usage ([reg2], [reg1])
+
+ FADD _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FSUB _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FMUL _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FDIV _ r1 r2 r3 -> usage ([r2,r3], [r1])
+ FABS r1 r2 -> usage ([r2], [r1])
+ FNEG r1 r2 -> usage ([r2], [r1])
+ FCMP r1 r2 -> usage ([r1,r2], [])
+ FCTIWZ r1 r2 -> usage ([r2], [r1])
+ FCTIDZ r1 r2 -> usage ([r2], [r1])
+ FCFID r1 r2 -> usage ([r2], [r1])
+ FRSP r1 r2 -> usage ([r2], [r1])
+ MFCR reg -> usage ([], [reg])
+ MFLR reg -> usage ([], [reg])
+ FETCHPC reg -> usage ([], [reg])
+ _ -> noUsage
+ where
+ usage (src, dst) = RU (filter (interesting platform) src)
+ (filter (interesting platform) dst)
+ regAddr (AddrRegReg r1 r2) = [r1, r2]
+ regAddr (AddrRegImm r1 _) = [r1]
+
+ regRI (RIReg r) = [r]
+ regRI _ = []
+
+interesting :: Platform -> Reg -> Bool
+interesting _ (RegVirtual _) = True
+interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
+interesting _ (RegReal (RealRegPair{}))
+ = panic "PPC.Instr.interesting: no reg pairs on this arch"
+
+
+
+-- | Apply a given mapping to all the register references in this
+-- instruction.
+ppc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+ppc_patchRegsOfInstr instr env
+ = case instr of
+ LD fmt reg addr -> LD fmt (env reg) (fixAddr addr)
+ LDFAR fmt reg addr -> LDFAR fmt (env reg) (fixAddr addr)
+ LDR fmt reg addr -> LDR fmt (env reg) (fixAddr addr)
+ LA fmt reg addr -> LA fmt (env reg) (fixAddr addr)
+ ST fmt reg addr -> ST fmt (env reg) (fixAddr addr)
+ STFAR fmt reg addr -> STFAR fmt (env reg) (fixAddr addr)
+ STU fmt reg addr -> STU fmt (env reg) (fixAddr addr)
+ STC fmt reg addr -> STC fmt (env reg) (fixAddr addr)
+ LIS reg imm -> LIS (env reg) imm
+ LI reg imm -> LI (env reg) imm
+ MR reg1 reg2 -> MR (env reg1) (env reg2)
+ CMP fmt reg ri -> CMP fmt (env reg) (fixRI ri)
+ CMPL fmt reg ri -> CMPL fmt (env reg) (fixRI ri)
+ BCC cond lbl p -> BCC cond lbl p
+ BCCFAR cond lbl p -> BCCFAR cond lbl p
+ JMP l regs -> JMP l regs -- global regs will not be remapped
+ MTCTR reg -> MTCTR (env reg)
+ BCTR targets lbl rs -> BCTR targets lbl rs
+ BL imm argRegs -> BL imm argRegs -- argument regs
+ BCTRL argRegs -> BCTRL argRegs -- cannot be remapped
+ ADD reg1 reg2 ri -> ADD (env reg1) (env reg2) (fixRI ri)
+ ADDO reg1 reg2 reg3 -> ADDO (env reg1) (env reg2) (env reg3)
+ ADDC reg1 reg2 reg3 -> ADDC (env reg1) (env reg2) (env reg3)
+ ADDE reg1 reg2 reg3 -> ADDE (env reg1) (env reg2) (env reg3)
+ ADDZE reg1 reg2 -> ADDZE (env reg1) (env reg2)
+ ADDIS reg1 reg2 imm -> ADDIS (env reg1) (env reg2) imm
+ SUBF reg1 reg2 reg3 -> SUBF (env reg1) (env reg2) (env reg3)
+ SUBFO reg1 reg2 reg3 -> SUBFO (env reg1) (env reg2) (env reg3)
+ SUBFC reg1 reg2 ri -> SUBFC (env reg1) (env reg2) (fixRI ri)
+ SUBFE reg1 reg2 reg3 -> SUBFE (env reg1) (env reg2) (env reg3)
+ MULL fmt reg1 reg2 ri
+ -> MULL fmt (env reg1) (env reg2) (fixRI ri)
+ MULLO fmt reg1 reg2 reg3
+ -> MULLO fmt (env reg1) (env reg2) (env reg3)
+ MFOV fmt reg -> MFOV fmt (env reg)
+ MULHU fmt reg1 reg2 reg3
+ -> MULHU fmt (env reg1) (env reg2) (env reg3)
+ DIV fmt sgn reg1 reg2 reg3
+ -> DIV fmt sgn (env reg1) (env reg2) (env reg3)
+
+ AND reg1 reg2 ri -> AND (env reg1) (env reg2) (fixRI ri)
+ ANDC reg1 reg2 reg3 -> ANDC (env reg1) (env reg2) (env reg3)
+ NAND reg1 reg2 reg3 -> NAND (env reg1) (env reg2) (env reg3)
+ OR reg1 reg2 ri -> OR (env reg1) (env reg2) (fixRI ri)
+ ORIS reg1 reg2 imm -> ORIS (env reg1) (env reg2) imm
+ XOR reg1 reg2 ri -> XOR (env reg1) (env reg2) (fixRI ri)
+ XORIS reg1 reg2 imm -> XORIS (env reg1) (env reg2) imm
+ EXTS fmt reg1 reg2 -> EXTS fmt (env reg1) (env reg2)
+ CNTLZ fmt reg1 reg2 -> CNTLZ fmt (env reg1) (env reg2)
+ NEG reg1 reg2 -> NEG (env reg1) (env reg2)
+ NOT reg1 reg2 -> NOT (env reg1) (env reg2)
+ SL fmt reg1 reg2 ri
+ -> SL fmt (env reg1) (env reg2) (fixRI ri)
+ SR fmt reg1 reg2 ri
+ -> SR fmt (env reg1) (env reg2) (fixRI ri)
+ SRA fmt reg1 reg2 ri
+ -> SRA fmt (env reg1) (env reg2) (fixRI ri)
+ RLWINM reg1 reg2 sh mb me
+ -> RLWINM (env reg1) (env reg2) sh mb me
+ CLRLI fmt reg1 reg2 n -> CLRLI fmt (env reg1) (env reg2) n
+ CLRRI fmt reg1 reg2 n -> CLRRI fmt (env reg1) (env reg2) n
+ FADD fmt r1 r2 r3 -> FADD fmt (env r1) (env r2) (env r3)
+ FSUB fmt r1 r2 r3 -> FSUB fmt (env r1) (env r2) (env r3)
+ FMUL fmt r1 r2 r3 -> FMUL fmt (env r1) (env r2) (env r3)
+ FDIV fmt r1 r2 r3 -> FDIV fmt (env r1) (env r2) (env r3)
+ FABS r1 r2 -> FABS (env r1) (env r2)
+ FNEG r1 r2 -> FNEG (env r1) (env r2)
+ FCMP r1 r2 -> FCMP (env r1) (env r2)
+ FCTIWZ r1 r2 -> FCTIWZ (env r1) (env r2)
+ FCTIDZ r1 r2 -> FCTIDZ (env r1) (env r2)
+ FCFID r1 r2 -> FCFID (env r1) (env r2)
+ FRSP r1 r2 -> FRSP (env r1) (env r2)
+ MFCR reg -> MFCR (env reg)
+ MFLR reg -> MFLR (env reg)
+ FETCHPC reg -> FETCHPC (env reg)
+ _ -> instr
+ where
+ fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+ fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i
+
+ fixRI (RIReg r) = RIReg (env r)
+ fixRI other = other
+
+
+--------------------------------------------------------------------------------
+-- | Checks whether this instruction is a jump/branch instruction.
+-- One that can change the flow of control in a way that the
+-- register allocator needs to worry about.
+ppc_isJumpishInstr :: Instr -> Bool
+ppc_isJumpishInstr instr
+ = case instr of
+ BCC{} -> True
+ BCCFAR{} -> True
+ BCTR{} -> True
+ BCTRL{} -> True
+ BL{} -> True
+ JMP{} -> True
+ _ -> False
+
+
+-- | Find the local destinations of this jump/branch instruction,
+-- i.e. the block ids it can transfer control to, as far as the
+-- register allocator is concerned.
+ppc_jumpDestsOfInstr :: Instr -> [BlockId]
+ppc_jumpDestsOfInstr insn
+ = case insn of
+ BCC _ id _ -> [id]
+ BCCFAR _ id _ -> [id]
+ BCTR targets _ _ -> [id | Just id <- targets]
+ _ -> []
+
+
+-- | Change the destination of this jump instruction.
+-- Used in the linear allocator when adding fixup blocks for join
+-- points.
+ppc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+ppc_patchJumpInstr insn patchF
+ = case insn of
+ BCC cc id p -> BCC cc (patchF id) p
+ BCCFAR cc id p -> BCCFAR cc (patchF id) p
+ BCTR ids lbl rs -> BCTR (map (fmap patchF) ids) lbl rs
+ _ -> insn
+
+
+-- -----------------------------------------------------------------------------
+
+-- | An instruction to spill a register into a spill slot.
+ppc_mkSpillInstr
+ :: DynFlags
+ -> Reg -- register to spill
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+ppc_mkSpillInstr dflags reg delta slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ arch = platformArch platform
+ in
+ let fmt = case targetClassOfReg platform reg of
+ RcInteger -> case arch of
+ ArchPPC -> II32
+ _ -> II64
+ RcDouble -> FF64
+ _ -> panic "PPC.Instr.mkSpillInstr: no match"
+ instr = case makeImmediate W32 True (off-delta) of
+ Just _ -> ST
+ Nothing -> STFAR -- pseudo instruction: 32 bit offsets
+
+ in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+
+
+ppc_mkLoadInstr
+ :: DynFlags
+ -> Reg -- register to load
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+ppc_mkLoadInstr dflags reg delta slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ arch = platformArch platform
+ in
+ let fmt = case targetClassOfReg platform reg of
+ RcInteger -> case arch of
+ ArchPPC -> II32
+ _ -> II64
+ RcDouble -> FF64
+ _ -> panic "PPC.Instr.mkLoadInstr: no match"
+ instr = case makeImmediate W32 True (off-delta) of
+ Just _ -> LD
+ Nothing -> LDFAR -- pseudo instruction: 32 bit offsets
+
+ in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+
+
+-- | The size of a minimal stackframe header including minimal
+-- parameter save area.
+stackFrameHeaderSize :: DynFlags -> Int
+stackFrameHeaderSize dflags
+ = case platformOS platform of
+ OSAIX -> 24 + 8 * 4
+ _ -> case platformArch platform of
+ -- header + parameter save area
+ ArchPPC -> 64 -- TODO: check ABI spec
+ ArchPPC_64 ELF_V1 -> 48 + 8 * 8
+ ArchPPC_64 ELF_V2 -> 32 + 8 * 8
+ _ -> panic "PPC.stackFrameHeaderSize: not defined for this OS"
+ where platform = targetPlatform dflags
+
+-- | The maximum number of bytes required to spill a register. PPC32
+-- has 32-bit GPRs and 64-bit FPRs, while PPC64 has 64-bit GPRs and
+-- 64-bit FPRs, so the maximum is 8 on all platforms, unlike x86.
+-- Note that AltiVec's vector registers are 128-bit wide, so we
+-- must not use this to spill them.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+-- | The number of spill slots available without allocating more.
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((rESERVED_C_STACK_BYTES dflags - stackFrameHeaderSize dflags)
+ `div` spillSlotSize) - 1
+-- = 0 -- useful for testing allocMoreStack
+
+-- | The number of bytes that the stack pointer should be aligned
+-- to. This is 16 both on PPC32 and PPC64 ELF (see ELF processor
+-- specific supplements).
+stackAlign :: Int
+stackAlign = 16
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+spillSlotToOffset :: DynFlags -> Int -> Int
+spillSlotToOffset dflags slot
+ = stackFrameHeaderSize dflags + spillSlotSize * slot
+
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+ppc_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+ppc_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+ppc_isMetaInstr
+ :: Instr
+ -> Bool
+
+ppc_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+-- | Copy the value in a register to another one.
+-- Must work for all register classes.
+ppc_mkRegRegMoveInstr
+ :: Reg
+ -> Reg
+ -> Instr
+
+ppc_mkRegRegMoveInstr src dst
+ = MR dst src
+
+
+-- | Make an unconditional jump instruction.
+ppc_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+ppc_mkJumpInstr id
+ = [BCC ALWAYS id Nothing]
+
+
+-- | Take the source and destination from this reg -> reg move instruction
+-- or Nothing if it's not one
+ppc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+ppc_takeRegRegMoveInstr (MR dst src) = Just (src,dst)
+ppc_takeRegRegMoveInstr _ = Nothing
+
+-- -----------------------------------------------------------------------------
+-- Making far branches
+
+-- Conditional branches on PowerPC are limited to +-32KB; if our Procs get too
+-- big, we have to work around this limitation.
+
+makeFarBranches
+ :: LabelMap RawCmmStatics
+ -> [NatBasicBlock Instr]
+ -> [NatBasicBlock Instr]
+makeFarBranches info_env blocks
+ | last blockAddresses < nearLimit = blocks
+ | otherwise = zipWith handleBlock blockAddresses blocks
+ where
+ blockAddresses = scanl (+) 0 $ map blockLen blocks
+ blockLen (BasicBlock _ instrs) = length instrs
+
+ handleBlock addr (BasicBlock id instrs)
+ = BasicBlock id (zipWith makeFar [addr..] instrs)
+
+ makeFar _ (BCC ALWAYS tgt _) = BCC ALWAYS tgt Nothing
+ makeFar addr (BCC cond tgt p)
+ | abs (addr - targetAddr) >= nearLimit
+ = BCCFAR cond tgt p
+ | otherwise
+ = BCC cond tgt p
+ where Just targetAddr = lookupUFM blockAddressMap tgt
+ makeFar _ other = other
+
+ -- 8192 instructions are allowed; let's keep some distance, as
+ -- we have a few pseudo-insns that are pretty-printed as
+ -- multiple instructions, and it's just not worth the effort
+ -- to calculate things exactly
+ nearLimit = 7000 - mapSize info_env * maxRetInfoTableSizeW
+
+ blockAddressMap = listToUFM $ zip (map blockId blocks) blockAddresses
diff --git a/compiler/GHC/CmmToAsm/PPC/Ppr.hs b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
new file mode 100644
index 0000000000..550bd618ef
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Ppr.hs
@@ -0,0 +1,994 @@
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.PPC.Ppr (pprNatCmmDecl) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.CmmToAsm.PPC.Instr
+import GHC.CmmToAsm.PPC.Cond
+import GHC.CmmToAsm.Ppr
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+
+import Unique ( pprUniqueAlways, getUnique )
+import GHC.Platform
+import FastString
+import Outputable
+import GHC.Driver.Session
+
+import Data.Word
+import Data.Int
+import Data.Bits
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+
+pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ case topInfoTable proc of
+ Nothing ->
+ sdocWithPlatform $ \platform ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ (case platformArch platform of
+ ArchPPC_64 ELF_V1 -> pprFunctionDescriptor lbl
+ ArchPPC_64 ELF_V2 -> pprFunctionPrologue lbl
+ _ -> pprLabel lbl) $$ -- blocks guaranteed not null,
+ -- so label needed
+ vcat (map (pprBasicBlock top_info) blocks)
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ pprSectionAlign (Section Text info_lbl) $$
+ (if platformHasSubsectionsViaSymbols platform
+ then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then
+ -- See Note [Subsections Via Symbols] in X86/Ppr.hs
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty)
+
+pprFunctionDescriptor :: CLabel -> SDoc
+pprFunctionDescriptor lab = pprGloblDecl lab
+ $$ text "\t.section \".opd\", \"aw\""
+ $$ text "\t.align 3"
+ $$ ppr lab <> char ':'
+ $$ text "\t.quad ."
+ <> ppr lab
+ <> text ",.TOC.@tocbase,0"
+ $$ text "\t.previous"
+ $$ text "\t.type"
+ <+> ppr lab
+ <> text ", @function"
+ $$ char '.' <> ppr lab <> char ':'
+
+pprFunctionPrologue :: CLabel ->SDoc
+pprFunctionPrologue lab = pprGloblDecl lab
+ $$ text ".type "
+ <> ppr lab
+ <> text ", @function"
+ $$ ppr lab <> char ':'
+ $$ text "0:\taddis\t" <> pprReg toc
+ <> text ",12,.TOC.-0b@ha"
+ $$ text "\taddi\t" <> pprReg toc
+ <> char ',' <> pprReg toc <> text ",.TOC.-0b@l"
+ $$ text "\t.localentry\t" <> ppr lab
+ <> text ",.-" <> ppr lab
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $$
+ pprLabel (blockLbl blockid) $$
+ vcat (map pprInstr instrs)
+ where
+ maybe_infotable = case mapLookup blockid info_env of
+ Nothing -> empty
+ Just (RawCmmStatics info_lbl info) ->
+ pprAlignForSection Text $$
+ vcat (map pprData info) $$
+ pprLabel info_lbl
+
+
+
+pprDatas :: RawCmmStatics -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+pprData (CmmUninitialised bytes) = text ".space " <> int bytes
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".globl " <> ppr lbl
+
+pprTypeAndSizeDecl :: CLabel -> SDoc
+pprTypeAndSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSLinux && externallyVisibleCLabel lbl
+ then text ".type " <>
+ ppr lbl <> text ", @object"
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeAndSizeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+pprReg :: Reg -> SDoc
+
+pprReg r
+ = case r of
+ RegReal (RealRegSingle i) -> ppr_reg_no i
+ RegReal (RealRegPair{}) -> panic "PPC.pprReg: no reg pairs on this arch"
+ RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+
+ where
+ ppr_reg_no :: Int -> SDoc
+ ppr_reg_no i
+ | i <= 31 = int i -- GPRs
+ | i <= 63 = int (i-32) -- FPRs
+ | otherwise = text "very naughty powerpc register"
+
+
+
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "h"
+ II32 -> sLit "w"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd")
+
+
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext (case c of {
+ ALWAYS -> sLit "";
+ EQQ -> sLit "eq"; NE -> sLit "ne";
+ LTT -> sLit "lt"; GE -> sLit "ge";
+ GTT -> sLit "gt"; LE -> sLit "le";
+ LU -> sLit "lt"; GEU -> sLit "ge";
+ GU -> sLit "gt"; LEU -> sLit "le"; })
+
+
+pprImm :: Imm -> SDoc
+
+pprImm (ImmInt i) = int i
+pprImm (ImmInteger i) = integer i
+pprImm (ImmCLbl l) = ppr l
+pprImm (ImmIndex l i) = ppr l <> char '+' <> int i
+pprImm (ImmLit s) = s
+
+pprImm (ImmFloat _) = text "naughty float immediate"
+pprImm (ImmDouble _) = text "naughty double immediate"
+
+pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b
+pprImm (ImmConstantDiff a b) = pprImm a <> char '-'
+ <> lparen <> pprImm b <> rparen
+
+pprImm (LO (ImmInt i)) = pprImm (LO (ImmInteger (toInteger i)))
+pprImm (LO (ImmInteger i)) = pprImm (ImmInteger (toInteger lo16))
+ where
+ lo16 = fromInteger (i .&. 0xffff) :: Int16
+
+pprImm (LO i)
+ = pprImm i <> text "@l"
+
+pprImm (HI i)
+ = pprImm i <> text "@h"
+
+pprImm (HA (ImmInt i)) = pprImm (HA (ImmInteger (toInteger i)))
+pprImm (HA (ImmInteger i)) = pprImm (ImmInteger ha16)
+ where
+ ha16 = if lo16 >= 0x8000 then hi16+1 else hi16
+ hi16 = (i `shiftR` 16)
+ lo16 = i .&. 0xffff
+
+pprImm (HA i)
+ = pprImm i <> text "@ha"
+
+pprImm (HIGHERA i)
+ = pprImm i <> text "@highera"
+
+pprImm (HIGHESTA i)
+ = pprImm i <> text "@highesta"
+
+
+pprAddr :: AddrMode -> SDoc
+pprAddr (AddrRegReg r1 r2)
+ = pprReg r1 <> char ',' <+> pprReg r2
+pprAddr (AddrRegImm r1 (ImmInt i))
+ = hcat [ int i, char '(', pprReg r1, char ')' ]
+pprAddr (AddrRegImm r1 (ImmInteger i))
+ = hcat [ integer i, char '(', pprReg r1, char ')' ]
+pprAddr (AddrRegImm r1 imm)
+ = hcat [ pprImm imm, char '(', pprReg r1, char ')' ]
+
+
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ sdocWithPlatform $ \platform ->
+ let ppc64 = not $ target32Bit platform
+ in ptext $ case seg of
+ Text -> sLit ".align 2"
+ Data
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ ReadOnlyData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ RelocatableReadOnlyData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ UninitialisedData
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ ReadOnlyData16 -> sLit ".align 4"
+ -- TODO: This is copied from the ReadOnlyData case, but it can likely be
+ -- made more efficient.
+ CString
+ | ppc64 -> sLit ".align 3"
+ | otherwise -> sLit ".align 2"
+ OtherSection _ -> panic "PprMach.pprSectionAlign: unknown section"
+
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit
+ = sdocWithDynFlags $ \dflags ->
+ vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit dflags)
+ where
+ imm = litToImm lit
+ archPPC_64 dflags = not $ target32Bit $ targetPlatform dflags
+
+ ppr_item II8 _ _ = [text "\t.byte\t" <> pprImm imm]
+
+ ppr_item II32 _ _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item II64 _ dflags
+ | archPPC_64 dflags = [text "\t.quad\t" <> pprImm imm]
+
+
+ ppr_item FF32 (CmmFloat r _) _
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _) _
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II16 _ _ = [text "\t.short\t" <> pprImm imm]
+
+ ppr_item II64 (CmmInt x _) dflags
+ | not(archPPC_64 dflags) =
+ [text "\t.long\t"
+ <> int (fromIntegral
+ (fromIntegral (x `shiftR` 32) :: Word32)),
+ text "\t.long\t"
+ <> int (fromIntegral (fromIntegral x :: Word32))]
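+
+ -- Illustrative output (added commentary): on a 32-bit target the 64-bit
+ -- literal 0x0000000100000002 is emitted most-significant word first as
+ --
+ --     .long 1
+ --     .long 2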
+
+ ppr_item _ _ _
+ = panic "PPC.Ppr.pprDataItem: no match"
+
+
+pprInstr :: Instr -> SDoc
+
+pprInstr (COMMENT _) = empty -- nuke 'em
+{-
+pprInstr (COMMENT s) =
+ if platformOS platform == OSLinux
+ then text "# " <> ftext s
+ else text "; " <> ftext s
+-}
+pprInstr (DELTA d)
+ = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
+
+pprInstr (NEWBLOCK _)
+ = panic "PprMach.pprInstr: NEWBLOCK"
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+{-
+pprInstr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char '\t',
+ pprReg reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+pprInstr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char '\t',
+ text "SLOT" <> parens (int slot),
+ comma,
+ pprReg reg]
+-}
+
+pprInstr (LD fmt reg addr) = hcat [
+ char '\t',
+ text "l",
+ ptext (case fmt of
+ II8 -> sLit "bz"
+ II16 -> sLit "hz"
+ II32 -> sLit "wz"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd"
+ ),
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+
+pprInstr (LDFAR fmt reg (AddrRegImm source off)) =
+ sdocWithPlatform $ \platform -> vcat [
+ pprInstr (ADDIS (tmpReg platform) source (HA off)),
+ pprInstr (LD fmt reg (AddrRegImm (tmpReg platform) (LO off)))
+ ]
+pprInstr (LDFAR _ _ _) =
+ panic "PPC.Ppr.pprInstr LDFAR: no match"
+
+pprInstr (LDR fmt reg1 addr) = hcat [
+ text "\tl",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC.Ppr.Instr LDR: no match",
+ text "arx\t",
+ pprReg reg1,
+ text ", ",
+ pprAddr addr
+ ]
+
+pprInstr (LA fmt reg addr) = hcat [
+ char '\t',
+ text "l",
+ ptext (case fmt of
+ II8 -> sLit "ba"
+ II16 -> sLit "ha"
+ II32 -> sLit "wa"
+ II64 -> sLit "d"
+ FF32 -> sLit "fs"
+ FF64 -> sLit "fd"
+ ),
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (ST fmt reg addr) = hcat [
+ char '\t',
+ text "st",
+ pprFormat fmt,
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (STFAR fmt reg (AddrRegImm source off)) =
+ sdocWithPlatform $ \platform -> vcat [
+ pprInstr (ADDIS (tmpReg platform) source (HA off)),
+ pprInstr (ST fmt reg (AddrRegImm (tmpReg platform) (LO off)))
+ ]
+pprInstr (STFAR _ _ _) =
+ panic "PPC.Ppr.pprInstr STFAR: no match"
+pprInstr (STU fmt reg addr) = hcat [
+ char '\t',
+ text "st",
+ pprFormat fmt,
+ char 'u',
+ case addr of AddrRegImm _ _ -> empty
+ AddrRegReg _ _ -> char 'x',
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (STC fmt reg1 addr) = hcat [
+ text "\tst",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC.Ppr.Instr STC: no match",
+ text "cx.\t",
+ pprReg reg1,
+ text ", ",
+ pprAddr addr
+ ]
+pprInstr (LIS reg imm) = hcat [
+ char '\t',
+ text "lis",
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (LI reg imm) = hcat [
+ char '\t',
+ text "li",
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (MR reg1 reg2)
+ | reg1 == reg2 = empty
+ | otherwise = hcat [
+ char '\t',
+ sdocWithPlatform $ \platform ->
+ case targetClassOfReg platform reg1 of
+ RcInteger -> text "mr"
+ _ -> text "fmr",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+pprInstr (CMP fmt reg ri) = hcat [
+ char '\t',
+ op,
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprRI ri
+ ]
+ where
+ op = hcat [
+ text "cmp",
+ pprFormat fmt,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i'
+ ]
+pprInstr (CMPL fmt reg ri) = hcat [
+ char '\t',
+ op,
+ char '\t',
+ pprReg reg,
+ text ", ",
+ pprRI ri
+ ]
+ where
+ op = hcat [
+ text "cmpl",
+ pprFormat fmt,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i'
+ ]
+pprInstr (BCC cond blockid prediction) = hcat [
+ char '\t',
+ text "b",
+ pprCond cond,
+ pprPrediction prediction,
+ char '\t',
+ ppr lbl
+ ]
+ where lbl = mkLocalBlockLabel (getUnique blockid)
+ pprPrediction p = case p of
+ Nothing -> empty
+ Just True -> char '+'
+ Just False -> char '-'
+
+pprInstr (BCCFAR cond blockid prediction) = vcat [
+ hcat [
+ text "\tb",
+ pprCond (condNegate cond),
+ neg_prediction,
+ text "\t$+8"
+ ],
+ hcat [
+ text "\tb\t",
+ ppr lbl
+ ]
+ ]
+ where lbl = mkLocalBlockLabel (getUnique blockid)
+ neg_prediction = case prediction of
+ Nothing -> empty
+ Just True -> char '-'
+ Just False -> char '+'
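+
+-- Illustrative expansion (added commentary): a far conditional branch such
+-- as "beq+ L" is emitted as a short branch over an unconditional branch,
+-- because the conditional form has a much smaller displacement range:
+--
+--     bne- $+8      -- skip the next instruction if the condition fails
+--     b    L        -- the unconditional branch can reach the distant label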
+
+pprInstr (JMP lbl _)
+ -- We never jump to ForeignLabels; if we ever do, c.f. handling for "BL"
+ | isForeignLabel lbl = panic "PPC.Ppr.pprInstr: JMP to ForeignLabel"
+ | otherwise =
+ hcat [ -- an alias for b that takes a CLabel
+ char '\t',
+ text "b",
+ char '\t',
+ ppr lbl
+ ]
+
+pprInstr (MTCTR reg) = hcat [
+ char '\t',
+ text "mtctr",
+ char '\t',
+ pprReg reg
+ ]
+pprInstr (BCTR _ _ _) = hcat [
+ char '\t',
+ text "bctr"
+ ]
+pprInstr (BL lbl _) = do
+ sdocWithPlatform $ \platform -> case platformOS platform of
+ OSAIX ->
+ -- On AIX, "printf" denotes a function-descriptor (for use
+ -- by function pointers), whereas the actual entry-code
+ -- address is denoted by the dot-prefixed ".printf" label.
+ -- Moreover, the PPC NCG only ever emits a BL instruction
+ -- for calling C ABI functions. Most of the time these calls
+ -- originate from FFI imports and have a 'ForeignLabel',
+ -- but when profiling the codegen inserts calls via
+ -- 'emitRtsCallGen' which are 'CmmLabel's even though
+ -- they'd technically be more like 'ForeignLabel's.
+ hcat [
+ text "\tbl\t.",
+ ppr lbl
+ ]
+ _ ->
+ hcat [
+ text "\tbl\t",
+ ppr lbl
+ ]
+pprInstr (BCTRL _) = hcat [
+ char '\t',
+ text "bctrl"
+ ]
+pprInstr (ADD reg1 reg2 ri) = pprLogic (sLit "add") reg1 reg2 ri
+pprInstr (ADDIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "addis",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (ADDO reg1 reg2 reg3) = pprLogic (sLit "addo") reg1 reg2 (RIReg reg3)
+pprInstr (ADDC reg1 reg2 reg3) = pprLogic (sLit "addc") reg1 reg2 (RIReg reg3)
+pprInstr (ADDE reg1 reg2 reg3) = pprLogic (sLit "adde") reg1 reg2 (RIReg reg3)
+pprInstr (ADDZE reg1 reg2) = pprUnary (sLit "addze") reg1 reg2
+pprInstr (SUBF reg1 reg2 reg3) = pprLogic (sLit "subf") reg1 reg2 (RIReg reg3)
+pprInstr (SUBFO reg1 reg2 reg3) = pprLogic (sLit "subfo") reg1 reg2 (RIReg reg3)
+pprInstr (SUBFC reg1 reg2 ri) = hcat [
+ char '\t',
+ text "subf",
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i',
+ text "c\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+pprInstr (SUBFE reg1 reg2 reg3) = pprLogic (sLit "subfe") reg1 reg2 (RIReg reg3)
+pprInstr (MULL fmt reg1 reg2 ri) = pprMul fmt reg1 reg2 ri
+pprInstr (MULLO fmt reg1 reg2 reg3) = hcat [
+ char '\t',
+ text "mull",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "o\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+pprInstr (MFOV fmt reg) = vcat [
+ hcat [
+ char '\t',
+ text "mfxer",
+ char '\t',
+ pprReg reg
+ ],
+ hcat [
+ char '\t',
+ text "extr",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "i\t",
+ pprReg reg,
+ text ", ",
+ pprReg reg,
+ text ", 1, ",
+ case fmt of
+ II32 -> text "1"
+ II64 -> text "33"
+ _ -> panic "PPC: illegal format"
+ ]
+ ]
+
+pprInstr (MULHU fmt reg1 reg2 reg3) = hcat [
+ char '\t',
+ text "mulh",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ text "u\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+pprInstr (DIV fmt sgn reg1 reg2 reg3) = pprDiv fmt sgn reg1 reg2 reg3
+
+ -- for some reason, "andi" doesn't exist.
+ -- we'll use "andi." instead.
+pprInstr (AND reg1 reg2 (RIImm imm)) = hcat [
+ char '\t',
+ text "andi.",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+pprInstr (AND reg1 reg2 ri) = pprLogic (sLit "and") reg1 reg2 ri
+pprInstr (ANDC reg1 reg2 reg3) = pprLogic (sLit "andc") reg1 reg2 (RIReg reg3)
+pprInstr (NAND reg1 reg2 reg3) = pprLogic (sLit "nand") reg1 reg2 (RIReg reg3)
+
+pprInstr (OR reg1 reg2 ri) = pprLogic (sLit "or") reg1 reg2 ri
+pprInstr (XOR reg1 reg2 ri) = pprLogic (sLit "xor") reg1 reg2 ri
+
+pprInstr (ORIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "oris",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (XORIS reg1 reg2 imm) = hcat [
+ char '\t',
+ text "xoris",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprImm imm
+ ]
+
+pprInstr (EXTS fmt reg1 reg2) = hcat [
+ char '\t',
+ text "exts",
+ pprFormat fmt,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+pprInstr (CNTLZ fmt reg1 reg2) = hcat [
+ char '\t',
+ text "cntlz",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+pprInstr (NEG reg1 reg2) = pprUnary (sLit "neg") reg1 reg2
+pprInstr (NOT reg1 reg2) = pprUnary (sLit "not") reg1 reg2
+
+pprInstr (SR II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 =
+ -- Handle the case where we are asked to shift a 32 bit register by
+ -- less than zero or more than 31 bits. We convert this into a clear
+ -- of the destination register.
+ -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/5900
+ pprInstr (XOR reg1 reg2 (RIReg reg2))
+
+pprInstr (SL II32 reg1 reg2 (RIImm (ImmInt i))) | i < 0 || i > 31 =
+ -- As above for SR, but for left shifts.
+ -- Fixes ticket https://gitlab.haskell.org/ghc/ghc/issues/10870
+ pprInstr (XOR reg1 reg2 (RIReg reg2))
+
+pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt i))) | i > 31 =
+ -- PT: I don't know what to do for negative shift amounts:
+ -- For now just panic.
+ --
+ -- For shift amounts greater than 31, set all bits to the
+ -- value of the sign bit; this is also what sraw does.
+ pprInstr (SRA II32 reg1 reg2 (RIImm (ImmInt 31)))
+
+pprInstr (SL fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "slw"
+ II64 -> "sld"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (SR fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "srw"
+ II64 -> "srd"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (SRA fmt reg1 reg2 ri) =
+ let op = case fmt of
+ II32 -> "sraw"
+ II64 -> "srad"
+ _ -> panic "PPC.Ppr.pprInstr: shift illegal size"
+ in pprLogic (sLit op) reg1 reg2 (limitShiftRI fmt ri)
+
+pprInstr (RLWINM reg1 reg2 sh mb me) = hcat [
+ text "\trlwinm\t",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int sh,
+ text ", ",
+ int mb,
+ text ", ",
+ int me
+ ]
+
+pprInstr (CLRLI fmt reg1 reg2 n) = hcat [
+ text "\tclrl",
+ pprFormat fmt,
+ text "i ",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int n
+ ]
+pprInstr (CLRRI fmt reg1 reg2 n) = hcat [
+ text "\tclrr",
+ pprFormat fmt,
+ text "i ",
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ int n
+ ]
+
+pprInstr (FADD fmt reg1 reg2 reg3) = pprBinaryF (sLit "fadd") fmt reg1 reg2 reg3
+pprInstr (FSUB fmt reg1 reg2 reg3) = pprBinaryF (sLit "fsub") fmt reg1 reg2 reg3
+pprInstr (FMUL fmt reg1 reg2 reg3) = pprBinaryF (sLit "fmul") fmt reg1 reg2 reg3
+pprInstr (FDIV fmt reg1 reg2 reg3) = pprBinaryF (sLit "fdiv") fmt reg1 reg2 reg3
+pprInstr (FABS reg1 reg2) = pprUnary (sLit "fabs") reg1 reg2
+pprInstr (FNEG reg1 reg2) = pprUnary (sLit "fneg") reg1 reg2
+
+pprInstr (FCMP reg1 reg2) = hcat [
+ char '\t',
+ text "fcmpu\t0, ",
+ -- Note: we're using fcmpu, not fcmpo
+ -- The difference is with fcmpo, compare with NaN is an invalid operation.
+ -- We don't handle invalid fp ops, so we don't care.
+ -- Moreover, we use `fcmpu 0, ...` rather than `fcmpu cr0, ...` for
+ -- better portability since some non-GNU assembler (such as
+ -- IBM's `as`) tend not to support the symbolic register name cr0.
+ -- This matches the syntax that GCC seems to emit for PPC targets.
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+pprInstr (FCTIWZ reg1 reg2) = pprUnary (sLit "fctiwz") reg1 reg2
+pprInstr (FCTIDZ reg1 reg2) = pprUnary (sLit "fctidz") reg1 reg2
+pprInstr (FCFID reg1 reg2) = pprUnary (sLit "fcfid") reg1 reg2
+pprInstr (FRSP reg1 reg2) = pprUnary (sLit "frsp") reg1 reg2
+
+pprInstr (CRNOR dst src1 src2) = hcat [
+ text "\tcrnor\t",
+ int dst,
+ text ", ",
+ int src1,
+ text ", ",
+ int src2
+ ]
+
+pprInstr (MFCR reg) = hcat [
+ char '\t',
+ text "mfcr",
+ char '\t',
+ pprReg reg
+ ]
+
+pprInstr (MFLR reg) = hcat [
+ char '\t',
+ text "mflr",
+ char '\t',
+ pprReg reg
+ ]
+
+pprInstr (FETCHPC reg) = vcat [
+ text "\tbcl\t20,31,1f",
+ hcat [ text "1:\tmflr\t", pprReg reg ]
+ ]
+
+pprInstr HWSYNC = text "\tsync"
+
+pprInstr ISYNC = text "\tisync"
+
+pprInstr LWSYNC = text "\tlwsync"
+
+pprInstr NOP = text "\tnop"
+
+
+pprLogic :: PtrString -> Reg -> Reg -> RI -> SDoc
+pprLogic op reg1 reg2 ri = hcat [
+ char '\t',
+ ptext op,
+ case ri of
+ RIReg _ -> empty
+ RIImm _ -> char 'i',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+
+
+pprMul :: Format -> Reg -> Reg -> RI -> SDoc
+pprMul fmt reg1 reg2 ri = hcat [
+ char '\t',
+ text "mull",
+ case ri of
+ RIReg _ -> case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format"
+ RIImm _ -> char 'i',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprRI ri
+ ]
+
+
+pprDiv :: Format -> Bool -> Reg -> Reg -> Reg -> SDoc
+pprDiv fmt sgn reg1 reg2 reg3 = hcat [
+ char '\t',
+ text "div",
+ case fmt of
+ II32 -> char 'w'
+ II64 -> char 'd'
+ _ -> panic "PPC: illegal format",
+ if sgn then empty else char 'u',
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+
+pprUnary :: PtrString -> Reg -> Reg -> SDoc
+pprUnary op reg1 reg2 = hcat [
+ char '\t',
+ ptext op,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2
+ ]
+
+
+pprBinaryF :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
+pprBinaryF op fmt reg1 reg2 reg3 = hcat [
+ char '\t',
+ ptext op,
+ pprFFormat fmt,
+ char '\t',
+ pprReg reg1,
+ text ", ",
+ pprReg reg2,
+ text ", ",
+ pprReg reg3
+ ]
+
+pprRI :: RI -> SDoc
+pprRI (RIReg r) = pprReg r
+pprRI (RIImm r) = pprImm r
+
+
+pprFFormat :: Format -> SDoc
+pprFFormat FF64 = empty
+pprFFormat FF32 = char 's'
+pprFFormat _ = panic "PPC.Ppr.pprFFormat: no match"
+
+ -- limit immediate argument for shift instruction to range 0..63
+ -- for 64 bit size and 0..31 otherwise
+limitShiftRI :: Format -> RI -> RI
+limitShiftRI II64 (RIImm (ImmInt i)) | i > 63 || i < 0 =
+ panic $ "PPC.Ppr: Shift by " ++ show i ++ " bits is not allowed."
+limitShiftRI II32 (RIImm (ImmInt i)) | i > 31 || i < 0 =
+ panic $ "PPC.Ppr: 32 bit: Shift by " ++ show i ++ " bits is not allowed."
+limitShiftRI _ x = x
diff --git a/compiler/GHC/CmmToAsm/PPC/RegInfo.hs b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs
new file mode 100644
index 0000000000..a75040d703
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/RegInfo.hs
@@ -0,0 +1,80 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-specific parts of the register allocator
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+module GHC.CmmToAsm.PPC.RegInfo (
+ JumpDest( DestBlockId ), getJumpDestBlockId,
+ canShortcut,
+ shortcutJump,
+
+ shortcutStatics
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Instr
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.CLabel
+
+import Unique
+import Outputable (ppr, text, Outputable, (<>))
+
+data JumpDest = DestBlockId BlockId
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
+
+
+-- Here because it knows about JumpDest
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics fn (RawCmmStatics lbl statics)
+ = RawCmmStatics lbl $ map (shortcutStatic fn) statics
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+ -- slightly dodgy, we're ignoring the second label, but this
+ -- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+shortBlockId
+ :: (BlockId -> Maybe JumpDest)
+ -> BlockId
+ -> CLabel
+
+shortBlockId fn blockid =
+ case fn blockid of
+ Nothing -> mkLocalBlockLabel uq
+ Just (DestBlockId blockid') -> shortBlockId fn blockid'
+ where uq = getUnique blockid
diff --git a/compiler/GHC/CmmToAsm/PPC/Regs.hs b/compiler/GHC/CmmToAsm/PPC/Regs.hs
new file mode 100644
index 0000000000..8a9a859665
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/PPC/Regs.hs
@@ -0,0 +1,333 @@
+{-# LANGUAGE CPP #-}
+
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1994-2004
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.PPC.Regs (
+ -- squeeze functions
+ virtualRegSqueeze,
+ realRegSqueeze,
+
+ mkVirtualReg,
+ regDotColor,
+
+ -- immediates
+ Imm(..),
+ strImmLit,
+ litToImm,
+
+ -- addressing modes
+ AddrMode(..),
+ addrOffset,
+
+ -- registers
+ spRel,
+ argRegs,
+ allArgRegs,
+ callClobberedRegs,
+ allMachRegNos,
+ classOfRealReg,
+ showReg,
+
+ -- machine specific
+ allFPArgRegs,
+ fits16Bits,
+ makeImmediate,
+ fReg,
+ r0, sp, toc, r3, r4, r11, r12, r30,
+ tmpReg,
+ f1,
+
+ allocatableRegs
+
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel )
+import Unique
+
+import GHC.Platform.Regs
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+
+import Data.Word ( Word8, Word16, Word32, Word64 )
+import Data.Int ( Int8, Int16, Int32, Int64 )
+
+
+-- squeeze functions for the graph allocator -----------------------------------
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegD{} -> 1
+ VirtualRegF{} -> 0
+ _other -> 0
+
+ _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 1 -- first fp reg is 32
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 0
+
+ _other -> 0
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ | not (isFloatFormat format) = VirtualRegI u
+ | otherwise
+ = case format of
+ FF32 -> VirtualRegD u
+ FF64 -> VirtualRegD u
+ _ -> panic "mkVirtualReg"
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+ RcInteger -> text "blue"
+ RcFloat -> text "red"
+ RcDouble -> text "green"
+
+
+
+-- immediates ------------------------------------------------------------------
+data Imm
+ = ImmInt Int
+ | ImmInteger Integer -- Sigh.
+ | ImmCLbl CLabel -- AbstractC Label (with baggage)
+ | ImmLit SDoc -- Simple string
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+ | LO Imm
+ | HI Imm
+ | HA Imm {- high halfword adjusted -}
+ | HIGHERA Imm
+ | HIGHESTA Imm
+
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w) = ImmInteger (narrowS w i)
+ -- narrow to the width: a CmmInt might be out of
+ -- range, but we assume that ImmInteger only contains
+ -- in-range values. A signed value should be fine here.
+litToImm (CmmFloat f W32) = ImmFloat f
+litToImm (CmmFloat f W64) = ImmDouble f
+litToImm (CmmLabel l) = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+ = ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+litToImm _ = panic "PPC.Regs.litToImm: no match"
+
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+ = AddrRegReg Reg Reg
+ | AddrRegImm Reg Imm
+
+
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ AddrRegImm r (ImmInt n)
+ | fits16Bits n2 -> Just (AddrRegImm r (ImmInt n2))
+ | otherwise -> Nothing
+ where n2 = n + off
+
+ AddrRegImm r (ImmInteger n)
+ | fits16Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2)))
+ | otherwise -> Nothing
+ where n2 = n + toInteger off
+
+ _ -> Nothing
+
+
+-- registers -------------------------------------------------------------------
+-- @spRel@ gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments. @fpRel@, where
+-- applicable, is the same but for the frame pointer.
+
+spRel :: DynFlags
+ -> Int -- desired stack offset in words, positive or negative
+ -> AddrMode
+
+spRel dflags n = AddrRegImm sp (ImmInt (n * wORD_SIZE dflags))
+
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), this list is empty.
+-- Sparc passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs 0 = []
+argRegs 1 = map regSingle [3]
+argRegs 2 = map regSingle [3,4]
+argRegs 3 = map regSingle [3..5]
+argRegs 4 = map regSingle [3..6]
+argRegs 5 = map regSingle [3..7]
+argRegs 6 = map regSingle [3..8]
+argRegs 7 = map regSingle [3..9]
+argRegs 8 = map regSingle [3..10]
+argRegs _ = panic "MachRegs.argRegs(powerpc): don't know about >8 arguments!"
+
+
+allArgRegs :: [Reg]
+allArgRegs = map regSingle [3..10]
+
+
+-- these are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+callClobberedRegs _platform
+ = map regSingle (0:[2..12] ++ map fReg [0..13])
+
+
+allMachRegNos :: [RegNo]
+allMachRegNos = [0..63]
+
+
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg (RealRegSingle i)
+ | i < 32 = RcInteger
+ | otherwise = RcDouble
+
+classOfRealReg (RealRegPair{})
+ = panic "regClass(ppr): no reg pairs on this architecture"
+
+showReg :: RegNo -> String
+showReg n
+ | n >= 0 && n <= 31 = "%r" ++ show n
+ | n >= 32 && n <= 63 = "%f" ++ show (n - 32)
+ | otherwise = "%unknown_powerpc_real_reg_" ++ show n
+
+
+
+-- machine specific ------------------------------------------------------------
+
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ = case platformOS platform of
+ OSAIX -> map (regSingle . fReg) [1..13]
+ _ -> case platformArch platform of
+ ArchPPC -> map (regSingle . fReg) [1..8]
+ ArchPPC_64 _ -> map (regSingle . fReg) [1..13]
+ _ -> panic "PPC.Regs.allFPArgRegs: unknown PPC Linux"
+
+fits16Bits :: Integral a => a -> Bool
+fits16Bits x = x >= -32768 && x < 32768
+
+makeImmediate :: Integral a => Width -> Bool -> a -> Maybe Imm
+makeImmediate rep signed x = fmap ImmInt (toI16 rep signed)
+ where
+ narrow W64 False = fromIntegral (fromIntegral x :: Word64)
+ narrow W32 False = fromIntegral (fromIntegral x :: Word32)
+ narrow W16 False = fromIntegral (fromIntegral x :: Word16)
+ narrow W8 False = fromIntegral (fromIntegral x :: Word8)
+ narrow W64 True = fromIntegral (fromIntegral x :: Int64)
+ narrow W32 True = fromIntegral (fromIntegral x :: Int32)
+ narrow W16 True = fromIntegral (fromIntegral x :: Int16)
+ narrow W8 True = fromIntegral (fromIntegral x :: Int8)
+ narrow _ _ = panic "PPC.Regs.narrow: no match"
+
+ narrowed = narrow rep signed
+
+ toI16 W32 True
+ | narrowed >= -32768 && narrowed < 32768 = Just narrowed
+ | otherwise = Nothing
+ toI16 W32 False
+ | narrowed >= 0 && narrowed < 65536 = Just narrowed
+ | otherwise = Nothing
+ toI16 W64 True
+ | narrowed >= -32768 && narrowed < 32768 = Just narrowed
+ | otherwise = Nothing
+ toI16 W64 False
+ | narrowed >= 0 && narrowed < 65536 = Just narrowed
+ | otherwise = Nothing
+ toI16 _ _ = Just narrowed
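+
+-- A few illustrative results (added commentary, assuming the definitions
+-- above):
+--
+--   makeImmediate W32 True  (-1)   == Just (ImmInt (-1))
+--   makeImmediate W32 True  40000  == Nothing   -- too big for a signed 16-bit field
+--   makeImmediate W32 False (-1)   == Nothing   -- narrows to 0xFFFFFFFF first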
+
+
+{-
+The PowerPC has 64 registers of interest; 32 integer registers and 32 floating
+point registers.
+-}
+
+fReg :: Int -> RegNo
+fReg x = (32 + x)
+
+r0, sp, toc, r3, r4, r11, r12, r30, f1 :: Reg
+r0 = regSingle 0
+sp = regSingle 1
+toc = regSingle 2
+r3 = regSingle 3
+r4 = regSingle 4
+r11 = regSingle 11
+r12 = regSingle 12
+r30 = regSingle 30
+f1 = regSingle $ fReg 1
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs to which we are prepared to let the
+-- register allocator map VRegs.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+ = let isFree i = freeReg platform i
+ in map RealRegSingle $ filter isFree allMachRegNos
+
+-- temporary register for compiler use
+tmpReg :: Platform -> Reg
+tmpReg platform =
+ case platformArch platform of
+ ArchPPC -> regSingle 13
+ ArchPPC_64 _ -> regSingle 30
+ _ -> panic "PPC.Regs.tmpReg: unknown arch"
diff --git a/compiler/GHC/CmmToAsm/Ppr.hs b/compiler/GHC/CmmToAsm/Ppr.hs
new file mode 100644
index 0000000000..636d2e4e3a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Ppr.hs
@@ -0,0 +1,275 @@
+{-# LANGUAGE MagicHash #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Ppr (
+ castFloatToWord8Array,
+ castDoubleToWord8Array,
+ floatToBytes,
+ doubleToBytes,
+ pprASCII,
+ pprBytes,
+ pprSectionHeader
+)
+
+where
+
+import GhcPrelude
+
+import AsmUtils
+import GHC.Cmm.CLabel
+import GHC.Cmm
+import GHC.Driver.Session
+import FastString
+import Outputable
+import GHC.Platform
+import FileCleanup
+
+import qualified Data.Array.Unsafe as U ( castSTUArray )
+import Data.Array.ST
+
+import Control.Monad.ST
+
+import Data.Word
+import Data.Bits
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as BS
+import GHC.Exts
+import GHC.Word
+import System.IO.Unsafe
+
+
+
+-- -----------------------------------------------------------------------------
+-- Converting floating-point literals to integrals for printing
+
+castFloatToWord8Array :: STUArray s Int Float -> ST s (STUArray s Int Word8)
+castFloatToWord8Array = U.castSTUArray
+
+castDoubleToWord8Array :: STUArray s Int Double -> ST s (STUArray s Int Word8)
+castDoubleToWord8Array = U.castSTUArray
+
+-- floatToBytes and doubleToBytes convert to the host's byte
+-- order. Providing that we're not cross-compiling for a
+-- target with the opposite endianness, this should work ok
+-- on all targets.
+
+-- ToDo: this stuff is very similar to the shenanigans in PprAbs,
+-- could they be merged?
+
+floatToBytes :: Float -> [Int]
+floatToBytes f
+ = runST (do
+ arr <- newArray_ ((0::Int),3)
+ writeArray arr 0 f
+ arr <- castFloatToWord8Array arr
+ i0 <- readArray arr 0
+ i1 <- readArray arr 1
+ i2 <- readArray arr 2
+ i3 <- readArray arr 3
+ return (map fromIntegral [i0,i1,i2,i3])
+ )
+
+doubleToBytes :: Double -> [Int]
+doubleToBytes d
+ = runST (do
+ arr <- newArray_ ((0::Int),7)
+ writeArray arr 0 d
+ arr <- castDoubleToWord8Array arr
+ i0 <- readArray arr 0
+ i1 <- readArray arr 1
+ i2 <- readArray arr 2
+ i3 <- readArray arr 3
+ i4 <- readArray arr 4
+ i5 <- readArray arr 5
+ i6 <- readArray arr 6
+ i7 <- readArray arr 7
+ return (map fromIntegral [i0,i1,i2,i3,i4,i5,i6,i7])
+ )
+
+-- ---------------------------------------------------------------------------
+-- Printing ASCII strings.
+--
+-- Print as a string and escape non-printable characters.
+-- This is similar to charToC in Utils.
+
+pprASCII :: ByteString -> SDoc
+pprASCII str
+ -- Transform the given literal bytestring into an escaped string and construct
+ -- the literal SDoc directly.
+ -- See #14741
+ -- and Note [Pretty print ASCII when AsmCodeGen]
+ = text $ BS.foldr (\w s -> do1 w ++ s) "" str
+ where
+ do1 :: Word8 -> String
+ do1 w | 0x09 == w = "\\t"
+ | 0x0A == w = "\\n"
+ | 0x22 == w = "\\\""
+ | 0x5C == w = "\\\\"
+ -- ASCII printable characters range
+ | w >= 0x20 && w <= 0x7E = [chr' w]
+ | otherwise = '\\' : octal w
+
+ -- we know that the Chars we create are in the ASCII range
+ -- so we bypass the check in "chr"
+ chr' :: Word8 -> Char
+ chr' (W8# w#) = C# (chr# (word2Int# w#))
+
+ octal :: Word8 -> String
+ octal w = [ chr' (ord0 + (w `unsafeShiftR` 6) .&. 0x07)
+ , chr' (ord0 + (w `unsafeShiftR` 3) .&. 0x07)
+ , chr' (ord0 + w .&. 0x07)
+ ]
+ ord0 = 0x30 -- = ord '0'
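+
+-- Illustrative example (added commentary): pprASCII "a\tb\DEL" renders as
+-- the assembler string a\tb\177: the tab becomes the two-character escape
+-- "\t", and DEL (0x7F), being outside the printable range 0x20..0x7E,
+-- becomes the octal escape "\177".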
+
+-- | Pretty print binary data.
+--
+-- Use either the ".string" directive or a ".incbin" directive.
+-- See Note [Embedding large binary blobs]
+--
+-- A NULL byte is added after the binary data.
+--
+pprBytes :: ByteString -> SDoc
+pprBytes bs = sdocWithDynFlags $ \dflags ->
+ if binBlobThreshold dflags == 0
+ || fromIntegral (BS.length bs) <= binBlobThreshold dflags
+ then text "\t.string " <> doubleQuotes (pprASCII bs)
+ else unsafePerformIO $ do
+ bFile <- newTempName dflags TFL_CurrentModule ".dat"
+ BS.writeFile bFile bs
+ return $ text "\t.incbin "
+ <> pprFilePathString bFile -- proper escape (see #16389)
+ <> text "\n\t.byte 0"
+
+{-
+Note [Embedding large binary blobs]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To embed a blob of binary data (e.g. a UTF-8 encoded string) into the generated
+code object, we have several options:
+
+ 1. Generate a ".byte" directive for each byte. This is what was done in the past
+ (see Note [Pretty print ASCII when AsmCodeGen]).
+
+ 2. Generate a single ".string"/".asciz" directive for the whole sequence of
+ bytes. Bytes in the ASCII printable range are rendered as characters and
+ other values are escaped (e.g., "\t", "\077", etc.).
+
+ 3. Create a temporary file into which we dump the binary data and generate a
+ single ".incbin" directive. The assembler will include the binary file for
+ us in the generated output object.
+
+Now the code generator uses either (2) or (3), depending on the binary blob
+size. Using (3) for small blobs adds too much overhead (see benchmark results
+in #16190), so we only do it when the size is above a threshold (500K at the
+time of writing).
+
+The threshold is configurable via the `-fbinary-blob-threshold` flag.
+
+-}
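+
+{-
+Illustrative output (added commentary, with a made-up temporary file name):
+for a small blob the generated assembly is roughly
+
+    .string "hello\n"
+
+while a blob above the threshold produces something like
+
+    .incbin "/tmp/ghc123_0/ghc_1.dat"
+    .byte 0
+-}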
+
+
+{-
+Note [Pretty print ASCII when AsmCodeGen]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously, when generating assembly code, we created one SDoc per byte
+of a literal bytestring with `(ptext . sLit)` and then combined them
+using `hcat`.
+
+When handling literal bytestrings with millions of bytes,
+millions of SDocs would be created and combined, leading to
+high memory usage.
+
+Now we escape the given bytestring into a String directly and construct
+the SDoc only once. This improvement decreased the
+memory allocation from 4.7GB to 1.3GB when embedding a 3MB literal
+string in source code. See #14741 for profiling results.
+-}
+
+-- ----------------------------------------------------------------------------
+-- Printing section headers.
+--
+-- If -split-sections was specified, include the suffix label, otherwise just
+-- print the section type. For Darwin, where subsections-via-symbols are
+-- used instead, only print the section type.
+--
+-- For string literals, additional flags are specified to enable merging of
+-- identical strings in the linker. With -split-sections each string also gets
+-- a unique section to allow strings from unused code to be GC'd.
+
+pprSectionHeader :: Platform -> Section -> SDoc
+pprSectionHeader platform (Section t suffix) =
+ case platformOS platform of
+ OSAIX -> pprXcoffSectionHeader t
+ OSDarwin -> pprDarwinSectionHeader t
+ OSMinGW32 -> pprGNUSectionHeader (char '$') t suffix
+ _ -> pprGNUSectionHeader (char '.') t suffix
+
+pprGNUSectionHeader :: SDoc -> SectionType -> CLabel -> SDoc
+pprGNUSectionHeader sep t suffix = sdocWithDynFlags $ \dflags ->
+ let splitSections = gopt Opt_SplitSections dflags
+ subsection | splitSections = sep <> ppr suffix
+ | otherwise = empty
+ in text ".section " <> ptext (header dflags) <> subsection <>
+ flags dflags
+ where
+ header dflags = case t of
+ Text -> sLit ".text"
+ Data -> sLit ".data"
+ ReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata"
+ | otherwise -> sLit ".rodata"
+ RelocatableReadOnlyData | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -- Concept does not exist on Windows,
+ -- So map these to R/O data.
+ -> sLit ".rdata$rel.ro"
+ | otherwise -> sLit ".data.rel.ro"
+ UninitialisedData -> sLit ".bss"
+ ReadOnlyData16 | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata$cst16"
+ | otherwise -> sLit ".rodata.cst16"
+ CString
+ | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> sLit ".rdata"
+ | otherwise -> sLit ".rodata.str"
+ OtherSection _ ->
+ panic "PprBase.pprGNUSectionHeader: unknown section type"
+ flags dflags = case t of
+ CString
+ | OSMinGW32 <- platformOS (targetPlatform dflags)
+ -> empty
+ | otherwise -> text ",\"aMS\"," <> sectionType "progbits" <> text ",1"
+ _ -> empty
+
+-- XCOFF doesn't support relocating label-differences, so we place all
+-- RO sections into .text[PR] sections
+pprXcoffSectionHeader :: SectionType -> SDoc
+pprXcoffSectionHeader t = text $ case t of
+ Text -> ".csect .text[PR]"
+ Data -> ".csect .data[RW]"
+ ReadOnlyData -> ".csect .text[PR] # ReadOnlyData"
+ RelocatableReadOnlyData -> ".csect .text[PR] # RelocatableReadOnlyData"
+ ReadOnlyData16 -> ".csect .text[PR] # ReadOnlyData16"
+ CString -> ".csect .text[PR] # CString"
+ UninitialisedData -> ".csect .data[BS]"
+ OtherSection _ ->
+ panic "PprBase.pprXcoffSectionHeader: unknown section type"
+
+pprDarwinSectionHeader :: SectionType -> SDoc
+pprDarwinSectionHeader t =
+ ptext $ case t of
+ Text -> sLit ".text"
+ Data -> sLit ".data"
+ ReadOnlyData -> sLit ".const"
+ RelocatableReadOnlyData -> sLit ".const_data"
+ UninitialisedData -> sLit ".data"
+ ReadOnlyData16 -> sLit ".const"
+ CString -> sLit ".section\t__TEXT,__cstring,cstring_literals"
+ OtherSection _ ->
+ panic "PprBase.pprDarwinSectionHeader: unknown section type"
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph.hs b/compiler/GHC/CmmToAsm/Reg/Graph.hs
new file mode 100644
index 0000000000..6dfe84cf95
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph.hs
@@ -0,0 +1,472 @@
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+-- | Graph coloring register allocator.
+module GHC.CmmToAsm.Reg.Graph (
+ regAlloc
+) where
+import GhcPrelude
+
+import qualified GraphColor as Color
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Reg.Graph.Spill
+import GHC.CmmToAsm.Reg.Graph.SpillClean
+import GHC.CmmToAsm.Reg.Graph.SpillCost
+import GHC.CmmToAsm.Reg.Graph.Stats
+import GHC.CmmToAsm.Reg.Graph.TrivColorable
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Reg.Target
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import Bag
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+import UniqFM
+import UniqSet
+import UniqSupply
+import Util (seqList)
+import GHC.CmmToAsm.CFG
+
+import Data.Maybe
+import Control.Monad
+
+
+-- | The maximum number of build\/spill cycles we'll allow.
+--
+-- It should only take 3 or 4 cycles for the allocator to converge.
+-- If it takes any longer than this it's probably in an infinite loop,
+-- so it's better just to bail out and report a bug.
+maxSpinCount :: Int
+maxSpinCount = 10
+
+
+-- | The top level of the graph coloring register allocator.
+regAlloc
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => DynFlags
+ -> UniqFM (UniqSet RealReg) -- ^ registers we can use for allocation
+ -> UniqSet Int -- ^ set of available spill slots.
+ -> Int -- ^ current number of spill slots
+ -> [LiveCmmDecl statics instr] -- ^ code annotated with liveness information.
+ -> Maybe CFG -- ^ CFG of basic blocks if available
+ -> UniqSM ( [NatCmmDecl statics instr]
+ , Maybe Int, [RegAllocStats statics instr] )
+ -- ^ code with registers allocated, additional stacks required
+ -- and stats for each stage of allocation
+
+regAlloc dflags regsFree slotsFree slotsCount code cfg
+ = do
+ -- TODO: the regClass function is currently hard coded to the default
+ -- target architecture. Would prefer to determine this from dflags.
+ -- There are other uses of targetRegClass later in this module.
+ let platform = targetPlatform dflags
+ triv = trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform)
+
+ (code_final, debug_codeGraphs, slotsCount', _)
+ <- regAlloc_spin dflags 0
+ triv
+ regsFree slotsFree slotsCount [] code cfg
+
+ let needStack
+ | slotsCount == slotsCount'
+ = Nothing
+ | otherwise
+ = Just slotsCount'
+
+ return ( code_final
+ , needStack
+ , reverse debug_codeGraphs )
+
+
+-- | Perform solver iterations for the graph coloring allocator.
+--
+-- We extract a register conflict graph from the provided cmm code,
+-- and try to colour it. If that works then we use the solution rewrite
+-- the code with real hregs. If coloring doesn't work we add spill code
+-- and try to colour it again. After `maxSpinCount` iterations we give up.
+--
+regAlloc_spin
+ :: forall instr statics.
+ (Instruction instr,
+ Outputable instr,
+ Outputable statics)
+ => DynFlags
+ -> Int -- ^ Number of solver iterations we've already performed.
+ -> Color.Triv VirtualReg RegClass RealReg
+ -- ^ Function for calculating whether a register is trivially
+ -- colourable.
+ -> UniqFM (UniqSet RealReg) -- ^ Free registers that we can allocate.
+ -> UniqSet Int -- ^ Free stack slots that we can use.
+ -> Int -- ^ Number of spill slots in use
+ -> [RegAllocStats statics instr] -- ^ Current regalloc stats to add to.
+ -> [LiveCmmDecl statics instr] -- ^ Liveness annotated code to allocate.
+ -> Maybe CFG
+ -> UniqSM ( [NatCmmDecl statics instr]
+ , [RegAllocStats statics instr]
+ , Int -- Slots in use
+ , Color.Graph VirtualReg RegClass RealReg)
+
+regAlloc_spin dflags spinCount triv regsFree slotsFree slotsCount debug_codeGraphs code cfg
+ = do
+ let platform = targetPlatform dflags
+
+ -- If any of these dump flags are turned on we want to hang on to
+ -- intermediate structures in the allocator - otherwise tell the
+ -- allocator to ditch them early so we don't end up creating space leaks.
+ let dump = or
+ [ dopt Opt_D_dump_asm_regalloc_stages dflags
+ , dopt Opt_D_dump_asm_stats dflags
+ , dopt Opt_D_dump_asm_conflicts dflags ]
+
+ -- Check that we're not running off down the garden path.
+ when (spinCount > maxSpinCount)
+ $ pprPanic "regAlloc_spin: max build/spill cycle count exceeded."
+ ( text "It looks like the register allocator is stuck in an infinite loop."
+ $$ text "max cycles = " <> int maxSpinCount
+ $$ text "regsFree = " <> (hcat $ punctuate space $ map ppr
+ $ nonDetEltsUniqSet $ unionManyUniqSets
+ $ nonDetEltsUFM regsFree)
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ $$ text "slotsFree = " <> ppr (sizeUniqSet slotsFree))
+
+ -- Build the register conflict graph from the cmm code.
+ (graph :: Color.Graph VirtualReg RegClass RealReg)
+ <- {-# SCC "BuildGraph" #-} buildGraph code
+
+ -- VERY IMPORTANT:
+ -- We really do want the graph to be fully evaluated _before_ we
+ -- start coloring. If we don't do this now then when the call to
+ -- Color.colorGraph forces bits of it, the heap will be filled with
+ -- half evaluated pieces of graph and zillions of apply thunks.
+ seqGraph graph `seq` return ()
+
+ -- Build a map of the cost of spilling each instruction.
+ -- This is a lazy binding, so the map will only be computed if we
+ -- actually have to spill to the stack.
+ let spillCosts = foldl' plusSpillCostInfo zeroSpillCostInfo
+ $ map (slurpSpillCostInfo platform cfg) code
+
+ -- The function to choose regs to leave uncolored.
+ let spill = chooseSpill spillCosts
+
+ -- Record startup state in our log.
+ let stat1
+ = if spinCount == 0
+ then Just $ RegAllocStatsStart
+ { raLiveCmm = code
+ , raGraph = graph
+ , raSpillCosts = spillCosts }
+ else Nothing
+
+ -- Try and color the graph.
+ let (graph_colored, rsSpill, rmCoalesce)
+ = {-# SCC "ColorGraph" #-}
+ Color.colorGraph
+ (gopt Opt_RegsIterative dflags)
+ spinCount
+ regsFree triv spill graph
+
+ -- Rewrite registers in the code that have been coalesced.
+ let patchF reg
+ | RegVirtual vr <- reg
+ = case lookupUFM rmCoalesce vr of
+ Just vr' -> patchF (RegVirtual vr')
+ Nothing -> reg
+
+ | otherwise
+ = reg
+
+ let (code_coalesced :: [LiveCmmDecl statics instr])
+ = map (patchEraseLive patchF) code
+
+ -- Check whether we've found a coloring.
+ if isEmptyUniqSet rsSpill
+
+ -- Coloring was successful because no registers needed to be spilled.
+ then do
+ -- if -fasm-lint is turned on then validate the graph.
+ -- This checks for bugs in the graph allocator itself.
+ let graph_colored_lint =
+ if gopt Opt_DoAsmLinting dflags
+ then Color.validateGraph (text "")
+ True -- Require all nodes to be colored.
+ graph_colored
+ else graph_colored
+
+ -- Rewrite the code to use real hregs, using the colored graph.
+ let code_patched
+ = map (patchRegsFromGraph platform graph_colored_lint)
+ code_coalesced
+
+ -- Clean out unneeded SPILL/RELOAD meta instructions.
+ -- The spill code generator just spills the entire live range
+ -- of a vreg, but it might not need to be on the stack for
+ -- its entire lifetime.
+ let code_spillclean
+ = map (cleanSpills platform) code_patched
+
+ -- Strip off liveness information from the allocated code.
+ -- Also rewrite SPILL/RELOAD meta instructions into real machine
+ -- instructions along the way
+ let code_final
+ = map (stripLive dflags) code_spillclean
+
+ -- Record what happened in this stage for debugging
+ let stat
+ = RegAllocStatsColored
+ { raCode = code
+ , raGraph = graph
+ , raGraphColored = graph_colored_lint
+ , raCoalesced = rmCoalesce
+ , raCodeCoalesced = code_coalesced
+ , raPatched = code_patched
+ , raSpillClean = code_spillclean
+ , raFinal = code_final
+ , raSRMs = foldl' addSRM (0, 0, 0)
+ $ map countSRMs code_spillclean }
+
+ -- Bundle up all the register allocator statistics.
+ -- .. but make sure to drop them on the floor if they're not
+ -- needed, otherwise we'll get a space leak.
+ let statList =
+ if dump then [stat] ++ maybeToList stat1 ++ debug_codeGraphs
+ else []
+
+ -- Ensure all the statistics are evaluated, to avoid space leaks.
+ seqList statList (return ())
+
+ return ( code_final
+ , statList
+ , slotsCount
+ , graph_colored_lint)
+
+ -- Coloring was unsuccessful. We need to spill some register to the
+ -- stack, make a new graph, and try to color it again.
+ else do
+ -- if -fasm-lint is turned on then validate the graph
+ let graph_colored_lint =
+ if gopt Opt_DoAsmLinting dflags
+ then Color.validateGraph (text "")
+ False -- don't require nodes to be colored
+ graph_colored
+ else graph_colored
+
+ -- Spill uncolored regs to the stack.
+ (code_spilled, slotsFree', slotsCount', spillStats)
+ <- regSpill platform code_coalesced slotsFree slotsCount rsSpill
+
+ -- Recalculate liveness information.
+ -- NOTE: we have to reverse the SCCs here to get them back into
+ -- the reverse-dependency order required by computeLiveness.
+ -- If they're not in the correct order that function will panic.
+ code_relive <- mapM (regLiveness platform . reverseBlocksInTops)
+ code_spilled
+
+ -- Record what happened in this stage for debugging.
+ let stat =
+ RegAllocStatsSpill
+ { raCode = code
+ , raGraph = graph_colored_lint
+ , raCoalesced = rmCoalesce
+ , raSpillStats = spillStats
+ , raSpillCosts = spillCosts
+ , raSpilled = code_spilled }
+
+ -- Bundle up all the register allocator statistics.
+ -- .. but make sure to drop them on the floor if they're not
+ -- needed, otherwise we'll get a space leak.
+ let statList =
+ if dump
+ then [stat] ++ maybeToList stat1 ++ debug_codeGraphs
+ else []
+
+ -- Ensure all the statistics are evaluated, to avoid space leaks.
+ seqList statList (return ())
+
+ regAlloc_spin dflags (spinCount + 1) triv regsFree slotsFree'
+ slotsCount' statList code_relive cfg
+
+
+-- | Build a graph from the liveness and coalesce information in this code.
+buildGraph
+ :: Instruction instr
+ => [LiveCmmDecl statics instr]
+ -> UniqSM (Color.Graph VirtualReg RegClass RealReg)
+
+buildGraph code
+ = do
+ -- Slurp out the conflicts and reg->reg moves from this code.
+ let (conflictList, moveList) =
+ unzip $ map slurpConflicts code
+
+ -- Slurp out the spill/reload coalesces.
+ let moveList2 = map slurpReloadCoalesce code
+
+ -- Add the reg-reg conflicts to the graph.
+ let conflictBag = unionManyBags conflictList
+ let graph_conflict
+ = foldr graphAddConflictSet Color.initGraph conflictBag
+
+ -- Add the coalescence edges to the graph.
+ let moveBag
+ = unionBags (unionManyBags moveList2)
+ (unionManyBags moveList)
+
+ let graph_coalesce
+ = foldr graphAddCoalesce graph_conflict moveBag
+
+ return graph_coalesce
+
+
+-- | Add some conflict edges to the graph.
+-- Conflicts between virtual and real regs are recorded as exclusions.
+graphAddConflictSet
+ :: UniqSet Reg
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> Color.Graph VirtualReg RegClass RealReg
+
+graphAddConflictSet set graph
+ = let virtuals = mkUniqSet
+ [ vr | RegVirtual vr <- nonDetEltsUniqSet set ]
+
+ graph1 = Color.addConflicts virtuals classOfVirtualReg graph
+
+ graph2 = foldr (\(r1, r2) -> Color.addExclusion r1 classOfVirtualReg r2)
+ graph1
+ [ (vr, rr)
+ | RegVirtual vr <- nonDetEltsUniqSet set
+ , RegReal rr <- nonDetEltsUniqSet set]
+ -- See Note [Unique Determinism and code generation]
+
+ in graph2
+
+
+-- | Add some coalescence edges to the graph.
+-- Coalescences between virtual and real regs are recorded as preferences.
+graphAddCoalesce
+ :: (Reg, Reg)
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> Color.Graph VirtualReg RegClass RealReg
+
+graphAddCoalesce (r1, r2) graph
+ | RegReal rr <- r1
+ , RegVirtual vr <- r2
+ = Color.addPreference (vr, classOfVirtualReg vr) rr graph
+
+ | RegReal rr <- r2
+ , RegVirtual vr <- r1
+ = Color.addPreference (vr, classOfVirtualReg vr) rr graph
+
+ | RegVirtual vr1 <- r1
+ , RegVirtual vr2 <- r2
+ = Color.addCoalesce
+ (vr1, classOfVirtualReg vr1)
+ (vr2, classOfVirtualReg vr2)
+ graph
+
+ -- We can't coalesce two real regs, but there could well be existing
+ -- hreg,hreg moves in the input code. We'll just ignore these
+ -- for coalescing purposes.
+ | RegReal _ <- r1
+ , RegReal _ <- r2
+ = graph
+
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise
+ = panic "graphAddCoalesce"
+#endif
+
+
+-- | Patch registers in code using the reg -> reg mapping in this graph.
+patchRegsFromGraph
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => Platform -> Color.Graph VirtualReg RegClass RealReg
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+patchRegsFromGraph platform graph code
+ = patchEraseLive patchF code
+ where
+ -- Function to lookup the hardreg for a virtual reg from the graph.
+ patchF reg
+ -- leave real regs alone.
+ | RegReal{} <- reg
+ = reg
+
+ -- this virtual has a regular node in the graph.
+ | RegVirtual vr <- reg
+ , Just node <- Color.lookupNode graph vr
+ = case Color.nodeColor node of
+ Just color -> RegReal color
+ Nothing -> RegVirtual vr
+
+ -- no node in the graph for this virtual, bad news.
+ | otherwise
+ = pprPanic "patchRegsFromGraph: register mapping failed."
+ ( text "There is no node in the graph for register "
+ <> ppr reg
+ $$ ppr code
+ $$ Color.dotGraph
+ (\_ -> text "white")
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ graph)
+
+
+-----
+-- for when laziness just isn't what you wanted...
+-- We need to deepSeq the whole graph before trying to colour it to avoid
+-- space leaks.
+seqGraph :: Color.Graph VirtualReg RegClass RealReg -> ()
+seqGraph graph = seqNodes (nonDetEltsUFM (Color.graphMap graph))
+ -- See Note [Unique Determinism and code generation]
+
+seqNodes :: [Color.Node VirtualReg RegClass RealReg] -> ()
+seqNodes ns
+ = case ns of
+ [] -> ()
+ (n : ns) -> seqNode n `seq` seqNodes ns
+
+seqNode :: Color.Node VirtualReg RegClass RealReg -> ()
+seqNode node
+ = seqVirtualReg (Color.nodeId node)
+ `seq` seqRegClass (Color.nodeClass node)
+ `seq` seqMaybeRealReg (Color.nodeColor node)
+ `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeConflicts node)))
+ `seq` (seqRealRegList (nonDetEltsUniqSet (Color.nodeExclusions node)))
+ `seq` (seqRealRegList (Color.nodePreference node))
+ `seq` (seqVirtualRegList (nonDetEltsUniqSet (Color.nodeCoalesce node)))
+ -- It's OK to use nonDetEltsUniqSet for seq
+
+seqVirtualReg :: VirtualReg -> ()
+seqVirtualReg reg = reg `seq` ()
+
+seqRealReg :: RealReg -> ()
+seqRealReg reg = reg `seq` ()
+
+seqRegClass :: RegClass -> ()
+seqRegClass c = c `seq` ()
+
+seqMaybeRealReg :: Maybe RealReg -> ()
+seqMaybeRealReg mr
+ = case mr of
+ Nothing -> ()
+ Just r -> seqRealReg r
+
+seqVirtualRegList :: [VirtualReg] -> ()
+seqVirtualRegList rs
+ = case rs of
+ [] -> ()
+ (r : rs) -> seqVirtualReg r `seq` seqVirtualRegList rs
+
+seqRealRegList :: [RealReg] -> ()
+seqRealRegList rs
+ = case rs of
+ [] -> ()
+ (r : rs) -> seqRealReg r `seq` seqRealRegList rs
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs
new file mode 100644
index 0000000000..95fa174415
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Base.hs
@@ -0,0 +1,163 @@
+
+-- | Utils for calculating the general worst, bound, squeese and free functions,
+--
+-- as per: "A Generalized Algorithm for Graph-Coloring Register Allocation"
+-- Michael Smith, Norman Ramsey, Glenn Holloway.
+-- PLDI 2004
+--
+-- These general versions are not used in GHC proper because they are too slow.
+-- Instead, hand written optimised versions are provided for each architecture
+-- in MachRegs*.hs
+--
+-- This code is here because we can test the architecture specific code against
+-- it.
+--
+module GHC.CmmToAsm.Reg.Graph.Base (
+ RegClass(..),
+ Reg(..),
+ RegSub(..),
+
+ worst,
+ bound,
+ squeese
+) where
+
+import GhcPrelude
+
+import UniqSet
+import UniqFM
+import Unique
+import MonadUtils (concatMapM)
+
+
+-- Some basic register classes.
+-- These aren't necessarily in 1-to-1 correspondence with the allocatable
+-- RegClasses in MachRegs.hs
+data RegClass
+ -- general purpose regs
+ = ClassG32 -- 32 bit GPRs
+ | ClassG16 -- 16 bit GPRs
+ | ClassG8 -- 8 bit GPRs
+
+ -- floating point regs
+ | ClassF64 -- 64 bit FPRs
+ deriving (Show, Eq, Enum)
+
+
+-- | A register of some class
+data Reg
+ -- a register of some class
+ = Reg RegClass Int
+
+ -- a sub-component of one of the other regs
+ | RegSub RegSub Reg
+ deriving (Show, Eq)
+
+
+-- | so we can put regs in UniqSets
+instance Uniquable Reg where
+ getUnique (Reg c i)
+ = mkRegSingleUnique
+ $ fromEnum c * 1000 + i
+
+ getUnique (RegSub s (Reg c i))
+ = mkRegSubUnique
+ $ fromEnum s * 10000 + fromEnum c * 1000 + i
+
+ getUnique (RegSub _ (RegSub _ _))
+ = error "RegArchBase.getUnique: can't have a sub-reg of a sub-reg."
+
+
+-- | A subcomponent of another register
+data RegSub
+ = SubL16 -- lowest 16 bits
+ | SubL8 -- lowest 8 bits
+ | SubL8H -- second lowest 8 bits
+ deriving (Show, Enum, Ord, Eq)
+
+
+-- | Worst case displacement
+--
+-- a node N of classN has some number of neighbors,
+-- all of which are from classC.
+--
+-- (worst neighbors classN classC) is the maximum number of potential
+-- colors for N that can be lost by coloring its neighbors.
+--
+-- This should be hand coded/cached for each particular architecture,
+-- because the compute time is very long.
+worst :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> Int -> RegClass -> RegClass -> Int
+
+worst regsOfClass regAlias neighbors classN classC
+ = let regAliasS regs = unionManyUniqSets
+ $ map regAlias
+ $ nonDetEltsUniqSet regs
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- all the regs in classes N, C
+ regsN = regsOfClass classN
+ regsC = regsOfClass classC
+
+ -- all the non-empty subsets of regsC with at most `neighbors` elements
+ regsS = filter (\s -> sizeUniqSet s >= 1
+ && sizeUniqSet s <= neighbors)
+ $ powersetLS regsC
+
+ -- for each of the subsets of C, the regs which conflict
+ -- with possibilities for N
+ regsS_conflict
+ = map (\s -> intersectUniqSets regsN (regAliasS s)) regsS
+
+ in maximum $ map sizeUniqSet $ regsS_conflict
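+
+-- A small hedged example (added commentary): with no aliasing, i.e.
+-- regAlias r = unitUniqSet r, and classN == classC with k allocatable
+-- registers, a node with n >= 1 neighbours can lose at most min n k
+-- colours, so
+--
+--   worst regsOfClass regAlias n c c == min n k
+--
+-- The interesting cases are architectures where registers overlap, e.g.
+-- 8-bit and 32-bit x86 registers occupying the same physical register.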
+
+
+-- | For a node N of classN and neighbors of classesC
+-- (bound classN classesC) is the maximum number of potential
+-- colors for N that can be lost by coloring its neighbors.
+bound :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> RegClass -> [RegClass] -> Int
+
+bound regsOfClass regAlias classN classesC
+ = let regAliasS regs = unionManyUniqSets
+ $ map regAlias
+ $ nonDetEltsUFM regs
+ -- See Note [Unique Determinism and code generation]
+
+ regsC_aliases
+ = unionManyUniqSets
+ $ map (regAliasS . getUniqSet . regsOfClass) classesC
+
+ overlap = intersectUniqSets (regsOfClass classN) regsC_aliases
+
+ in sizeUniqSet overlap
+
+
+-- | The total squeese on a particular node with a list of neighbors.
+--
+-- A version of this should be constructed for each particular architecture,
+-- possibly including uses of bound, so that aliased registers don't get
+-- counted twice, as per the paper.
+squeese :: (RegClass -> UniqSet Reg)
+ -> (Reg -> UniqSet Reg)
+ -> RegClass -> [(Int, RegClass)] -> Int
+
+squeese regsOfClass regAlias classN countCs
+ = sum
+ $ map (\(i, classC) -> worst regsOfClass regAlias i classN classC)
+ $ countCs
+
+
+-- | powerset (for lists)
+powersetL :: [a] -> [[a]]
+powersetL = concatMapM (\x -> [[],[x]])
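+-- e.g. powersetL [1,2] == [[],[2],[1],[1,2]] (added commentary; this uses
+-- concatMapM in the list monad, so every element is independently either
+-- dropped or kept).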
+
+
+-- | powersetLS (list of sets)
+powersetLS :: Uniquable a => UniqSet a -> [UniqSet a]
+powersetLS s = map mkUniqSet $ powersetL $ nonDetEltsUniqSet s
+ -- See Note [Unique Determinism and code generation]
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs
new file mode 100644
index 0000000000..d223137dd0
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Coalesce.hs
@@ -0,0 +1,99 @@
+-- | Register coalescing.
+module GHC.CmmToAsm.Reg.Graph.Coalesce (
+ regCoalesce,
+ slurpJoinMovs
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm
+import Bag
+import Digraph
+import UniqFM
+import UniqSet
+import UniqSupply
+
+
+-- | Do register coalescing on this top level thing
+--
+-- For Reg -> Reg moves, if the first reg dies at the same time the
+-- second reg is born then the mov only serves to join live ranges.
+-- The two regs can be renamed to be the same and the move instruction
+-- safely erased.
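+--
+-- For example: given a move from v1 to v2 where v1 dies and v2 is born at
+-- that instruction, we record the pair (v1, v2), rename the lexically
+-- larger register to the smaller one, and patchEraseLive then drops the
+-- resulting self-move.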
+regCoalesce
+ :: Instruction instr
+ => [LiveCmmDecl statics instr]
+ -> UniqSM [LiveCmmDecl statics instr]
+
+regCoalesce code
+ = do
+ let joins = foldl' unionBags emptyBag
+ $ map slurpJoinMovs code
+
+ let alloc = foldl' buildAlloc emptyUFM
+ $ bagToList joins
+
+ let patched = map (patchEraseLive (sinkReg alloc)) code
+
+ return patched
+
+
+-- | Add a v1 = v2 register renaming to the map.
+-- The register with the lowest lexical name is set as the
+-- canonical version.
+buildAlloc :: UniqFM Reg -> (Reg, Reg) -> UniqFM Reg
+buildAlloc fm (r1, r2)
+ = let rmin = min r1 r2
+ rmax = max r1 r2
+ in addToUFM fm rmax rmin
+
+
+-- | Determine the canonical name for a register by following
+-- v1 = v2 renamings in this map.
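+--
+-- For example, with renamings v3 -> v2 and v2 -> v1 in the map,
+-- sinkReg resolves both v3 and v2 to the canonical register v1.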
+sinkReg :: UniqFM Reg -> Reg -> Reg
+sinkReg fm r
+ = case lookupUFM fm r of
+ Nothing -> r
+ Just r' -> sinkReg fm r'
+
+
+-- | Slurp out mov instructions that only serve to join live ranges.
+--
+-- During a mov, if the source reg dies and the destination reg is
+-- born then we can rename the two regs to the same thing and
+-- eliminate the move.
+slurpJoinMovs
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> Bag (Reg, Reg)
+
+slurpJoinMovs live
+ = slurpCmm emptyBag live
+ where
+ slurpCmm rs CmmData{}
+ = rs
+
+ slurpCmm rs (CmmProc _ _ _ sccs)
+ = foldl' slurpBlock rs (flattenSCCs sccs)
+
+ slurpBlock rs (BasicBlock _ instrs)
+ = foldl' slurpLI rs instrs
+
+ slurpLI rs (LiveInstr _ Nothing) = rs
+ slurpLI rs (LiveInstr instr (Just live))
+ | Just (r1, r2) <- takeRegRegMoveInstr instr
+ , elementOfUniqSet r1 $ liveDieRead live
+ , elementOfUniqSet r2 $ liveBorn live
+
+ -- only coalesce movs between two virtuals for now,
+ -- else we end up with allocatable regs in the live
+ -- regs list..
+ , isVirtualReg r1 && isVirtualReg r2
+ = consBag (r1, r2) rs
+
+ | otherwise
+ = rs
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs
new file mode 100644
index 0000000000..a0e11433f7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Spill.hs
@@ -0,0 +1,382 @@
+
+-- | When there aren't enough registers to hold all the vregs we have to spill
+-- some of those vregs to slots on the stack. This module is used to modify the
+-- code to use those slots.
+module GHC.CmmToAsm.Reg.Graph.Spill (
+ regSpill,
+ SpillStats(..),
+ accSpillSL
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.Cmm hiding (RegSet)
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+
+import MonadUtils
+import State
+import Unique
+import UniqFM
+import UniqSet
+import UniqSupply
+import Outputable
+import GHC.Platform
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+import qualified Data.IntSet as IntSet
+
+
+-- | Spill all these virtual regs to stack slots.
+--
+-- Bumps the number of required stack slots if necessary.
+--
+--
+-- TODO: See if we can split some of the live ranges instead of just globally
+-- spilling the virtual reg. This might make the spill cleaner's job easier.
+--
+-- TODO: On CISCy x86 and x86_64 we don't necessarily have to add a mov instruction
+-- when making spills. If an instr is using a spilled virtual we may be able to
+-- address the spill slot directly.
+--
+regSpill
+ :: Instruction instr
+ => Platform
+ -> [LiveCmmDecl statics instr] -- ^ the code
+ -> UniqSet Int -- ^ available stack slots
+ -> Int -- ^ current number of spill slots.
+ -> UniqSet VirtualReg -- ^ the regs to spill
+ -> UniqSM
+ ([LiveCmmDecl statics instr]
+ -- code with SPILL and RELOAD meta instructions added.
+ , UniqSet Int -- left over slots
+ , Int -- slot count in use now.
+ , SpillStats ) -- stats about what happened during spilling
+
+regSpill platform code slotsFree slotCount regs
+
+ -- Not enough slots to spill these regs.
+ | sizeUniqSet slotsFree < sizeUniqSet regs
+ = -- pprTrace "Bumping slot count:" (ppr slotCount <> text " -> " <> ppr (slotCount+512)) $
+ let slotsFree' = (addListToUniqSet slotsFree [slotCount+1 .. slotCount+512])
+ in regSpill platform code slotsFree' (slotCount+512) regs
+
+ | otherwise
+ = do
+ -- Allocate a slot for each of the spilled regs.
+ let slots = take (sizeUniqSet regs) $ nonDetEltsUniqSet slotsFree
+ let regSlotMap = listToUFM
+ $ zip (nonDetEltsUniqSet regs) slots
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- Grab the unique supply from the monad.
+ us <- getUniqueSupplyM
+
+ -- Run the spiller on all the blocks.
+ let (code', state') =
+ runState (mapM (regSpill_top platform regSlotMap) code)
+ (initSpillS us)
+
+ return ( code'
+ , minusUniqSet slotsFree (mkUniqSet slots)
+ , slotCount
+ , makeSpillStats state')
+
+
+-- | Spill some registers to stack slots in a top-level thing.
+regSpill_top
+ :: Instruction instr
+ => Platform
+ -> RegMap Int
+ -- ^ map of vregs to slots they're being spilled to.
+ -> LiveCmmDecl statics instr
+ -- ^ the top level thing.
+ -> SpillM (LiveCmmDecl statics instr)
+
+regSpill_top platform regSlotMap cmm
+ = case cmm of
+ CmmData{}
+ -> return cmm
+
+ CmmProc info label live sccs
+ | LiveInfo static firstId liveVRegsOnEntry liveSlotsOnEntry <- info
+ -> do
+ -- The liveVRegsOnEntry contains the set of vregs that are live
+ -- on entry to each basic block. If we spill one of those vregs
+ -- we remove it from that set and add the corresponding slot
+ -- number to the liveSlotsOnEntry set. The spill cleaner needs
+ -- this information to erase unneeded spill and reload instructions
+ -- after we've done a successful allocation.
+ let liveSlotsOnEntry' :: BlockMap IntSet
+ liveSlotsOnEntry'
+ = mapFoldlWithKey patchLiveSlot
+ liveSlotsOnEntry liveVRegsOnEntry
+
+ let info'
+ = LiveInfo static firstId
+ liveVRegsOnEntry
+ liveSlotsOnEntry'
+
+ -- Apply the spiller to all the basic blocks in the CmmProc.
+ sccs' <- mapM (mapSCCM (regSpill_block platform regSlotMap)) sccs
+
+ return $ CmmProc info' label live sccs'
+
+ where -- Given a BlockId and the set of registers live in it,
+ -- if registers in this block are being spilled to stack slots,
+ -- then record the fact that these slots are now live in those blocks
+ -- in the given slotmap.
+ patchLiveSlot
+ :: BlockMap IntSet -> BlockId -> RegSet -> BlockMap IntSet
+
+ patchLiveSlot slotMap blockId regsLive
+ = let
+ -- Slots that are already recorded as being live.
+ curSlotsLive = fromMaybe IntSet.empty
+ $ mapLookup blockId slotMap
+
+ moreSlotsLive = IntSet.fromList
+ $ catMaybes
+ $ map (lookupUFM regSlotMap)
+ $ nonDetEltsUniqSet regsLive
+ -- See Note [Unique Determinism and code generation]
+
+ slotMap'
+ = mapInsert blockId (IntSet.union curSlotsLive moreSlotsLive)
+ slotMap
+
+ in slotMap'
+
+
+-- | Spill some registers to stack slots in a basic block.
+regSpill_block
+ :: Instruction instr
+ => Platform
+ -> UniqFM Int -- ^ map of vregs to slots they're being spilled to.
+ -> LiveBasicBlock instr
+ -> SpillM (LiveBasicBlock instr)
+
+regSpill_block platform regSlotMap (BasicBlock i instrs)
+ = do instrss' <- mapM (regSpill_instr platform regSlotMap) instrs
+ return $ BasicBlock i (concat instrss')
+
+
+-- | Spill some registers to stack slots in a single instruction.
+-- If the instruction uses registers that need to be spilled, then it is
+-- prefixed (or postfixed) with the appropriate RELOAD or SPILL meta
+-- instructions.
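+--
+-- For example (sketch): if vreg v is spilled to SLOT(3) and an instruction
+-- reads v, the instruction is rewritten to read a fresh vreg v' and is
+-- prefixed with a RELOAD SLOT(3), v' meta instruction; an instruction that
+-- writes v gets a trailing SPILL instead.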
+regSpill_instr
+ :: Instruction instr
+ => Platform
+ -> UniqFM Int -- ^ map of vregs to slots they're being spilled to.
+ -> LiveInstr instr
+ -> SpillM [LiveInstr instr]
+
+regSpill_instr _ _ li@(LiveInstr _ Nothing)
+ = do return [li]
+
+regSpill_instr platform regSlotMap
+ (LiveInstr instr (Just _))
+ = do
+ -- work out which regs are read and written in this instr
+ let RU rlRead rlWritten = regUsageOfInstr platform instr
+
+ -- sometimes a register is listed as being read more than once,
+ -- nub this so we don't end up inserting two lots of spill code.
+ let rsRead_ = nub rlRead
+ let rsWritten_ = nub rlWritten
+
+ -- if a reg is modified, it appears in both lists, want to undo this..
+ let rsRead = rsRead_ \\ rsWritten_
+ let rsWritten = rsWritten_ \\ rsRead_
+ let rsModify = intersect rsRead_ rsWritten_
+
+ -- work out if any of the regs being used are currently being spilled.
+ let rsSpillRead = filter (\r -> elemUFM r regSlotMap) rsRead
+ let rsSpillWritten = filter (\r -> elemUFM r regSlotMap) rsWritten
+ let rsSpillModify = filter (\r -> elemUFM r regSlotMap) rsModify
+
+ -- rewrite the instr and work out spill code.
+ (instr1, prepost1) <- mapAccumLM (spillRead regSlotMap) instr rsSpillRead
+ (instr2, prepost2) <- mapAccumLM (spillWrite regSlotMap) instr1 rsSpillWritten
+ (instr3, prepost3) <- mapAccumLM (spillModify regSlotMap) instr2 rsSpillModify
+
+ let (mPrefixes, mPostfixes) = unzip (prepost1 ++ prepost2 ++ prepost3)
+ let prefixes = concat mPrefixes
+ let postfixes = concat mPostfixes
+
+ -- final code
+ let instrs' = prefixes
+ ++ [LiveInstr instr3 Nothing]
+ ++ postfixes
+
+ return $ instrs'
+
+
+-- | Add a RELOAD meta instruction to load a value for an instruction that
+-- reads from a vreg that is being spilled.
+spillRead
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillRead regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 0, 1) }
+
+ return ( instr'
+ , ( [LiveInstr (RELOAD slot nReg) Nothing]
+ , []) )
+
+ | otherwise = panic "RegSpill.spillRead: no slot defined for spilled reg"
+
+
+-- | Add a SPILL meta instruction to store a value for an instruction that
+-- writes to a vreg that is being spilled.
+spillWrite
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillWrite regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 0) }
+
+ return ( instr'
+ , ( []
+ , [LiveInstr (SPILL nReg slot) Nothing]))
+
+ | otherwise = panic "RegSpill.spillWrite: no slot defined for spilled reg"
+
+
+-- | Add both RELOAD and SPILL meta instructions for an instruction that
+-- both reads and writes to a vreg that is being spilled.
+spillModify
+ :: Instruction instr
+ => UniqFM Int
+ -> instr
+ -> Reg
+ -> SpillM (instr, ([LiveInstr instr'], [LiveInstr instr']))
+
+spillModify regSlotMap instr reg
+ | Just slot <- lookupUFM regSlotMap reg
+ = do (instr', nReg) <- patchInstr reg instr
+
+ modify $ \s -> s
+ { stateSpillSL = addToUFM_C accSpillSL (stateSpillSL s) reg (reg, 1, 1) }
+
+ return ( instr'
+ , ( [LiveInstr (RELOAD slot nReg) Nothing]
+ , [LiveInstr (SPILL nReg slot) Nothing]))
+
+ | otherwise = panic "RegSpill.spillModify: no slot defined for spilled reg"
+
+
+-- | Rewrite uses of this virtual reg in an instr to use a different
+-- virtual reg.
+patchInstr
+ :: Instruction instr
+ => Reg -> instr -> SpillM (instr, Reg)
+
+patchInstr reg instr
+ = do nUnique <- newUnique
+
+ -- The register we're rewriting is supposed to be virtual.
+ -- If it's not then something has gone horribly wrong.
+ let nReg
+ = case reg of
+ RegVirtual vr
+ -> RegVirtual (renameVirtualReg nUnique vr)
+
+ RegReal{}
+ -> panic "RegAlloc.Graph.Spill.patchIntr: not patching real reg"
+
+ let instr' = patchReg1 reg nReg instr
+ return (instr', nReg)
+
+
+patchReg1
+ :: Instruction instr
+ => Reg -> Reg -> instr -> instr
+
+patchReg1 old new instr
+ = let patchF r
+ | r == old = new
+ | otherwise = r
+ in patchRegsOfInstr instr patchF
+
+
+-- Spiller monad --------------------------------------------------------------
+-- | State monad for the spill code generator.
+type SpillM a
+ = State SpillS a
+
+-- | Spill code generator state.
+data SpillS
+ = SpillS
+ { -- | Unique supply for generating fresh vregs.
+ stateUS :: UniqSupply
+
+ -- | Spilled vreg vs the number of times it was stored, loaded.
+ , stateSpillSL :: UniqFM (Reg, Int, Int) }
+
+
+-- | Create a new spiller state.
+initSpillS :: UniqSupply -> SpillS
+initSpillS uniqueSupply
+ = SpillS
+ { stateUS = uniqueSupply
+ , stateSpillSL = emptyUFM }
+
+
+-- | Allocate a new unique in the spiller monad.
+newUnique :: SpillM Unique
+newUnique
+ = do us <- gets stateUS
+ case takeUniqFromSupply us of
+ (uniq, us')
+ -> do modify $ \s -> s { stateUS = us' }
+ return uniq
+
+
+-- | Add a spill/reload count to a stats record for a register.
+accSpillSL :: (Reg, Int, Int) -> (Reg, Int, Int) -> (Reg, Int, Int)
+accSpillSL (r1, s1, l1) (_, s2, l2)
+ = (r1, s1 + s2, l1 + l2)
+
+
+-- Spiller stats --------------------------------------------------------------
+-- | Spiller statistics.
+-- Tells us what registers were spilled.
+data SpillStats
+ = SpillStats
+ { spillStoreLoad :: UniqFM (Reg, Int, Int) }
+
+
+-- | Extract spiller statistics from the spiller state.
+makeSpillStats :: SpillS -> SpillStats
+makeSpillStats s
+ = SpillStats
+ { spillStoreLoad = stateSpillSL s }
+
+
+instance Outputable SpillStats where
+ ppr stats
+ = pprUFM (spillStoreLoad stats)
+ (vcat . map (\(r, s, l) -> ppr r <+> int s <+> int l))
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
new file mode 100644
index 0000000000..6d14c7194b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillClean.hs
@@ -0,0 +1,616 @@
+{-# LANGUAGE CPP #-}
+
+-- | Clean out unneeded spill\/reload instructions.
+--
+-- Handling of join points
+-- ~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- B1: B2:
+-- ... ...
+-- RELOAD SLOT(0), %r1 RELOAD SLOT(0), %r1
+-- ... A ... ... B ...
+-- jump B3 jump B3
+--
+-- B3: ... C ...
+-- RELOAD SLOT(0), %r1
+-- ...
+--
+-- The Plan
+-- ~~~~~~~~
+-- As long as %r1 hasn't been written to in A, B or C then we don't need
+-- the reload in B3.
+--
+-- What we really care about here is that on the entry to B3, %r1 will
+-- always have the same value that is in SLOT(0) (ie, %r1 is _valid_)
+--
+-- This also works if the reloads in B1\/B2 were spills instead, because
+-- spilling %r1 to a slot makes that slot have the same value as %r1.
+--
+module GHC.CmmToAsm.Reg.Graph.SpillClean (
+ cleanSpills
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import UniqSet
+import UniqFM
+import Unique
+import State
+import Outputable
+import GHC.Platform
+import GHC.Cmm.Dataflow.Collections
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+import qualified Data.IntSet as IntSet
+
+
+-- | The identification number of a spill slot.
+-- A value is stored in a spill slot when we don't have a free
+-- register to hold it.
+type Slot = Int
+
+
+-- | Clean out unneeded spill\/reloads from this top level thing.
+cleanSpills
+ :: Instruction instr
+ => Platform
+ -> LiveCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+cleanSpills platform cmm
+ = evalState (cleanSpin platform 0 cmm) initCleanS
+
+
+-- | Do one pass of cleaning.
+cleanSpin
+ :: Instruction instr
+ => Platform
+ -> Int -- ^ Iteration number for the cleaner.
+ -> LiveCmmDecl statics instr -- ^ Liveness annotated code to clean.
+ -> CleanM (LiveCmmDecl statics instr)
+
+cleanSpin platform spinCount code
+ = do
+ -- Initialise count of cleaned spill and reload instructions.
+ modify $ \s -> s
+ { sCleanedSpillsAcc = 0
+ , sCleanedReloadsAcc = 0
+ , sReloadedBy = emptyUFM }
+
+ code_forward <- mapBlockTopM (cleanBlockForward platform) code
+ code_backward <- cleanTopBackward code_forward
+
+ -- During the cleaning of each block we collected information about
+ -- what regs were valid across each jump. Based on this, work out
+ -- whether it will be safe to erase reloads after join points for
+ -- the next pass.
+ collateJoinPoints
+
+ -- Remember how many spill and reload instructions we cleaned in this pass.
+ spills <- gets sCleanedSpillsAcc
+ reloads <- gets sCleanedReloadsAcc
+ modify $ \s -> s
+ { sCleanedCount = (spills, reloads) : sCleanedCount s }
+
+ -- If nothing was cleaned in this pass or the last one
+ -- then we're done and it's time to bail out.
+ cleanedCount <- gets sCleanedCount
+ if take 2 cleanedCount == [(0, 0), (0, 0)]
+ then return code
+
+ -- otherwise go around again
+ else cleanSpin platform (spinCount + 1) code_backward
+
+
+-------------------------------------------------------------------------------
+-- | Clean out unneeded reload instructions,
+-- while walking forward over the code.
+cleanBlockForward
+ :: Instruction instr
+ => Platform
+ -> LiveBasicBlock instr
+ -> CleanM (LiveBasicBlock instr)
+
+cleanBlockForward platform (BasicBlock blockId instrs)
+ = do
+ -- See if we have a valid association for the entry to this block.
+ jumpValid <- gets sJumpValid
+ let assoc = case lookupUFM jumpValid blockId of
+ Just assoc -> assoc
+ Nothing -> emptyAssoc
+
+ instrs_reload <- cleanForward platform blockId assoc [] instrs
+ return $ BasicBlock blockId instrs_reload
+
+
+
+-- | Clean out unneeded reload instructions.
+--
+-- Walking forwards across the code
+-- On a reload, if we know a reg already has the same value as a slot
+-- then we don't need to do the reload.
+--
+cleanForward
+ :: Instruction instr
+ => Platform
+ -> BlockId -- ^ the block that we're currently in
+ -> Assoc Store -- ^ two store locations are associated if
+ -- they have the same value
+ -> [LiveInstr instr] -- ^ acc
+ -> [LiveInstr instr] -- ^ instrs to clean (in backwards order)
+ -> CleanM [LiveInstr instr] -- ^ cleaned instrs (in forward order)
+
+cleanForward _ _ _ acc []
+ = return acc
+
+-- Rewrite live range joins via spill slots to just a spill and a reg-reg move
+-- hopefully the spill will also be cleaned in the next pass
+cleanForward platform blockId assoc acc (li1 : li2 : instrs)
+
+ | LiveInstr (SPILL reg1 slot1) _ <- li1
+ , LiveInstr (RELOAD slot2 reg2) _ <- li2
+ , slot1 == slot2
+ = do
+ modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+ cleanForward platform blockId assoc acc
+ $ li1 : LiveInstr (mkRegRegMoveInstr platform reg1 reg2) Nothing
+ : instrs
+
+cleanForward platform blockId assoc acc (li@(LiveInstr i1 _) : instrs)
+ | Just (r1, r2) <- takeRegRegMoveInstr i1
+ = if r1 == r2
+ -- Erase any left over nop reg reg moves while we're here
+ -- this will also catch any nop moves that the previous case
+ -- happens to add.
+ then cleanForward platform blockId assoc acc instrs
+
+ -- If r1 has the same value as some slots and we copy r1 to r2,
+ -- then r2 is now associated with those slots instead
+ else do let assoc' = addAssoc (SReg r1) (SReg r2)
+ $ delAssoc (SReg r2)
+ $ assoc
+
+ cleanForward platform blockId assoc' (li : acc) instrs
+
+
+cleanForward platform blockId assoc acc (li : instrs)
+
+ -- Update association due to the spill.
+ | LiveInstr (SPILL reg slot) _ <- li
+ = let assoc' = addAssoc (SReg reg) (SSlot slot)
+ $ delAssoc (SSlot slot)
+ $ assoc
+ in cleanForward platform blockId assoc' (li : acc) instrs
+
+ -- Clean a reload instr.
+ | LiveInstr (RELOAD{}) _ <- li
+ = do (assoc', mli) <- cleanReload platform blockId assoc li
+ case mli of
+ Nothing -> cleanForward platform blockId assoc' acc
+ instrs
+
+ Just li' -> cleanForward platform blockId assoc' (li' : acc)
+ instrs
+
+ -- Remember the association over a jump.
+ | LiveInstr instr _ <- li
+ , targets <- jumpDestsOfInstr instr
+ , not $ null targets
+ = do mapM_ (accJumpValid assoc) targets
+ cleanForward platform blockId assoc (li : acc) instrs
+
+ -- Writing to a reg changes its value.
+ | LiveInstr instr _ <- li
+ , RU _ written <- regUsageOfInstr platform instr
+ = let assoc' = foldr delAssoc assoc (map SReg $ nub written)
+ in cleanForward platform blockId assoc' (li : acc) instrs
+
+
+
+-- | Try and rewrite a reload instruction to something more pleasing
+cleanReload
+ :: Instruction instr
+ => Platform
+ -> BlockId
+ -> Assoc Store
+ -> LiveInstr instr
+ -> CleanM (Assoc Store, Maybe (LiveInstr instr))
+
+cleanReload platform blockId assoc li@(LiveInstr (RELOAD slot reg) _)
+
+ -- If the reg we're reloading already has the same value as the slot
+ -- then we can erase the instruction outright.
+ | elemAssoc (SSlot slot) (SReg reg) assoc
+ = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+ return (assoc, Nothing)
+
+ -- If we can find another reg with the same value as this slot then
+ -- do a move instead of a reload.
+ | Just reg2 <- findRegOfSlot assoc slot
+ = do modify $ \s -> s { sCleanedReloadsAcc = sCleanedReloadsAcc s + 1 }
+
+ let assoc' = addAssoc (SReg reg) (SReg reg2)
+ $ delAssoc (SReg reg)
+ $ assoc
+
+ return ( assoc'
+ , Just $ LiveInstr (mkRegRegMoveInstr platform reg2 reg) Nothing)
+
+ -- Gotta keep this instr.
+ | otherwise
+ = do -- Update the association.
+ let assoc'
+ = addAssoc (SReg reg) (SSlot slot)
+ -- doing the reload makes reg and slot the same value
+ $ delAssoc (SReg reg)
+ -- reg value changes on reload
+ $ assoc
+
+ -- Remember that this block reloads from this slot.
+ accBlockReloadsSlot blockId slot
+
+ return (assoc', Just li)
+
+cleanReload _ _ _ _
+ = panic "RegSpillClean.cleanReload: unhandled instr"
+
+
+-------------------------------------------------------------------------------
+-- | Clean out unneeded spill instructions,
+-- while walking backwards over the code.
+--
+-- If there are no reloads from a slot between one spill to it and the next,
+-- then the value written by the first spill is never read and we don't need
+-- that spill.
+--
+-- SPILL r0 -> s1
+-- RELOAD s1 -> r2
+-- SPILL r3 -> s1 <--- don't need this spill
+-- SPILL r4 -> s1
+-- RELOAD s1 -> r5
+--
+-- Maintain a set of
+-- "slots which were spilled to but not reloaded from yet"
+--
+-- Walking backwards across the code:
+-- a) On a reload from a slot, remove it from the set.
+--
+-- b) On a spill to a slot
+-- If the slot is in the set then we can erase the spill,
+-- because it won't be reloaded from until after the next spill.
+--
+-- otherwise
+-- keep the spill and add the slot to the set
+--
+-- TODO: This is mostly inter-block
+-- we should really be updating the noReloads set as we cross jumps also.
+--
+-- TODO: generate noReloads from liveSlotsOnEntry
+--
+cleanTopBackward
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> CleanM (LiveCmmDecl statics instr)
+
+cleanTopBackward cmm
+ = case cmm of
+ CmmData{}
+ -> return cmm
+
+ CmmProc info label live sccs
+ | LiveInfo _ _ _ liveSlotsOnEntry <- info
+ -> do sccs' <- mapM (mapSCCM (cleanBlockBackward liveSlotsOnEntry)) sccs
+ return $ CmmProc info label live sccs'
+
+
+cleanBlockBackward
+ :: Instruction instr
+ => BlockMap IntSet
+ -> LiveBasicBlock instr
+ -> CleanM (LiveBasicBlock instr)
+
+cleanBlockBackward liveSlotsOnEntry (BasicBlock blockId instrs)
+ = do instrs_spill <- cleanBackward liveSlotsOnEntry emptyUniqSet [] instrs
+ return $ BasicBlock blockId instrs_spill
+
+
+
+cleanBackward
+ :: Instruction instr
+ => BlockMap IntSet -- ^ Slots live on entry to each block
+ -> UniqSet Int -- ^ Slots that have been spilled, but not reloaded from
+ -> [LiveInstr instr] -- ^ acc
+ -> [LiveInstr instr] -- ^ Instrs to clean (in forwards order)
+ -> CleanM [LiveInstr instr] -- ^ Cleaned instrs (in backwards order)
+
+cleanBackward liveSlotsOnEntry noReloads acc lis
+ = do reloadedBy <- gets sReloadedBy
+ cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc lis
+
+
+cleanBackward'
+ :: Instruction instr
+ => BlockMap IntSet
+ -> UniqFM [BlockId]
+ -> UniqSet Int
+ -> [LiveInstr instr]
+ -> [LiveInstr instr]
+ -> State CleanS [LiveInstr instr]
+
+cleanBackward' _ _ _ acc []
+ = return acc
+
+cleanBackward' liveSlotsOnEntry reloadedBy noReloads acc (li : instrs)
+
+ -- If nothing ever reloads from this slot then we don't need the spill.
+ | LiveInstr (SPILL _ slot) _ <- li
+ , Nothing <- lookupUFM reloadedBy (SSlot slot)
+ = do modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 }
+ cleanBackward liveSlotsOnEntry noReloads acc instrs
+
+ | LiveInstr (SPILL _ slot) _ <- li
+ = if elementOfUniqSet slot noReloads
+
+ -- We can erase this spill because the slot won't be read until
+ -- after the next one
+ then do
+ modify $ \s -> s { sCleanedSpillsAcc = sCleanedSpillsAcc s + 1 }
+ cleanBackward liveSlotsOnEntry noReloads acc instrs
+
+ else do
+ -- This slot is being spilled to, but we haven't seen any reloads yet.
+ let noReloads' = addOneToUniqSet noReloads slot
+ cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+ -- if we reload from a slot then it's no longer unused
+ | LiveInstr (RELOAD slot _) _ <- li
+ , noReloads' <- delOneFromUniqSet noReloads slot
+ = cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+ -- If a slot is live in a jump target then assume it's reloaded there.
+ --
+ -- TODO: A real dataflow analysis would do a better job here.
+ -- If the target block _ever_ used the slot then we assume
+ -- it always does, but if those reloads are cleaned the slot
+ -- liveness map doesn't get updated.
+ | LiveInstr instr _ <- li
+ , targets <- jumpDestsOfInstr instr
+ = do
+ let slotsReloadedByTargets
+ = IntSet.unions
+ $ catMaybes
+ $ map (flip mapLookup liveSlotsOnEntry)
+ $ targets
+
+ let noReloads'
+ = foldl' delOneFromUniqSet noReloads
+ $ IntSet.toList slotsReloadedByTargets
+
+ cleanBackward liveSlotsOnEntry noReloads' (li : acc) instrs
+
+#if __GLASGOW_HASKELL__ <= 810
+ -- some other instruction
+ | otherwise
+ = cleanBackward liveSlotsOnEntry noReloads (li : acc) instrs
+#endif
+
+
+-- | Combine the associations from all the inward control flow edges.
+--
+collateJoinPoints :: CleanM ()
+collateJoinPoints
+ = modify $ \s -> s
+ { sJumpValid = mapUFM intersects (sJumpValidAcc s)
+ , sJumpValidAcc = emptyUFM }
+
+intersects :: [Assoc Store] -> Assoc Store
+intersects [] = emptyAssoc
+intersects assocs = foldl1' intersectAssoc assocs
+
+
+-- | See if we have a reg with the same value as this slot in the association table.
+findRegOfSlot :: Assoc Store -> Int -> Maybe Reg
+findRegOfSlot assoc slot
+ | close <- closeAssoc (SSlot slot) assoc
+ , Just (SReg reg) <- find isStoreReg $ nonDetEltsUniqSet close
+ -- See Note [Unique Determinism and code generation]
+ = Just reg
+
+ | otherwise
+ = Nothing
+
+
+-------------------------------------------------------------------------------
+-- | Cleaner monad.
+type CleanM
+ = State CleanS
+
+-- | Cleaner state.
+data CleanS
+ = CleanS
+ { -- | Regs which are valid at the start of each block.
+ sJumpValid :: UniqFM (Assoc Store)
+
+ -- | Collecting up what regs were valid across each jump.
+ -- in the next pass we can collate these and write the results
+ -- to sJumpValid.
+ , sJumpValidAcc :: UniqFM [Assoc Store]
+
+ -- | Map of (slot -> blocks which reload from this slot)
+ -- used to decide whether a slot spilled to will ever be
+ -- reloaded from on this path.
+ , sReloadedBy :: UniqFM [BlockId]
+
+ -- | Spills and reloads cleaned each pass (latest at front)
+ , sCleanedCount :: [(Int, Int)]
+
+ -- | Spills and reloads that have been cleaned in this pass so far.
+ , sCleanedSpillsAcc :: Int
+ , sCleanedReloadsAcc :: Int }
+
+
+-- | Construct the initial cleaner state.
+initCleanS :: CleanS
+initCleanS
+ = CleanS
+ { sJumpValid = emptyUFM
+ , sJumpValidAcc = emptyUFM
+
+ , sReloadedBy = emptyUFM
+
+ , sCleanedCount = []
+
+ , sCleanedSpillsAcc = 0
+ , sCleanedReloadsAcc = 0 }
+
+
+-- | Remember the associations before a jump.
+accJumpValid :: Assoc Store -> BlockId -> CleanM ()
+accJumpValid assocs target
+ = modify $ \s -> s {
+ sJumpValidAcc = addToUFM_C (++)
+ (sJumpValidAcc s)
+ target
+ [assocs] }
+
+
+accBlockReloadsSlot :: BlockId -> Slot -> CleanM ()
+accBlockReloadsSlot blockId slot
+ = modify $ \s -> s {
+ sReloadedBy = addToUFM_C (++)
+ (sReloadedBy s)
+ (SSlot slot)
+ [blockId] }
+
+
+-------------------------------------------------------------------------------
+-- A store location can be a stack slot or a register
+data Store
+ = SSlot Int
+ | SReg Reg
+
+
+-- | Check if this is a reg store.
+isStoreReg :: Store -> Bool
+isStoreReg ss
+ = case ss of
+ SSlot _ -> False
+ SReg _ -> True
+
+
+-- Spill cleaning is only done once all virtuals have been allocated to realRegs
+instance Uniquable Store where
+ getUnique (SReg r)
+ | RegReal (RealRegSingle i) <- r
+ = mkRegSingleUnique i
+
+ | RegReal (RealRegPair r1 r2) <- r
+ = mkRegPairUnique (r1 * 65535 + r2)
+
+ | otherwise
+ = error $ "RegSpillClean.getUnique: found virtual reg during spill clean,"
+ ++ "only real regs expected."
+
+ getUnique (SSlot i) = mkRegSubUnique i -- [SLPJ] I hope "SubUnique" is ok
+
+
+instance Outputable Store where
+ ppr (SSlot i) = text "slot" <> int i
+ ppr (SReg r) = ppr r
+
+
+-------------------------------------------------------------------------------
+-- Association graphs.
+-- In the spill cleaner, two store locations are associated if they are known
+-- to hold the same value.
+--
+type Assoc a = UniqFM (UniqSet a)
+
+-- | An empty association
+emptyAssoc :: Assoc a
+emptyAssoc = emptyUFM
+
+
+-- | Add an association between these two things.
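+-- The association is symmetric: for example, after
+-- addAssoc (SReg r) (SSlot 0) emptyAssoc, both
+-- elemAssoc (SReg r) (SSlot 0) and elemAssoc (SSlot 0) (SReg r) hold.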
+addAssoc :: Uniquable a
+ => a -> a -> Assoc a -> Assoc a
+
+addAssoc a b m
+ = let m1 = addToUFM_C unionUniqSets m a (unitUniqSet b)
+ m2 = addToUFM_C unionUniqSets m1 b (unitUniqSet a)
+ in m2
+
+
+-- | Delete all associations to a node.
+delAssoc :: (Uniquable a)
+ => a -> Assoc a -> Assoc a
+
+delAssoc a m
+ | Just aSet <- lookupUFM m a
+ , m1 <- delFromUFM m a
+ = nonDetFoldUniqSet (\x m -> delAssoc1 x a m) m1 aSet
+ -- It's OK to use nonDetFoldUniqSet here because deletion is commutative
+
+ | otherwise = m
+
+
+-- | Delete a single association edge (a -> b).
+delAssoc1 :: Uniquable a
+ => a -> a -> Assoc a -> Assoc a
+
+delAssoc1 a b m
+ | Just aSet <- lookupUFM m a
+ = addToUFM m a (delOneFromUniqSet aSet b)
+
+ | otherwise = m
+
+
+-- | Check if these two things are associated.
+elemAssoc :: (Uniquable a)
+ => a -> a -> Assoc a -> Bool
+
+elemAssoc a b m
+ = elementOfUniqSet b (closeAssoc a m)
+
+
+-- | Find the refl. trans. closure of the association from this point.
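+--
+-- For example, after associating SReg r1 with SSlot 0 and SSlot 0 with
+-- SReg r2, closeAssoc (SReg r1) contains all three stores: r1, slot 0
+-- and r2.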
+closeAssoc :: (Uniquable a)
+ => a -> Assoc a -> UniqSet a
+
+closeAssoc a assoc
+ = closeAssoc' assoc emptyUniqSet (unitUniqSet a)
+ where
+ closeAssoc' assoc visited toVisit
+ = case nonDetEltsUniqSet toVisit of
+ -- See Note [Unique Determinism and code generation]
+
+ -- nothing else to visit, we're done
+ [] -> visited
+
+ (x:_)
+ -- we've already seen this node
+ | elementOfUniqSet x visited
+ -> closeAssoc' assoc visited (delOneFromUniqSet toVisit x)
+
+ -- haven't seen this node before,
+ -- remember to visit all its neighbors
+ | otherwise
+ -> let neighbors
+ = case lookupUFM assoc x of
+ Nothing -> emptyUniqSet
+ Just set -> set
+
+ in closeAssoc' assoc
+ (addOneToUniqSet visited x)
+ (unionUniqSets toVisit neighbors)
+
+-- | Intersect two associations.
+intersectAssoc :: Assoc a -> Assoc a -> Assoc a
+intersectAssoc a b
+ = intersectUFM_C (intersectUniqSets) a b
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs
new file mode 100644
index 0000000000..e3e456e98d
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/SpillCost.hs
@@ -0,0 +1,317 @@
+{-# LANGUAGE ScopedTypeVariables, GADTs, BangPatterns #-}
+module GHC.CmmToAsm.Reg.Graph.SpillCost (
+ SpillCostRecord,
+ plusSpillCostRecord,
+ pprSpillCostRecord,
+
+ SpillCostInfo,
+ zeroSpillCostInfo,
+ plusSpillCostInfo,
+
+ slurpSpillCostInfo,
+ chooseSpill,
+
+ lifeMapFromSpillCostInfo
+) where
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GraphBase
+
+import GHC.Cmm.Dataflow.Collections (mapLookup)
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm
+import UniqFM
+import UniqSet
+import Digraph (flattenSCCs)
+import Outputable
+import GHC.Platform
+import State
+import GHC.CmmToAsm.CFG
+
+import Data.List (nub, minimumBy)
+import Data.Maybe
+import Control.Monad (join)
+
+
+-- | Records the expected cost to spill some register.
+type SpillCostRecord
+ = ( VirtualReg -- register name
+ , Int -- number of writes to this reg
+ , Int -- number of reads from this reg
+ , Int) -- number of instrs this reg was live on entry to
+
+
+-- | Map of `SpillCostRecord`
+type SpillCostInfo
+ = UniqFM SpillCostRecord
+
+type SpillCostState = State (UniqFM SpillCostRecord) ()
+
+-- | An empty map of spill costs.
+zeroSpillCostInfo :: SpillCostInfo
+zeroSpillCostInfo = emptyUFM
+
+
+-- | Add two spill cost infos.
+plusSpillCostInfo :: SpillCostInfo -> SpillCostInfo -> SpillCostInfo
+plusSpillCostInfo sc1 sc2
+ = plusUFM_C plusSpillCostRecord sc1 sc2
+
+
+-- | Add two spill cost records.
+plusSpillCostRecord :: SpillCostRecord -> SpillCostRecord -> SpillCostRecord
+plusSpillCostRecord (r1, a1, b1, c1) (r2, a2, b2, c2)
+ | r1 == r2 = (r1, a1 + a2, b1 + b2, c1 + c2)
+ | otherwise = error "RegSpillCost.plusRegInt: regs don't match"
+
+
+-- | Slurp out information used for determining spill costs.
+--
+-- For each vreg, the number of times it was written to, read from,
+-- and the number of instructions it was live on entry to (lifetime)
+--
+slurpSpillCostInfo :: forall instr statics. (Outputable instr, Instruction instr)
+ => Platform
+ -> Maybe CFG
+ -> LiveCmmDecl statics instr
+ -> SpillCostInfo
+
+slurpSpillCostInfo platform cfg cmm
+ = execState (countCmm cmm) zeroSpillCostInfo
+ where
+ countCmm CmmData{} = return ()
+ countCmm (CmmProc info _ _ sccs)
+ = mapM_ (countBlock info freqMap)
+ $ flattenSCCs sccs
+ where
+ LiveInfo _ entries _ _ = info
+ freqMap = (fst . mkGlobalWeights (head entries)) <$> cfg
+
+ -- Lookup the regs that are live on entry to this block in
+ -- the info table from the CmmProc.
+ countBlock info freqMap (BasicBlock blockId instrs)
+ | LiveInfo _ _ blockLive _ <- info
+ , Just rsLiveEntry <- mapLookup blockId blockLive
+ , rsLiveEntry_virt <- takeVirtuals rsLiveEntry
+ = countLIs (ceiling $ blockFreq freqMap blockId) rsLiveEntry_virt instrs
+
+ | otherwise
+ = error "RegAlloc.SpillCost.slurpSpillCostInfo: bad block"
+
+
+ countLIs :: Int -> UniqSet VirtualReg -> [LiveInstr instr] -> SpillCostState
+ countLIs _ _ []
+ = return ()
+
+ -- Skip over comment and delta pseudo instrs.
+ countLIs scale rsLive (LiveInstr instr Nothing : lis)
+ | isMetaInstr instr
+ = countLIs scale rsLive lis
+
+ | otherwise
+ = pprPanic "RegSpillCost.slurpSpillCostInfo"
+ $ text "no liveness information on instruction " <> ppr instr
+
+ countLIs scale rsLiveEntry (LiveInstr instr (Just live) : lis)
+ = do
+ -- Increment the lifetime counts for regs live on entry to this instr.
+ mapM_ incLifetime $ nonDetEltsUniqSet rsLiveEntry
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ -- Increment counts for what regs were read/written from.
+ let (RU read written) = regUsageOfInstr platform instr
+ mapM_ (incUses scale) $ catMaybes $ map takeVirtualReg $ nub read
+ mapM_ (incDefs scale) $ catMaybes $ map takeVirtualReg $ nub written
+
+ -- Compute liveness for entry to next instruction.
+ let liveDieRead_virt = takeVirtuals (liveDieRead live)
+ let liveDieWrite_virt = takeVirtuals (liveDieWrite live)
+ let liveBorn_virt = takeVirtuals (liveBorn live)
+
+ let rsLiveAcross
+ = rsLiveEntry `minusUniqSet` liveDieRead_virt
+
+ let rsLiveNext
+ = (rsLiveAcross `unionUniqSets` liveBorn_virt)
+ `minusUniqSet` liveDieWrite_virt
+
+ countLIs scale rsLiveNext lis
+
+ incDefs count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, count, 0, 0)
+ incUses count reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, count, 0)
+ incLifetime reg = modify $ \s -> addToUFM_C plusSpillCostRecord s reg (reg, 0, 0, 1)
+
+ blockFreq :: Maybe (LabelMap Double) -> Label -> Double
+ blockFreq freqs bid
+ | Just freq <- join (mapLookup bid <$> freqs)
+ = max 1.0 (10000 * freq)
+ | otherwise
+ = 1.0 -- Only if no cfg given
+
+-- | Take all the virtual registers from this set.
+takeVirtuals :: UniqSet Reg -> UniqSet VirtualReg
+takeVirtuals set = mkUniqSet
+ [ vr | RegVirtual vr <- nonDetEltsUniqSet set ]
+ -- See Note [Unique Determinism and code generation]
+
+
+-- | Choose a node to spill from this graph
+chooseSpill
+ :: SpillCostInfo
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+
+chooseSpill info graph
+ = let cost = spillCost_length info graph
+ node = minimumBy (\n1 n2 -> compare (cost $ nodeId n1) (cost $ nodeId n2))
+ $ nonDetEltsUFM $ graphMap graph
+ -- See Note [Unique Determinism and code generation]
+
+ in nodeId node
+
+
+-------------------------------------------------------------------------------
+-- | Chaitin's spill cost function is:
+--
+--      cost(v) =   sum over u in uses(v) of  loadCost  * freq(u)
+--                + sum over d in defs(v) of  storeCost * freq(d)
+--
+-- There are no loops in our code at the moment, so we can set the freq's to 1.
+--
+-- If we don't have live range splitting then Chaitin's function performs badly
+-- if we have lots of nested live ranges and very few registers.
+--
+-- v1 v2 v3
+-- def v1 .
+-- use v1 .
+-- def v2 . .
+-- def v3 . . .
+-- use v1 . . .
+-- use v3 . . .
+-- use v2 . .
+-- use v1 .
+--
+-- defs uses degree cost
+-- v1: 1 3 3 1.5
+-- v2: 1 2 3 1.0
+-- v3: 1 1 3 0.666
+--
+-- v3 has the lowest cost, but if we only have 2 hardregs and we insert
+-- spill code for v3 then this isn't going to improve the colorability of
+-- the graph.
+--
+-- When compiling SHA1, which has very long basic blocks and some vregs
+-- with very long live ranges, the allocator seems to try to spill from
+-- the inside out and eventually runs out of stack slots.
+--
+-- Without live range splitting, it's better to spill from the outside
+-- in, so set the cost of very long live ranges to zero.
+--
+
+-- spillCost_chaitin
+-- :: SpillCostInfo
+-- -> Graph VirtualReg RegClass RealReg
+-- -> VirtualReg
+-- -> Float
+
+-- spillCost_chaitin info graph reg
+-- -- Spilling a live range that only lives for 1 instruction
+-- -- isn't going to help us at all - and we definitely want to avoid
+-- -- trying to re-spill previously inserted spill code.
+-- | lifetime <= 1 = 1/0
+
+-- -- It's unlikely that we'll find a reg for a live range this long
+-- -- better to spill it straight up and not risk trying to keep it around
+-- -- and have to go through the build/color cycle again.
+
+-- -- To facilitate this we scale down the spill cost of long ranges.
+-- -- This makes sure long ranges are still spilled first.
+-- -- But this way spill cost remains relevant for long live
+-- -- ranges.
+-- | lifetime >= 128
+-- = (spillCost / conflicts) / 10.0
+
+
+-- -- Otherwise revert to chaitin's regular cost function.
+-- | otherwise = (spillCost / conflicts)
+-- where
+-- !spillCost = fromIntegral (uses + defs) :: Float
+-- conflicts = fromIntegral (nodeDegree classOfVirtualReg graph reg)
+-- (_, defs, uses, lifetime)
+-- = fromMaybe (reg, 0, 0, 0) $ lookupUFM info reg
+
+
+-- Just spill the longest live range.
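+--
+-- For example, a vreg with a recorded lifetime of 200 gets cost 1/200, so
+-- chooseSpill (which picks the minimum-cost node) will spill it before a
+-- vreg with a lifetime of 10 (cost 1/10).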
+spillCost_length
+ :: SpillCostInfo
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+ -> Float
+
+spillCost_length info _ reg
+ | lifetime <= 1 = 1/0
+ | otherwise = 1 / fromIntegral lifetime
+ where (_, _, _, lifetime)
+ = fromMaybe (reg, 0, 0, 0)
+ $ lookupUFM info reg
+
+
+-- | Extract a map of register lifetimes from a `SpillCostInfo`.
+lifeMapFromSpillCostInfo :: SpillCostInfo -> UniqFM (VirtualReg, Int)
+lifeMapFromSpillCostInfo info
+ = listToUFM
+ $ map (\(r, _, _, life) -> (r, (r, life)))
+ $ nonDetEltsUFM info
+ -- See Note [Unique Determinism and code generation]
+
+
+-- | Determine the degree of this node, that is, the number of its
+-- neighbors that have the same register class.
+nodeDegree
+ :: (VirtualReg -> RegClass)
+ -> Graph VirtualReg RegClass RealReg
+ -> VirtualReg
+ -> Int
+
+nodeDegree classOfVirtualReg graph reg
+ | Just node <- lookupUFM (graphMap graph) reg
+
+ , virtConflicts
+ <- length
+ $ filter (\r -> classOfVirtualReg r == classOfVirtualReg reg)
+ $ nonDetEltsUniqSet
+ -- See Note [Unique Determinism and code generation]
+ $ nodeConflicts node
+
+ = virtConflicts + sizeUniqSet (nodeExclusions node)
+
+ | otherwise
+ = 0
+
+
+-- | Show a spill cost record, including the degree from the graph
+-- and final calculated spill cost.
+pprSpillCostRecord
+ :: (VirtualReg -> RegClass)
+ -> (Reg -> SDoc)
+ -> Graph VirtualReg RegClass RealReg
+ -> SpillCostRecord
+ -> SDoc
+
+pprSpillCostRecord regClass pprReg graph (reg, uses, defs, life)
+ = hsep
+ [ pprReg (RegVirtual reg)
+ , ppr uses
+ , ppr defs
+ , ppr life
+ , ppr $ nodeDegree regClass graph reg
+ , text $ show $ (fromIntegral (uses + defs)
+ / fromIntegral (nodeDegree regClass graph reg) :: Float) ]
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs
new file mode 100644
index 0000000000..05d2e814af
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/Stats.hs
@@ -0,0 +1,346 @@
+{-# LANGUAGE BangPatterns, CPP #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Carries interesting info for debugging / profiling of the
+-- graph coloring register allocator.
+module GHC.CmmToAsm.Reg.Graph.Stats (
+ RegAllocStats (..),
+
+ pprStats,
+ pprStatsSpills,
+ pprStatsLifetimes,
+ pprStatsConflict,
+ pprStatsLifeConflict,
+
+ countSRMs, addSRM
+) where
+
+import GhcPrelude
+
+import qualified GraphColor as Color
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Reg.Graph.Spill
+import GHC.CmmToAsm.Reg.Graph.SpillCost
+import GHC.CmmToAsm.Reg.Graph.TrivColorable
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+
+import Outputable
+import UniqFM
+import UniqSet
+import State
+
+-- | Holds interesting statistics from the register allocator.
+data RegAllocStats statics instr
+
+ -- Information about the initial conflict graph.
+ = RegAllocStatsStart
+ { -- | Initial code, with liveness.
+ raLiveCmm :: [LiveCmmDecl statics instr]
+
+ -- | The initial, uncolored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Information to help choose which regs to spill.
+ , raSpillCosts :: SpillCostInfo }
+
+
+ -- Information about an intermediate graph.
+ -- This is one that we couldn't color, so we had to insert spill code
+ -- into the instruction stream.
+ | RegAllocStatsSpill
+ { -- | Code we tried to allocate registers for.
+ raCode :: [LiveCmmDecl statics instr]
+
+ -- | Partially colored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | The regs that were coalesced.
+ , raCoalesced :: UniqFM VirtualReg
+
+ -- | Spiller stats.
+ , raSpillStats :: SpillStats
+
+ -- | Number of instructions each reg lives for.
+ , raSpillCosts :: SpillCostInfo
+
+ -- | Code with spill instructions added.
+ , raSpilled :: [LiveCmmDecl statics instr] }
+
+
+ -- a successful coloring
+ | RegAllocStatsColored
+ { -- | Code we tried to allocate registers for.
+ raCode :: [LiveCmmDecl statics instr]
+
+ -- | Uncolored graph.
+ , raGraph :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Coalesced and colored graph.
+ , raGraphColored :: Color.Graph VirtualReg RegClass RealReg
+
+ -- | Regs that were coalesced.
+ , raCoalesced :: UniqFM VirtualReg
+
+ -- | Code with coalescings applied.
+ , raCodeCoalesced :: [LiveCmmDecl statics instr]
+
+ -- | Code with vregs replaced by hregs.
+ , raPatched :: [LiveCmmDecl statics instr]
+
+ -- | Code with unneeded spill\/reloads cleaned out.
+ , raSpillClean :: [LiveCmmDecl statics instr]
+
+ -- | Final code.
+ , raFinal :: [NatCmmDecl statics instr]
+
+ -- | Spill\/reload\/reg-reg moves present in this code.
+ , raSRMs :: (Int, Int, Int) }
+
+
+instance (Outputable statics, Outputable instr)
+ => Outputable (RegAllocStats statics instr) where
+
+ ppr (s@RegAllocStatsStart{}) = sdocWithPlatform $ \platform ->
+ text "# Start"
+ $$ text "# Native code with liveness information."
+ $$ ppr (raLiveCmm s)
+ $$ text ""
+ $$ text "# Initial register conflict graph."
+ $$ Color.dotGraph
+ (targetRegDotColor platform)
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ (raGraph s)
+
+
+ ppr (s@RegAllocStatsSpill{}) =
+ text "# Spill"
+
+ $$ text "# Code with liveness information."
+ $$ ppr (raCode s)
+ $$ text ""
+
+ $$ (if (not $ isNullUFM $ raCoalesced s)
+ then text "# Registers coalesced."
+ $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr)
+ $$ text ""
+ else empty)
+
+ $$ text "# Spills inserted."
+ $$ ppr (raSpillStats s)
+ $$ text ""
+
+ $$ text "# Code with spills inserted."
+ $$ ppr (raSpilled s)
+
+
+ ppr (s@RegAllocStatsColored { raSRMs = (spills, reloads, moves) })
+ = sdocWithPlatform $ \platform ->
+ text "# Colored"
+
+ $$ text "# Code with liveness information."
+ $$ ppr (raCode s)
+ $$ text ""
+
+ $$ text "# Register conflict graph (colored)."
+ $$ Color.dotGraph
+ (targetRegDotColor platform)
+ (trivColorable platform
+ (targetVirtualRegSqueeze platform)
+ (targetRealRegSqueeze platform))
+ (raGraphColored s)
+ $$ text ""
+
+ $$ (if (not $ isNullUFM $ raCoalesced s)
+ then text "# Registers coalesced."
+ $$ pprUFMWithKeys (raCoalesced s) (vcat . map ppr)
+ $$ text ""
+ else empty)
+
+ $$ text "# Native code after coalescings applied."
+ $$ ppr (raCodeCoalesced s)
+ $$ text ""
+
+ $$ text "# Native code after register allocation."
+ $$ ppr (raPatched s)
+ $$ text ""
+
+ $$ text "# Clean out unneeded spill/reloads."
+ $$ ppr (raSpillClean s)
+ $$ text ""
+
+ $$ text "# Final code, after rewriting spill/rewrite pseudo instrs."
+ $$ ppr (raFinal s)
+ $$ text ""
+ $$ text "# Score:"
+ $$ (text "# spills inserted: " <> int spills)
+ $$ (text "# reloads inserted: " <> int reloads)
+ $$ (text "# reg-reg moves remaining: " <> int moves)
+ $$ text ""
+
+
+-- | Do all the different analyses on this list of RegAllocStats
+pprStats
+ :: [RegAllocStats statics instr]
+ -> Color.Graph VirtualReg RegClass RealReg
+ -> SDoc
+
+pprStats stats graph
+ = let outSpills = pprStatsSpills stats
+ outLife = pprStatsLifetimes stats
+ outConflict = pprStatsConflict stats
+ outScatter = pprStatsLifeConflict stats graph
+
+ in vcat [outSpills, outLife, outConflict, outScatter]
+
+
+-- | Dump a table of how many spill loads \/ stores were inserted in total.
+pprStatsSpills
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsSpills stats
+ = let
+ finals = [ s | s@RegAllocStatsColored{} <- stats]
+
+ -- sum up how many stores\/loads\/reg-reg-moves were left in the code
+ total = foldl' addSRM (0, 0, 0)
+ $ map raSRMs finals
+
+ in ( text "-- spills-added-total"
+ $$ text "-- (stores, loads, reg_reg_moves_remaining)"
+ $$ ppr total
+ $$ text "")
+
+
+-- | Dump a table of how long vregs tend to live for in the initial code.
+pprStatsLifetimes
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsLifetimes stats
+ = let info = foldl' plusSpillCostInfo zeroSpillCostInfo
+ [ raSpillCosts s
+ | s@RegAllocStatsStart{} <- stats ]
+
+ lifeBins = binLifetimeCount $ lifeMapFromSpillCostInfo info
+
+ in ( text "-- vreg-population-lifetimes"
+ $$ text "-- (instruction_count, number_of_vregs_that_lived_that_long)"
+ $$ pprUFM lifeBins (vcat . map ppr)
+ $$ text "\n")
+
+
+binLifetimeCount :: UniqFM (VirtualReg, Int) -> UniqFM (Int, Int)
+binLifetimeCount fm
+ = let lifes = map (\l -> (l, (l, 1)))
+ $ map snd
+ $ nonDetEltsUFM fm
+ -- See Note [Unique Determinism and code generation]
+
+ in addListToUFM_C
+ (\(l1, c1) (_, c2) -> (l1, c1 + c2))
+ emptyUFM
+ lifes
+
+
+-- | Dump a table of how many conflicts vregs tend to have in the initial code.
+pprStatsConflict
+ :: [RegAllocStats statics instr] -> SDoc
+
+pprStatsConflict stats
+ = let confMap = foldl' (plusUFM_C (\(c1, n1) (_, n2) -> (c1, n1 + n2)))
+ emptyUFM
+ $ map Color.slurpNodeConflictCount
+ [ raGraph s | s@RegAllocStatsStart{} <- stats ]
+
+ in ( text "-- vreg-conflicts"
+ $$ text "-- (conflict_count, number_of_vregs_that_had_that_many_conflicts)"
+ $$ pprUFM confMap (vcat . map ppr)
+ $$ text "\n")
+
+
+-- | For every vreg, dump how many conflicts it has, and its lifetime.
+-- Good for making a scatter plot.
+pprStatsLifeConflict
+ :: [RegAllocStats statics instr]
+ -> Color.Graph VirtualReg RegClass RealReg -- ^ global register conflict graph
+ -> SDoc
+
+pprStatsLifeConflict stats graph
+ = let lifeMap = lifeMapFromSpillCostInfo
+ $ foldl' plusSpillCostInfo zeroSpillCostInfo
+ $ [ raSpillCosts s | s@RegAllocStatsStart{} <- stats ]
+
+ scatter = map (\r -> let lifetime = case lookupUFM lifeMap r of
+ Just (_, l) -> l
+ Nothing -> 0
+ Just node = Color.lookupNode graph r
+ in parens $ hcat $ punctuate (text ", ")
+ [ doubleQuotes $ ppr $ Color.nodeId node
+ , ppr $ sizeUniqSet (Color.nodeConflicts node)
+ , ppr $ lifetime ])
+ $ map Color.nodeId
+ $ nonDetEltsUFM
+ -- See Note [Unique Determinism and code generation]
+ $ Color.graphMap graph
+
+ in ( text "-- vreg-conflict-lifetime"
+ $$ text "-- (vreg, vreg_conflicts, vreg_lifetime)"
+ $$ (vcat scatter)
+ $$ text "\n")
+
+
+-- | Count spill/reload/reg-reg moves.
+-- Lets us see how well the register allocator has done.
+countSRMs
+ :: Instruction instr
+ => LiveCmmDecl statics instr -> (Int, Int, Int)
+
+countSRMs cmm
+ = execState (mapBlockTopM countSRM_block cmm) (0, 0, 0)
+
+
+countSRM_block
+ :: Instruction instr
+ => GenBasicBlock (LiveInstr instr)
+ -> State (Int, Int, Int) (GenBasicBlock (LiveInstr instr))
+
+countSRM_block (BasicBlock i instrs)
+ = do instrs' <- mapM countSRM_instr instrs
+ return $ BasicBlock i instrs'
+
+
+countSRM_instr
+ :: Instruction instr
+ => LiveInstr instr -> State (Int, Int, Int) (LiveInstr instr)
+
+countSRM_instr li
+ | LiveInstr SPILL{} _ <- li
+ = do modify $ \(s, r, m) -> (s + 1, r, m)
+ return li
+
+ | LiveInstr RELOAD{} _ <- li
+ = do modify $ \(s, r, m) -> (s, r + 1, m)
+ return li
+
+ | LiveInstr instr _ <- li
+ , Just _ <- takeRegRegMoveInstr instr
+ = do modify $ \(s, r, m) -> (s, r, m + 1)
+ return li
+
+ | otherwise
+ = return li
+
+
+-- sigh..
+addSRM :: (Int, Int, Int) -> (Int, Int, Int) -> (Int, Int, Int)
+addSRM (s1, r1, m1) (s2, r2, m2)
+ = let !s = s1 + s2
+ !r = r1 + r2
+ !m = m1 + m2
+ in (s, r, m)
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
new file mode 100644
index 0000000000..ec7c5ad13e
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/TrivColorable.hs
@@ -0,0 +1,274 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.Reg.Graph.TrivColorable (
+ trivColorable,
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GraphBase
+
+import UniqSet
+import GHC.Platform
+import Panic
+
+-- trivColorable ---------------------------------------------------------------
+
+-- trivColorable function for the graph coloring allocator
+--
+-- This gets hammered by scanGraph during register allocation,
+-- so needs to be fairly efficient.
+--
+-- NOTE: This only works for architectures with just RcInteger and RcDouble
+-- (which are disjoint) ie. x86, x86_64 and ppc
+--
+-- The number of allocatable regs is hard coded in here so we can do
+-- a fast comparison in trivColorable.
+--
+-- It's ok if these numbers are _less_ than the actual number of free
+-- regs, but they can't be more or the register conflict
+-- graph won't color.
+--
+-- If the graph doesn't color then the allocator will panic, but it won't
+-- generate bad object code or anything nasty like that.
+--
+-- There is an allocatableRegsInClass :: RegClass -> Int, but doing
+-- the unboxing is too slow for us here.
+-- TODO: Is that still true? Could we use allocatableRegsInClass
+-- without losing performance now?
+--
+-- Look at includes/stg/MachRegs.h to get the numbers.
+--
+
+
+-- Disjoint registers ----------------------------------------------------------
+--
+-- The definition has been unfolded into individual cases for speed.
+-- Each architecture has a different register setup, so we use a
+-- different regSqueeze function for each.
+--
+accSqueeze
+ :: Int
+ -> Int
+ -> (reg -> Int)
+ -> UniqSet reg
+ -> Int
+
+accSqueeze count maxCount squeeze us = acc count (nonDetEltsUniqSet us)
+ -- See Note [Unique Determinism and code generation]
+ where acc count [] = count
+ acc count _ | count >= maxCount = count
+ acc count (r:rs) = acc (count + squeeze r) rs
+
+{- Note [accSqueeze]
+~~~~~~~~~~~~~~~~~~~~
+BL 2007/09
+Doing a nice fold over the UniqSet makes trivColorable use
+32% of total compile time and 42% of total alloc when compiling SHA1.hs from darcs.
+Therefore the UniqFM is made non-abstract and we use a custom fold.
+
+MS 2010/04
+When converting UniqFM to use Data.IntMap, the fold cannot use UniqFM internal
+representation any more. But it is imperative that accSqueeze stops
+the folding if the count gets greater than or equal to maxCount. We thus convert
+the UniqFM to a (lazy) list, do the fold and stop early if necessary, which was
+the most efficient variant tried. Benchmark compiling 10-times SHA1.hs follows.
+(original = previous implementation, folding = fold of the whole UFM,
+ lazyFold = the current implementation,
+ hackFold = using internal representation of Data.IntMap)
+
+ original folding hackFold lazyFold
+ -O -fasm (used everywhere) 31.509s 30.387s 30.791s 30.603s
+ 100.00% 96.44% 97.72% 97.12%
+ -fregs-graph 67.938s 74.875s 62.673s 64.679s
+ 100.00% 110.21% 92.25% 95.20%
+ -fregs-iterative 89.761s 143.913s 81.075s 86.912s
+ 100.00% 160.33% 90.32% 96.83%
+ -fnew-codegen 38.225s 37.142s 37.551s 37.119s
+ 100.00% 97.17% 98.24% 97.11%
+ -fnew-codegen -fregs-graph 91.786s 91.51s 87.368s 86.88s
+ 100.00% 99.70% 95.19% 94.65%
+ -fnew-codegen -fregs-iterative 206.72s 343.632s 194.694s 208.677s
+ 100.00% 166.23% 94.18% 100.95%
+-}
+
+trivColorable
+ :: Platform
+ -> (RegClass -> VirtualReg -> Int)
+ -> (RegClass -> RealReg -> Int)
+ -> Triv VirtualReg RegClass RealReg
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts exclusions
+ | let cALLOCATABLE_REGS_INTEGER
+ = (case platformArch platform of
+ ArchX86 -> 3
+ ArchX86_64 -> 5
+ ArchPPC -> 16
+ ArchSPARC -> 14
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 15
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_INTEGER
+ (virtualRegSqueeze RcInteger)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_INTEGER
+ (realRegSqueeze RcInteger)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_INTEGER
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
+ | let cALLOCATABLE_REGS_FLOAT
+ = (case platformArch platform of
+ -- On x86_64 and x86, RcFloat and RcDouble
+ -- use the same registers,
+ -- so we only use RcDouble to represent the
+ -- register allocation problem on those types.
+ ArchX86 -> 0
+ ArchX86_64 -> 0
+ ArchPPC -> 0
+ ArchSPARC -> 22
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 0
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_FLOAT
+ (virtualRegSqueeze RcFloat)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_FLOAT
+ (realRegSqueeze RcFloat)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_FLOAT
+
+trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
+ | let cALLOCATABLE_REGS_DOUBLE
+ = (case platformArch platform of
+ ArchX86 -> 8
+ -- in x86 32bit mode sse2 there are only
+ -- 8 XMM registers xmm0 ... xmm7
+ ArchX86_64 -> 10
+ -- in x86_64 there are 16 XMM registers
+ -- xmm0 .. xmm15, here 10 is a
+                 -- "don't need to solve conflicts" count that
+ -- was chosen at some point in the past.
+ ArchPPC -> 26
+ ArchSPARC -> 11
+ ArchSPARC64 -> panic "trivColorable ArchSPARC64"
+ ArchPPC_64 _ -> 20
+ ArchARM _ _ _ -> panic "trivColorable ArchARM"
+ ArchARM64 -> panic "trivColorable ArchARM64"
+ ArchAlpha -> panic "trivColorable ArchAlpha"
+ ArchMipseb -> panic "trivColorable ArchMipseb"
+ ArchMipsel -> panic "trivColorable ArchMipsel"
+ ArchS390X -> panic "trivColorable ArchS390X"
+ ArchJavaScript-> panic "trivColorable ArchJavaScript"
+ ArchUnknown -> panic "trivColorable ArchUnknown")
+ , count2 <- accSqueeze 0 cALLOCATABLE_REGS_DOUBLE
+ (virtualRegSqueeze RcDouble)
+ conflicts
+
+ , count3 <- accSqueeze count2 cALLOCATABLE_REGS_DOUBLE
+ (realRegSqueeze RcDouble)
+ exclusions
+
+ = count3 < cALLOCATABLE_REGS_DOUBLE
+
+
+
+
+-- Specification Code ----------------------------------------------------------
+--
+-- The trivColorable function for each particular architecture should
+-- implement the following function, but faster.
+--
+
+{-
+trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
+trivColorable classN conflicts exclusions
+ = let
+
+ acc :: Reg -> (Int, Int) -> (Int, Int)
+ acc r (cd, cf)
+ = case regClass r of
+ RcInteger -> (cd+1, cf)
+ RcFloat -> (cd, cf+1)
+ _ -> panic "Regs.trivColorable: reg class not handled"
+
+ tmp = nonDetFoldUFM acc (0, 0) conflicts
+ (countInt, countFloat) = nonDetFoldUFM acc tmp exclusions
+
+ squeese = worst countInt classN RcInteger
+ + worst countFloat classN RcFloat
+
+ in squeese < allocatableRegsInClass classN
+
+-- | Worst case displacement
+-- node N of classN has n neighbors of class C.
+--
+-- We currently only have RcInteger and RcDouble, which don't conflict at all.
+-- This is a bit boring compared to what's in RegArchX86.
+--
+worst :: Int -> RegClass -> RegClass -> Int
+worst n classN classC
+ = case classN of
+ RcInteger
+ -> case classC of
+ RcInteger -> min n (allocatableRegsInClass RcInteger)
+ RcFloat -> 0
+
+ RcDouble
+ -> case classC of
+ RcFloat -> min n (allocatableRegsInClass RcFloat)
+ RcInteger -> 0
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: [RegNo]
+allocatableRegs
+ = let isFree i = freeReg i
+ in filter isFree allMachRegNos
+
+
+-- | The number of regs in each class.
+-- We go via top level CAFs to ensure that we're not recomputing
+-- the length of these lists each time the fn is called.
+allocatableRegsInClass :: RegClass -> Int
+allocatableRegsInClass cls
+ = case cls of
+ RcInteger -> allocatableRegsInteger
+        RcFloat         -> allocatableRegsFloat
+
+allocatableRegsInteger :: Int
+allocatableRegsInteger
+ = length $ filter (\r -> regClass r == RcInteger)
+ $ map RealReg allocatableRegs
+
+allocatableRegsFloat :: Int
+allocatableRegsFloat
+        = length $ filter (\r -> regClass r == RcFloat)
+ $ map RealReg allocatableRegs
+-}
diff --git a/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs
new file mode 100644
index 0000000000..0d4c56ba21
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Graph/X86.hs
@@ -0,0 +1,161 @@
+
+-- | A description of the register set of the X86.
+--
+-- This isn't used directly in GHC proper.
+--
+-- See GHC.CmmToAsm.Reg.Graph.Base for the reference specification.
+-- See GHC.CmmToAsm.Reg.Graph.TrivColorable for the trivColorable function
+-- actually used in GHC.
+--
+module GHC.CmmToAsm.Reg.Graph.X86 (
+ classOfReg,
+ regsOfClass,
+ regName,
+ regAlias,
+ worst,
+ squeese,
+) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Graph.Base (Reg(..), RegSub(..), RegClass(..))
+import UniqSet
+
+import qualified Data.Array as A
+
+
+-- | Determine the class of a register
+classOfReg :: Reg -> RegClass
+classOfReg reg
+ = case reg of
+ Reg c _ -> c
+
+ RegSub SubL16 _ -> ClassG16
+ RegSub SubL8 _ -> ClassG8
+ RegSub SubL8H _ -> ClassG8
+
+
+-- | Determine all the regs that make up a certain class.
+regsOfClass :: RegClass -> UniqSet Reg
+regsOfClass c
+ = case c of
+ ClassG32
+ -> mkUniqSet [ Reg ClassG32 i
+ | i <- [0..7] ]
+
+ ClassG16
+ -> mkUniqSet [ RegSub SubL16 (Reg ClassG32 i)
+ | i <- [0..7] ]
+
+ ClassG8
+ -> unionUniqSets
+ (mkUniqSet [ RegSub SubL8 (Reg ClassG32 i) | i <- [0..3] ])
+ (mkUniqSet [ RegSub SubL8H (Reg ClassG32 i) | i <- [0..3] ])
+
+ ClassF64
+ -> mkUniqSet [ Reg ClassF64 i
+ | i <- [0..5] ]
+
+
+-- | Determine the common name of a reg
+-- returns Nothing if this reg is not part of the machine.
+regName :: Reg -> Maybe String
+regName reg
+ = case reg of
+ Reg ClassG32 i
+ | i <= 7 ->
+ let names = A.listArray (0,8)
+ [ "eax", "ebx", "ecx", "edx"
+ , "ebp", "esi", "edi", "esp" ]
+ in Just $ names A.! i
+
+ RegSub SubL16 (Reg ClassG32 i)
+ | i <= 7 ->
+ let names = A.listArray (0,8)
+ [ "ax", "bx", "cx", "dx"
+ , "bp", "si", "di", "sp"]
+ in Just $ names A.! i
+
+ RegSub SubL8 (Reg ClassG32 i)
+ | i <= 3 ->
+ let names = A.listArray (0,4) [ "al", "bl", "cl", "dl"]
+ in Just $ names A.! i
+
+ RegSub SubL8H (Reg ClassG32 i)
+ | i <= 3 ->
+ let names = A.listArray (0,4) [ "ah", "bh", "ch", "dh"]
+ in Just $ names A.! i
+
+ _ -> Nothing
+
+
+-- | Which regs alias what other regs.
+regAlias :: Reg -> UniqSet Reg
+regAlias reg
+ = case reg of
+
+ -- 32 bit regs alias all of the subregs
+ Reg ClassG32 i
+
+          -- for eax, ebx, ecx, edx
+ | i <= 3
+ -> mkUniqSet
+ $ [ Reg ClassG32 i, RegSub SubL16 reg
+ , RegSub SubL8 reg, RegSub SubL8H reg ]
+
+ -- for esi, edi, esp, ebp
+ | 4 <= i && i <= 7
+ -> mkUniqSet
+ $ [ Reg ClassG32 i, RegSub SubL16 reg ]
+
+ -- 16 bit subregs alias the whole reg
+ RegSub SubL16 r@(Reg ClassG32 _)
+ -> regAlias r
+
+ -- 8 bit subregs alias the 32 and 16, but not the other 8 bit subreg
+ RegSub SubL8 r@(Reg ClassG32 _)
+ -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8 r ]
+
+ RegSub SubL8H r@(Reg ClassG32 _)
+ -> mkUniqSet $ [ r, RegSub SubL16 r, RegSub SubL8H r ]
+
+ -- fp
+ Reg ClassF64 _
+ -> unitUniqSet reg
+
+ _ -> error "regAlias: invalid register"
+
+
+-- | Optimised versions of GHC.CmmToAsm.Reg.Graph.Base.{worst, squeese} specific to x86
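+--
+-- As an illustration (not from the original source): a ClassG8 node with
+-- three neighbours of class ClassG32 can be denied at most
+-- min (3*2) 8 = 6 of its eight byte registers, because each 32-bit
+-- register aliases two 8-bit registers; this is the ClassG8/ClassG32 case
+-- in 'worst' below.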
+worst :: Int -> RegClass -> RegClass -> Int
+worst n classN classC
+ = case classN of
+ ClassG32
+ -> case classC of
+ ClassG32 -> min n 8
+ ClassG16 -> min n 8
+ ClassG8 -> min n 4
+ ClassF64 -> 0
+
+ ClassG16
+ -> case classC of
+ ClassG32 -> min n 8
+ ClassG16 -> min n 8
+ ClassG8 -> min n 4
+ ClassF64 -> 0
+
+ ClassG8
+ -> case classC of
+ ClassG32 -> min (n*2) 8
+ ClassG16 -> min (n*2) 8
+ ClassG8 -> min n 8
+ ClassF64 -> 0
+
+ ClassF64
+ -> case classC of
+ ClassF64 -> min n 6
+ _ -> 0
+
+squeese :: RegClass -> [(Int, RegClass)] -> Int
+squeese classN countCs
+ = sum (map (\(i, classC) -> worst i classN classC) countCs)
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
new file mode 100644
index 0000000000..9b263889d8
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -0,0 +1,920 @@
+{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- The register allocator
+--
+-- (c) The University of Glasgow 2004
+--
+-----------------------------------------------------------------------------
+
+{-
+The algorithm is roughly:
+
+ 1) Compute strongly connected components of the basic block list.
+
+ 2) Compute liveness (mapping from pseudo register to
+ point(s) of death?).
+
+ 3) Walk instructions in each basic block. We keep track of
+ (a) Free real registers (a bitmap?)
+ (b) Current assignment of temporaries to machine registers and/or
+ spill slots (call this the "assignment").
+ (c) Partial mapping from basic block ids to a virt-to-loc mapping.
+ When we first encounter a branch to a basic block,
+ we fill in its entry in this table with the current mapping.
+
+ For each instruction:
+ (a) For each temporary *read* by the instruction:
+ If the temporary does not have a real register allocation:
+ - Allocate a real register from the free list. If
+ the list is empty:
+ - Find a temporary to spill. Pick one that is
+ not used in this instruction (ToDo: not
+ used for a while...)
+ - generate a spill instruction
+ - If the temporary was previously spilled,
+ generate an instruction to read the temp from its spill loc.
+ (optimisation: if we can see that a real register is going to
+ be used soon, then don't use it for allocation).
+
+ (b) For each real register clobbered by this instruction:
+ If a temporary resides in it,
+ If the temporary is live after this instruction,
+ Move the temporary to another (non-clobbered & free) reg,
+ or spill it to memory. Mark the temporary as residing
+ in both memory and a register if it was spilled (it might
+ need to be read by this instruction).
+
+ (ToDo: this is wrong for jump instructions?)
+
+ We do this after step (a), because if we start with
+ movq v1, %rsi
+ which is an instruction that clobbers %rsi, if v1 currently resides
+ in %rsi we want to get
+ movq %rsi, %freereg
+ movq %rsi, %rsi -- will disappear
+ instead of
+ movq %rsi, %freereg
+ movq %freereg, %rsi
+
+ (c) Update the current assignment
+
+ (d) If the instruction is a branch:
+ if the destination block already has a register assignment,
+ Generate a new block with fixup code and redirect the
+ jump to the new block.
+ else,
+ Update the block id->assignment mapping with the current
+ assignment.
+
+ (e) Delete all register assignments for temps which are read
+ (only) and die here. Update the free register list.
+
+ (f) Mark all registers clobbered by this instruction as not free,
+ and mark temporaries which have been spilled due to clobbering
+      as in memory (step (a) marks them as in both mem & reg).
+
+ (g) For each temporary *written* by this instruction:
+ Allocate a real register as for (b), spilling something
+ else if necessary.
+ - except when updating the assignment, drop any memory
+ locations that the temporary was previously in, since
+ they will be no longer valid after this instruction.
+
+ (h) Delete all register assignments for temps which are
+ written and die here (there should rarely be any). Update
+ the free register list.
+
+ (i) Rewrite the instruction with the new mapping.
+
+ (j) For each spilled reg known to be now dead, re-add its stack slot
+ to the free list.
+
+-}
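+
+-- A much-simplified sketch of the per-block walk described in step 3 above.
+-- This is illustrative only and not part of this module; the real allocator
+-- threads the corresponding state through the RegM monad and also collects
+-- fixup blocks and spill statistics along the way.
+{-
+import Data.List (mapAccumL)
+
+-- 's' stands for the state of steps 3(a) and 3(b): the free real registers
+-- together with the current assignment of temporaries to registers and
+-- spill slots. 'step' performs the per-instruction work of steps (a)..(j).
+allocBlock :: (s -> instr -> (s, instr)) -> s -> [instr] -> (s, [instr])
+allocBlock step = mapAccumL step
+-}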
+
+module GHC.CmmToAsm.Reg.Linear (
+ regAlloc,
+ module GHC.CmmToAsm.Reg.Linear.Base,
+ module GHC.CmmToAsm.Reg.Linear.Stats
+ ) where
+
+#include "HsVersions.h"
+
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.State
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Linear.FreeRegs
+import GHC.CmmToAsm.Reg.Linear.Stats
+import GHC.CmmToAsm.Reg.Linear.JoinToTargets
+import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+import GHC.CmmToAsm.Reg.Target
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm hiding (RegSet)
+
+import Digraph
+import GHC.Driver.Session
+import Unique
+import UniqSet
+import UniqFM
+import UniqSupply
+import Outputable
+import GHC.Platform
+
+import Data.Maybe
+import Data.List
+import Control.Monad
+
+-- -----------------------------------------------------------------------------
+-- Top level of the register allocator
+
+-- Allocate registers
+regAlloc
+ :: (Outputable instr, Instruction instr)
+ => DynFlags
+ -> LiveCmmDecl statics instr
+ -> UniqSM ( NatCmmDecl statics instr
+ , Maybe Int -- number of extra stack slots required,
+ -- beyond maxSpillSlots
+ , Maybe RegAllocStats
+ )
+
+regAlloc _ (CmmData sec d)
+ = return
+ ( CmmData sec d
+ , Nothing
+ , Nothing )
+
+regAlloc _ (CmmProc (LiveInfo info _ _ _) lbl live [])
+ = return ( CmmProc info lbl live (ListGraph [])
+ , Nothing
+ , Nothing )
+
+regAlloc dflags (CmmProc static lbl live sccs)
+ | LiveInfo info entry_ids@(first_id:_) block_live _ <- static
+ = do
+ -- do register allocation on each component.
+ (final_blocks, stats, stack_use)
+ <- linearRegAlloc dflags entry_ids block_live sccs
+
+ -- make sure the block that was first in the input list
+ -- stays at the front of the output
+ let ((first':_), rest')
+ = partition ((== first_id) . blockId) final_blocks
+
+ let max_spill_slots = maxSpillSlots dflags
+ extra_stack
+ | stack_use > max_spill_slots
+ = Just (stack_use - max_spill_slots)
+ | otherwise
+ = Nothing
+
+ return ( CmmProc info lbl live (ListGraph (first' : rest'))
+ , extra_stack
+ , Just stats)
+
+-- bogus. to make non-exhaustive match warning go away.
+regAlloc _ (CmmProc _ _ _ _)
+ = panic "RegAllocLinear.regAlloc: no match"
+
+
+-- -----------------------------------------------------------------------------
+-- Linear sweep to allocate registers
+
+
+-- | Do register allocation on some basic blocks.
+-- But be careful to allocate a block in an SCC only if it has
+-- an entry in the block map or it is the first block.
+--
+linearRegAlloc
+ :: (Outputable instr, Instruction instr)
+ => DynFlags
+ -> [BlockId] -- ^ entry points
+ -> BlockMap RegSet
+ -- ^ live regs on entry to each basic block
+ -> [SCC (LiveBasicBlock instr)]
+ -- ^ instructions annotated with "deaths"
+ -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
+
+linearRegAlloc dflags entry_ids block_live sccs
+ = case platformArch platform of
+ ArchX86 -> go $ (frInitFreeRegs platform :: X86.FreeRegs)
+ ArchX86_64 -> go $ (frInitFreeRegs platform :: X86_64.FreeRegs)
+ ArchS390X -> panic "linearRegAlloc ArchS390X"
+ ArchSPARC -> go $ (frInitFreeRegs platform :: SPARC.FreeRegs)
+ ArchSPARC64 -> panic "linearRegAlloc ArchSPARC64"
+ ArchPPC -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
+ ArchARM _ _ _ -> panic "linearRegAlloc ArchARM"
+ ArchARM64 -> panic "linearRegAlloc ArchARM64"
+ ArchPPC_64 _ -> go $ (frInitFreeRegs platform :: PPC.FreeRegs)
+ ArchAlpha -> panic "linearRegAlloc ArchAlpha"
+ ArchMipseb -> panic "linearRegAlloc ArchMipseb"
+ ArchMipsel -> panic "linearRegAlloc ArchMipsel"
+ ArchJavaScript -> panic "linearRegAlloc ArchJavaScript"
+ ArchUnknown -> panic "linearRegAlloc ArchUnknown"
+ where
+ go f = linearRegAlloc' dflags f entry_ids block_live sccs
+ platform = targetPlatform dflags
+
+linearRegAlloc'
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => DynFlags
+ -> freeRegs
+ -> [BlockId] -- ^ entry points
+ -> BlockMap RegSet -- ^ live regs on entry to each basic block
+ -> [SCC (LiveBasicBlock instr)] -- ^ instructions annotated with "deaths"
+ -> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
+
+linearRegAlloc' dflags initFreeRegs entry_ids block_live sccs
+ = do us <- getUniqueSupplyM
+ let (_, stack, stats, blocks) =
+ runR dflags mapEmpty initFreeRegs emptyRegMap (emptyStackMap dflags) us
+ $ linearRA_SCCs entry_ids block_live [] sccs
+ return (blocks, stats, getStackUse stack)
+
+
+linearRA_SCCs :: (FR freeRegs, Instruction instr, Outputable instr)
+ => [BlockId]
+ -> BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> [SCC (LiveBasicBlock instr)]
+ -> RegM freeRegs [NatBasicBlock instr]
+
+linearRA_SCCs _ _ blocksAcc []
+ = return $ reverse blocksAcc
+
+linearRA_SCCs entry_ids block_live blocksAcc (AcyclicSCC block : sccs)
+ = do blocks' <- processBlock block_live block
+ linearRA_SCCs entry_ids block_live
+ ((reverse blocks') ++ blocksAcc)
+ sccs
+
+linearRA_SCCs entry_ids block_live blocksAcc (CyclicSCC blocks : sccs)
+ = do
+ blockss' <- process entry_ids block_live blocks [] (return []) False
+ linearRA_SCCs entry_ids block_live
+ (reverse (concat blockss') ++ blocksAcc)
+ sccs
+
+{- from John Dias's patch 2008/10/16:
+ The linear-scan allocator sometimes allocates a block
+ before allocating one of its predecessors, which could lead to
+ inconsistent allocations. Make it so a block is only allocated
+ if a predecessor has set the "incoming" assignments for the block, or
+ if it's the procedure's entry block.
+
+ BL 2009/02: Careful. If the assignment for a block doesn't get set for
+ some reason then this function will loop. We should probably do some
+ more sanity checking to guard against this eventuality.
+-}
+
+process :: (FR freeRegs, Instruction instr, Outputable instr)
+ => [BlockId]
+ -> BlockMap RegSet
+ -> [GenBasicBlock (LiveInstr instr)]
+ -> [GenBasicBlock (LiveInstr instr)]
+ -> [[NatBasicBlock instr]]
+ -> Bool
+ -> RegM freeRegs [[NatBasicBlock instr]]
+
+process _ _ [] [] accum _
+ = return $ reverse accum
+
+process entry_ids block_live [] next_round accum madeProgress
+ | not madeProgress
+
+   {- BUGS: There are so many unreachable blocks in the code that the warnings are overwhelming.
+ pprTrace "RegAlloc.Linear.Main.process: no progress made, bailing out."
+ ( text "Unreachable blocks:"
+ $$ vcat (map ppr next_round)) -}
+ = return $ reverse accum
+
+ | otherwise
+ = process entry_ids block_live
+ next_round [] accum False
+
+process entry_ids block_live (b@(BasicBlock id _) : blocks)
+ next_round accum madeProgress
+ = do
+ block_assig <- getBlockAssigR
+
+ if isJust (mapLookup id block_assig)
+ || id `elem` entry_ids
+ then do
+ b' <- processBlock block_live b
+ process entry_ids block_live blocks
+ next_round (b' : accum) True
+
+ else process entry_ids block_live blocks
+ (b : next_round) accum madeProgress
+
+
+-- | Do register allocation on this basic block
+--
+processBlock
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => BlockMap RegSet -- ^ live regs on entry to each basic block
+ -> LiveBasicBlock instr -- ^ block to do register allocation on
+ -> RegM freeRegs [NatBasicBlock instr] -- ^ block with registers allocated
+
+processBlock block_live (BasicBlock id instrs)
+ = do initBlock id block_live
+ (instrs', fixups)
+ <- linearRA block_live [] [] id instrs
+ return $ BasicBlock id instrs' : fixups
+
+
+-- | Load the freeregs and current reg assignment into the RegM state
+-- for the basic block with this BlockId.
+initBlock :: FR freeRegs
+ => BlockId -> BlockMap RegSet -> RegM freeRegs ()
+initBlock id block_live
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ block_assig <- getBlockAssigR
+ case mapLookup id block_assig of
+ -- no prior info about this block: we must consider
+ -- any fixed regs to be allocated, but we can ignore
+ -- virtual regs (presumably this is part of a loop,
+ -- and we'll iterate again). The assignment begins
+ -- empty.
+ Nothing
+ -> do -- pprTrace "initFreeRegs" (text $ show initFreeRegs) (return ())
+ case mapLookup id block_live of
+ Nothing ->
+ setFreeRegsR (frInitFreeRegs platform)
+ Just live ->
+ setFreeRegsR $ foldl' (flip $ frAllocateReg platform) (frInitFreeRegs platform)
+ [ r | RegReal r <- nonDetEltsUniqSet live ]
+ -- See Note [Unique Determinism and code generation]
+ setAssigR emptyRegMap
+
+ -- load info about register assignments leading into this block.
+ Just (freeregs, assig)
+ -> do setFreeRegsR freeregs
+ setAssigR assig
+
+
+-- | Do allocation for a sequence of instructions.
+linearRA
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => BlockMap RegSet -- ^ map of what vregs are live on entry to each block.
+ -> [instr] -- ^ accumulator for instructions already processed.
+ -> [NatBasicBlock instr] -- ^ accumulator for blocks of fixup code.
+ -> BlockId -- ^ id of the current block, for debugging.
+ -> [LiveInstr instr] -- ^ liveness annotated instructions in this block.
+
+ -> RegM freeRegs
+ ( [instr] -- instructions after register allocation
+ , [NatBasicBlock instr]) -- fresh blocks of fixup code.
+
+
+linearRA _ accInstr accFixup _ []
+ = return
+ ( reverse accInstr -- instrs need to be returned in the correct order.
+ , accFixup) -- it doesn't matter what order the fixup blocks are returned in.
+
+
+linearRA block_live accInstr accFixups id (instr:instrs)
+ = do
+ (accInstr', new_fixups) <- raInsn block_live accInstr id instr
+
+ linearRA block_live accInstr' (new_fixups ++ accFixups) id instrs
+
+
+-- | Do allocation for a single instruction.
+raInsn
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+        => BlockMap RegSet      -- ^ map of what vregs are live on entry to each block.
+ -> [instr] -- ^ accumulator for instructions already processed.
+ -> BlockId -- ^ the id of the current block, for debugging
+ -> LiveInstr instr -- ^ the instr to have its regs allocated, with liveness info.
+ -> RegM freeRegs
+ ( [instr] -- new instructions
+ , [NatBasicBlock instr]) -- extra fixup blocks
+
+raInsn _ new_instrs _ (LiveInstr ii Nothing)
+ | Just n <- takeDeltaInstr ii
+ = do setDeltaR n
+ return (new_instrs, [])
+
+raInsn _ new_instrs _ (LiveInstr ii@(Instr i) Nothing)
+ | isMetaInstr ii
+ = return (i : new_instrs, [])
+
+
+raInsn block_live new_instrs id (LiveInstr (Instr instr) (Just live))
+ = do
+ assig <- getAssigR
+
+ -- If we have a reg->reg move between virtual registers, where the
+ -- src register is not live after this instruction, and the dst
+ -- register does not already have an assignment,
+ -- and the source register is assigned to a register, not to a spill slot,
+ -- then we can eliminate the instruction.
+ -- (we can't eliminate it if the source register is on the stack, because
+ -- we do not want to use one spill slot for different virtual registers)
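+        --
+        -- For example (illustrative): for a move  v1 -> v2  where v1 dies at
+        -- this instruction, v2 has no assignment yet, and v1 currently lives
+        -- in a real register, we just remap v2 to v1's location in the
+        -- assignment and emit no code for the move at all.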
+ case takeRegRegMoveInstr instr of
+ Just (src,dst) | src `elementOfUniqSet` (liveDieRead live),
+ isVirtualReg dst,
+ not (dst `elemUFM` assig),
+ isRealReg src || isInReg src assig -> do
+ case src of
+ (RegReal rr) -> setAssigR (addToUFM assig dst (InReg rr))
+ -- if src is a fixed reg, then we just map dest to this
+ -- reg in the assignment. src must be an allocatable reg,
+ -- otherwise it wouldn't be in r_dying.
+ _virt -> case lookupUFM assig src of
+ Nothing -> panic "raInsn"
+ Just loc ->
+ setAssigR (addToUFM (delFromUFM assig src) dst loc)
+
+ -- we have eliminated this instruction
+ {-
+ freeregs <- getFreeRegsR
+ assig <- getAssigR
+ pprTrace "raInsn" (text "ELIMINATED: " <> docToSDoc (pprInstr instr)
+ $$ ppr r_dying <+> ppr w_dying $$ text (show freeregs) $$ ppr assig) $ do
+ -}
+ return (new_instrs, [])
+
+ _ -> genRaInsn block_live new_instrs id instr
+ (nonDetEltsUniqSet $ liveDieRead live)
+ (nonDetEltsUniqSet $ liveDieWrite live)
+ -- See Note [Unique Determinism and code generation]
+
+raInsn _ _ _ instr
+ = pprPanic "raInsn" (text "no match for:" <> ppr instr)
+
+-- ToDo: what can we do about
+--
+-- R1 = x
+-- jump I64[x] // [R1]
+--
+-- where x is mapped to the same reg as R1. We want to coalesce x and
+-- R1, but the register allocator doesn't know whether x will be
+-- assigned to again later, in which case x and R1 should be in
+-- different registers. Right now we assume the worst, and the
+-- assignment to R1 will clobber x, so we'll spill x into another reg,
+-- generating another reg->reg move.
+
+
+isInReg :: Reg -> RegMap Loc -> Bool
+isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
+ | otherwise = False
+
+
+genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet
+ -> [instr]
+ -> BlockId
+ -> instr
+ -> [Reg]
+ -> [Reg]
+ -> RegM freeRegs ([instr], [NatBasicBlock instr])
+
+genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case regUsageOfInstr platform instr of { RU read written ->
+ do
+ let real_written = [ rr | (RegReal rr) <- written ]
+ let virt_written = [ vr | (RegVirtual vr) <- written ]
+
+ -- we don't need to do anything with real registers that are
+ -- only read by this instr. (the list is typically ~2 elements,
+ -- so using nub isn't a problem).
+ let virt_read = nub [ vr | (RegVirtual vr) <- read ]
+
+ -- debugging
+{- freeregs <- getFreeRegsR
+ assig <- getAssigR
+ pprDebugAndThen (defaultDynFlags Settings{ sTargetPlatform=platform } undefined) trace "genRaInsn"
+ (ppr instr
+ $$ text "r_dying = " <+> ppr r_dying
+ $$ text "w_dying = " <+> ppr w_dying
+ $$ text "virt_read = " <+> ppr virt_read
+ $$ text "virt_written = " <+> ppr virt_written
+ $$ text "freeregs = " <+> text (show freeregs)
+ $$ text "assig = " <+> ppr assig)
+ $ do
+-}
+
+ -- (a), (b) allocate real regs for all regs read by this instruction.
+ (r_spills, r_allocd) <-
+ allocateRegsAndSpill True{-reading-} virt_read [] [] virt_read
+
+ -- (c) save any temporaries which will be clobbered by this instruction
+ clobber_saves <- saveClobberedTemps real_written r_dying
+
+ -- (d) Update block map for new destinations
+ -- NB. do this before removing dead regs from the assignment, because
+ -- these dead regs might in fact be live in the jump targets (they're
+ -- only dead in the code that follows in the current basic block).
+ (fixup_blocks, adjusted_instr)
+ <- joinToTargets block_live block_id instr
+
+ -- Debugging - show places where the reg alloc inserted
+ -- assignment fixup blocks.
+ -- when (not $ null fixup_blocks) $
+ -- pprTrace "fixup_blocks" (ppr fixup_blocks) (return ())
+
+ -- (e) Delete all register assignments for temps which are read
+ -- (only) and die here. Update the free register list.
+ releaseRegs r_dying
+
+ -- (f) Mark regs which are clobbered as unallocatable
+ clobberRegs real_written
+
+ -- (g) Allocate registers for temporaries *written* (only)
+ (w_spills, w_allocd) <-
+ allocateRegsAndSpill False{-writing-} virt_written [] [] virt_written
+
+ -- (h) Release registers for temps which are written here and not
+ -- used again.
+ releaseRegs w_dying
+
+ let
+ -- (i) Patch the instruction
+ patch_map
+ = listToUFM
+ [ (t, RegReal r)
+ | (t, r) <- zip virt_read r_allocd
+ ++ zip virt_written w_allocd ]
+
+ patched_instr
+ = patchRegsOfInstr adjusted_instr patchLookup
+
+ patchLookup x
+ = case lookupUFM patch_map x of
+ Nothing -> x
+ Just y -> y
+
+
+ -- (j) free up stack slots for dead spilled regs
+ -- TODO (can't be bothered right now)
+
+ -- erase reg->reg moves where the source and destination are the same.
+ -- If the src temp didn't die in this instr but happened to be allocated
+ -- to the same real reg as the destination, then we can erase the move anyway.
+ let squashed_instr = case takeRegRegMoveInstr patched_instr of
+ Just (src, dst)
+ | src == dst -> []
+ _ -> [patched_instr]
+
+ let code = squashed_instr ++ w_spills ++ reverse r_spills
+ ++ clobber_saves ++ new_instrs
+
+-- pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
+--             pprTrace "patched-fixup" ((ppr fixup_blocks)) $ do
+
+ return (code, fixup_blocks)
+
+ }
+
+-- -----------------------------------------------------------------------------
+-- releaseRegs
+
+releaseRegs :: FR freeRegs => [Reg] -> RegM freeRegs ()
+releaseRegs regs = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ assig <- getAssigR
+ free <- getFreeRegsR
+ let loop assig !free [] = do setAssigR assig; setFreeRegsR free; return ()
+ loop assig !free (RegReal rr : rs) = loop assig (frReleaseReg platform rr free) rs
+ loop assig !free (r:rs) =
+ case lookupUFM assig r of
+ Just (InBoth real _) -> loop (delFromUFM assig r)
+ (frReleaseReg platform real free) rs
+ Just (InReg real) -> loop (delFromUFM assig r)
+ (frReleaseReg platform real free) rs
+ _ -> loop (delFromUFM assig r) free rs
+ loop assig free regs
+
+
+-- -----------------------------------------------------------------------------
+-- Clobber real registers
+
+-- For each temp in a register that is going to be clobbered:
+-- - if the temp dies after this instruction, do nothing
+-- - otherwise, put it somewhere safe (another reg if possible,
+-- otherwise spill and record InBoth in the assignment).
+-- - for allocateRegs on the temps *read*,
+-- - clobbered regs are allocatable.
+--
+-- for allocateRegs on the temps *written*,
+-- - clobbered regs are not allocatable.
+--
+
+saveClobberedTemps
+ :: (Instruction instr, FR freeRegs)
+ => [RealReg] -- real registers clobbered by this instruction
+ -> [Reg] -- registers which are no longer live after this insn
+ -> RegM freeRegs [instr] -- return: instructions to spill any temps that will
+ -- be clobbered.
+
+saveClobberedTemps [] _
+ = return []
+
+saveClobberedTemps clobbered dying
+ = do
+ assig <- getAssigR
+ let to_spill
+ = [ (temp,reg)
+ | (temp, InReg reg) <- nonDetUFMToList assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , any (realRegsAlias reg) clobbered
+ , temp `notElem` map getUnique dying ]
+
+ (instrs,assig') <- clobber assig [] to_spill
+ setAssigR assig'
+ return instrs
+
+ where
+ clobber assig instrs []
+ = return (instrs, assig)
+
+ clobber assig instrs ((temp, reg) : rest)
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ freeRegs <- getFreeRegsR
+ let regclass = targetClassOfRealReg platform reg
+ freeRegs_thisClass = frGetFreeRegs platform regclass freeRegs
+
+ case filter (`notElem` clobbered) freeRegs_thisClass of
+
+ -- (1) we have a free reg of the right class that isn't
+ -- clobbered by this instruction; use it to save the
+ -- clobbered value.
+ (my_reg : _) -> do
+ setFreeRegsR (frAllocateReg platform my_reg freeRegs)
+
+ let new_assign = addToUFM assig temp (InReg my_reg)
+ let instr = mkRegRegMoveInstr platform
+ (RegReal reg) (RegReal my_reg)
+
+ clobber new_assign (instr : instrs) rest
+
+ -- (2) no free registers: spill the value
+ [] -> do
+ (spill, slot) <- spillR (RegReal reg) temp
+
+ -- record why this reg was spilled for profiling
+ recordSpill (SpillClobber temp)
+
+ let new_assign = addToUFM assig temp (InBoth reg slot)
+
+ clobber new_assign (spill : instrs) rest
+
+
+
+-- | Mark all these real regs as allocated,
+-- and kick out their vreg assignments.
+--
+clobberRegs :: FR freeRegs => [RealReg] -> RegM freeRegs ()
+clobberRegs []
+ = return ()
+
+clobberRegs clobbered
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ freeregs <- getFreeRegsR
+ setFreeRegsR $! foldl' (flip $ frAllocateReg platform) freeregs clobbered
+
+ assig <- getAssigR
+ setAssigR $! clobber assig (nonDetUFMToList assig)
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+
+ where
+ -- if the temp was InReg and clobbered, then we will have
+ -- saved it in saveClobberedTemps above. So the only case
+ -- we have to worry about here is InBoth. Note that this
+ -- also catches temps which were loaded up during allocation
+ -- of read registers, not just those saved in saveClobberedTemps.
+
+ clobber assig []
+ = assig
+
+ clobber assig ((temp, InBoth reg slot) : rest)
+ | any (realRegsAlias reg) clobbered
+ = clobber (addToUFM assig temp (InMem slot)) rest
+
+ clobber assig (_:rest)
+ = clobber assig rest
+
+-- -----------------------------------------------------------------------------
+-- allocateRegsAndSpill
+
+-- Why are we performing a spill?
+data SpillLoc = ReadMem StackSlot -- reading from register only in memory
+ | WriteNew -- writing to a new variable
+ | WriteMem -- writing to register only in memory
+-- Note that ReadNew is not valid, since you don't want to be reading
+-- from an uninitialized register. We also don't need the location of
+-- the register in memory, since that will be invalidated by the write.
+-- Technically, we could coalesce WriteNew and WriteMem into a single
+-- entry as well. -- EZY
+
+-- This function does several things:
+-- For each temporary referred to by this instruction,
+-- we allocate a real register (spilling another temporary if necessary).
+-- We load the temporary up from memory if necessary.
+-- We also update the register assignment in the process, and
+-- the list of free registers and free stack slots.
+
+allocateRegsAndSpill
+ :: (FR freeRegs, Outputable instr, Instruction instr)
+ => Bool -- True <=> reading (load up spilled regs)
+ -> [VirtualReg] -- don't push these out
+ -> [instr] -- spill insns
+ -> [RealReg] -- real registers allocated (accum.)
+ -> [VirtualReg] -- temps to allocate
+ -> RegM freeRegs ( [instr] , [RealReg])
+
+allocateRegsAndSpill _ _ spills alloc []
+ = return (spills, reverse alloc)
+
+allocateRegsAndSpill reading keep spills alloc (r:rs)
+ = do assig <- getAssigR
+ let doSpill = allocRegsAndSpill_spill reading keep spills alloc r rs assig
+ case lookupUFM assig r of
+ -- case (1a): already in a register
+ Just (InReg my_reg) ->
+ allocateRegsAndSpill reading keep spills (my_reg:alloc) rs
+
+ -- case (1b): already in a register (and memory)
+ -- NB1. if we're writing this register, update its assignment to be
+ -- InReg, because the memory value is no longer valid.
+ -- NB2. This is why we must process written registers here, even if they
+ -- are also read by the same instruction.
+ Just (InBoth my_reg _)
+ -> do when (not reading) (setAssigR (addToUFM assig r (InReg my_reg)))
+ allocateRegsAndSpill reading keep spills (my_reg:alloc) rs
+
+ -- Not already in a register, so we need to find a free one...
+ Just (InMem slot) | reading -> doSpill (ReadMem slot)
+ | otherwise -> doSpill WriteMem
+ Nothing | reading ->
+ pprPanic "allocateRegsAndSpill: Cannot read from uninitialized register" (ppr r)
+ -- NOTE: if the input to the NCG contains some
+ -- unreachable blocks with junk code, this panic
+ -- might be triggered. Make sure you only feed
+ -- sensible code into the NCG. In GHC.Cmm.Pipeline we
+ -- call removeUnreachableBlocks at the end for this
+ -- reason.
+
+ | otherwise -> doSpill WriteNew
+
+
+-- reading is redundant with reason, but we keep it around because it's
+-- convenient and it maintains the recursive structure of the allocator. -- EZY
+allocRegsAndSpill_spill :: (FR freeRegs, Instruction instr, Outputable instr)
+ => Bool
+ -> [VirtualReg]
+ -> [instr]
+ -> [RealReg]
+ -> VirtualReg
+ -> [VirtualReg]
+ -> UniqFM Loc
+ -> SpillLoc
+ -> RegM freeRegs ([instr], [RealReg])
+allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ freeRegs <- getFreeRegsR
+ let freeRegs_thisClass = frGetFreeRegs platform (classOfVirtualReg r) freeRegs
+
+ case freeRegs_thisClass of
+
+ -- case (2): we have a free register
+ (my_reg : _) ->
+ do spills' <- loadTemp r spill_loc my_reg spills
+
+ setAssigR (addToUFM assig r $! newLocation spill_loc my_reg)
+ setFreeRegsR $ frAllocateReg platform my_reg freeRegs
+
+ allocateRegsAndSpill reading keep spills' (my_reg : alloc) rs
+
+
+ -- case (3): we need to push something out to free up a register
+ [] ->
+ do let inRegOrBoth (InReg _) = True
+ inRegOrBoth (InBoth _ _) = True
+ inRegOrBoth _ = False
+ let candidates' =
+ flip delListFromUFM keep $
+ filterUFM inRegOrBoth $
+ assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ let candidates = nonDetUFMToList candidates'
+
+ -- the vregs we could kick out that are already in a slot
+ let candidates_inBoth
+ = [ (temp, reg, mem)
+ | (temp, InBoth reg mem) <- candidates
+ , targetClassOfRealReg platform reg == classOfVirtualReg r ]
+
+ -- the vregs we could kick out that are only in a reg
+ -- this would require writing the reg to a new slot before using it.
+ let candidates_inReg
+ = [ (temp, reg)
+ | (temp, InReg reg) <- candidates
+ , targetClassOfRealReg platform reg == classOfVirtualReg r ]
+
+ let result
+
+ -- we have a temporary that is in both register and mem,
+ -- just free up its register for use.
+ | (temp, my_reg, slot) : _ <- candidates_inBoth
+ = do spills' <- loadTemp r spill_loc my_reg spills
+ let assig1 = addToUFM assig temp (InMem slot)
+ let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg
+
+ setAssigR assig2
+ allocateRegsAndSpill reading keep spills' (my_reg:alloc) rs
+
+ -- otherwise, we need to spill a temporary that currently
+ -- resides in a register.
+ | (temp_to_push_out, (my_reg :: RealReg)) : _
+ <- candidates_inReg
+ = do
+ (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
+ let spill_store = (if reading then id else reverse)
+ [ -- COMMENT (fsLit "spill alloc")
+ spill_insn ]
+
+ -- record that this temp was spilled
+ recordSpill (SpillAlloc temp_to_push_out)
+
+ -- update the register assignment
+ let assig1 = addToUFM assig temp_to_push_out (InMem slot)
+ let assig2 = addToUFM assig1 r $! newLocation spill_loc my_reg
+ setAssigR assig2
+
+ -- if need be, load up a spilled temp into the reg we've just freed up.
+ spills' <- loadTemp r spill_loc my_reg spills
+
+ allocateRegsAndSpill reading keep
+ (spill_store ++ spills')
+ (my_reg:alloc) rs
+
+
+ -- there wasn't anything to spill, so we're screwed.
+ | otherwise
+ = pprPanic ("RegAllocLinear.allocRegsAndSpill: no spill candidates\n")
+ $ vcat
+ [ text "allocating vreg: " <> text (show r)
+ , text "assignment: " <> ppr assig
+ , text "freeRegs: " <> text (show freeRegs)
+ , text "initFreeRegs: " <> text (show (frInitFreeRegs platform `asTypeOf` freeRegs)) ]
+
+ result
+
+
+-- | Calculate a new location after a register has been loaded.
+newLocation :: SpillLoc -> RealReg -> Loc
+-- if the tmp was read from a slot, then now its in a reg as well
+newLocation (ReadMem slot) my_reg = InBoth my_reg slot
+-- writes will always result in only the register being available
+newLocation _ my_reg = InReg my_reg
+
+-- | Load up a spilled temporary if we need to (read from memory).
+loadTemp
+ :: (Instruction instr)
+ => VirtualReg -- the temp being loaded
+ -> SpillLoc -- the current location of this temp
+ -> RealReg -- the hreg to load the temp into
+ -> [instr]
+ -> RegM freeRegs [instr]
+
+loadTemp vreg (ReadMem slot) hreg spills
+ = do
+ insn <- loadR (RegReal hreg) slot
+ recordSpill (SpillLoad $ getUnique vreg)
+ return $ {- COMMENT (fsLit "spill load") : -} insn : spills
+
+loadTemp _ _ _ spills =
+ return spills
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
new file mode 100644
index 0000000000..43dbab843b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
@@ -0,0 +1,141 @@
+
+-- | Put common type definitions here to break recursive module dependencies.
+
+module GHC.CmmToAsm.Reg.Linear.Base (
+ BlockAssignment,
+
+ Loc(..),
+ regsOfLoc,
+
+ -- for stats
+ SpillReason(..),
+ RegAllocStats(..),
+
+ -- the allocator monad
+ RA_State(..),
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.Platform.Reg
+
+import GHC.Driver.Session
+import Outputable
+import Unique
+import UniqFM
+import UniqSupply
+import GHC.Cmm.BlockId
+
+
+-- | Used to store the register assignment on entry to a basic block.
+-- We use this to handle join points, where multiple branch instructions
+-- target a particular label. We have to insert fixup code to make
+-- the register assignments from the different sources match up.
+--
+type BlockAssignment freeRegs
+ = BlockMap (freeRegs, RegMap Loc)
+
+
+-- | Where a vreg is currently stored
+-- A temporary can be marked as living in both a register and memory
+-- (InBoth), for example if it was recently loaded from a spill location.
+-- This makes it cheap to spill (no save instruction required), but we
+-- have to be careful to turn this into InReg if the value in the
+-- register is changed.
+
+-- This is also useful when a temporary is about to be clobbered. We
+-- save it in a spill location, but mark it as InBoth because the current
+-- instruction might still want to read it.
+--
+data Loc
+ -- | vreg is in a register
+ = InReg !RealReg
+
+ -- | vreg is held in a stack slot
+ | InMem {-# UNPACK #-} !StackSlot
+
+
+ -- | vreg is held in both a register and a stack slot
+ | InBoth !RealReg
+ {-# UNPACK #-} !StackSlot
+ deriving (Eq, Show, Ord)
+
+instance Outputable Loc where
+ ppr l = text (show l)
+
+
+-- | Get the reg numbers stored in this Loc.
+regsOfLoc :: Loc -> [RealReg]
+regsOfLoc (InReg r) = [r]
+regsOfLoc (InBoth r _) = [r]
+regsOfLoc (InMem _) = []
+
+
+-- | Reasons why instructions might be inserted by the spiller.
+-- Used when generating stats for -ddrop-asm-stats.
+--
+data SpillReason
+ -- | vreg was spilled to a slot so we could use its
+ -- current hreg for another vreg
+ = SpillAlloc !Unique
+
+ -- | vreg was moved because its hreg was clobbered
+ | SpillClobber !Unique
+
+ -- | vreg was loaded from a spill slot
+ | SpillLoad !Unique
+
+ -- | reg-reg move inserted during join to targets
+ | SpillJoinRR !Unique
+
+ -- | reg-mem move inserted during join to targets
+ | SpillJoinRM !Unique
+
+
+-- | Used to carry interesting stats out of the register allocator.
+data RegAllocStats
+ = RegAllocStats
+ { ra_spillInstrs :: UniqFM [Int]
+ , ra_fixupList :: [(BlockId,BlockId,BlockId)]
+ -- ^ (from,fixup,to) : We inserted fixup code between from and to
+ }
+
+
+-- | The register allocator state
+data RA_State freeRegs
+ = RA_State
+
+ {
+ -- | the current mapping from basic blocks to
+ -- the register assignments at the beginning of that block.
+ ra_blockassig :: BlockAssignment freeRegs
+
+ -- | free machine registers
+ , ra_freeregs :: !freeRegs
+
+ -- | assignment of temps to locations
+ , ra_assig :: RegMap Loc
+
+ -- | current stack delta
+ , ra_delta :: Int
+
+ -- | free stack slots for spilling
+ , ra_stack :: StackMap
+
+ -- | unique supply for generating names for join point fixup blocks.
+ , ra_us :: UniqSupply
+
+ -- | Record why things were spilled, for -ddrop-asm-stats.
+ -- Just keep a list here instead of a map of regs -> reasons.
+ -- We don't want to slow down the allocator if we're not going to emit the stats.
+ , ra_spills :: [SpillReason]
+ , ra_DynFlags :: DynFlags
+
+ -- | (from,fixup,to) : We inserted fixup code between from and to
+ , ra_fixups :: [(BlockId,BlockId,BlockId)] }
+
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
new file mode 100644
index 0000000000..0d72d8b6e9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/FreeRegs.hs
@@ -0,0 +1,89 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.Reg.Linear.FreeRegs (
+ FR(..),
+ maxSpillSlots
+)
+
+#include "HsVersions.h"
+
+where
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+
+import GHC.Driver.Session
+import Panic
+import GHC.Platform
+
+-- -----------------------------------------------------------------------------
+-- The free register set
+-- This needs to be *efficient*
+-- Here's an inefficient 'executable specification' of the FreeRegs data type:
+--
+-- type FreeRegs = [RegNo]
+-- noFreeRegs = 0
+-- releaseReg n f = if n `elem` f then f else (n : f)
+-- initFreeRegs = allocatableRegs
+-- getFreeRegs cls f = filter ( (==cls) . regClass . RealReg ) f
+-- allocateReg f r = filter (/= r) f
+
+import qualified GHC.CmmToAsm.Reg.Linear.PPC as PPC
+import qualified GHC.CmmToAsm.Reg.Linear.SPARC as SPARC
+import qualified GHC.CmmToAsm.Reg.Linear.X86 as X86
+import qualified GHC.CmmToAsm.Reg.Linear.X86_64 as X86_64
+
+import qualified GHC.CmmToAsm.PPC.Instr as PPC.Instr
+import qualified GHC.CmmToAsm.SPARC.Instr as SPARC.Instr
+import qualified GHC.CmmToAsm.X86.Instr as X86.Instr
+
+class Show freeRegs => FR freeRegs where
+ frAllocateReg :: Platform -> RealReg -> freeRegs -> freeRegs
+ frGetFreeRegs :: Platform -> RegClass -> freeRegs -> [RealReg]
+ frInitFreeRegs :: Platform -> freeRegs
+ frReleaseReg :: Platform -> RealReg -> freeRegs -> freeRegs
+
+instance FR X86.FreeRegs where
+ frAllocateReg = \_ -> X86.allocateReg
+ frGetFreeRegs = X86.getFreeRegs
+ frInitFreeRegs = X86.initFreeRegs
+ frReleaseReg = \_ -> X86.releaseReg
+
+instance FR X86_64.FreeRegs where
+ frAllocateReg = \_ -> X86_64.allocateReg
+ frGetFreeRegs = X86_64.getFreeRegs
+ frInitFreeRegs = X86_64.initFreeRegs
+ frReleaseReg = \_ -> X86_64.releaseReg
+
+instance FR PPC.FreeRegs where
+ frAllocateReg = \_ -> PPC.allocateReg
+ frGetFreeRegs = \_ -> PPC.getFreeRegs
+ frInitFreeRegs = PPC.initFreeRegs
+ frReleaseReg = \_ -> PPC.releaseReg
+
+instance FR SPARC.FreeRegs where
+ frAllocateReg = SPARC.allocateReg
+ frGetFreeRegs = \_ -> SPARC.getFreeRegs
+ frInitFreeRegs = SPARC.initFreeRegs
+ frReleaseReg = SPARC.releaseReg
+
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = case platformArch (targetPlatform dflags) of
+ ArchX86 -> X86.Instr.maxSpillSlots dflags
+ ArchX86_64 -> X86.Instr.maxSpillSlots dflags
+ ArchPPC -> PPC.Instr.maxSpillSlots dflags
+ ArchS390X -> panic "maxSpillSlots ArchS390X"
+ ArchSPARC -> SPARC.Instr.maxSpillSlots dflags
+ ArchSPARC64 -> panic "maxSpillSlots ArchSPARC64"
+ ArchARM _ _ _ -> panic "maxSpillSlots ArchARM"
+ ArchARM64 -> panic "maxSpillSlots ArchARM64"
+ ArchPPC_64 _ -> PPC.Instr.maxSpillSlots dflags
+ ArchAlpha -> panic "maxSpillSlots ArchAlpha"
+ ArchMipseb -> panic "maxSpillSlots ArchMipseb"
+ ArchMipsel -> panic "maxSpillSlots ArchMipsel"
+ ArchJavaScript-> panic "maxSpillSlots ArchJavaScript"
+ ArchUnknown -> panic "maxSpillSlots ArchUnknown"
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
new file mode 100644
index 0000000000..b4ad1b948c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
@@ -0,0 +1,378 @@
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Handles joining of a jump instruction to its targets.
+
+-- The first time we encounter a jump to a particular basic block, we
+-- record the assignment of temporaries. The next time we encounter a
+-- jump to the same block, we compare our current assignment to the
+-- stored one. They might be different if spilling has occurred in one
+-- branch; so some fixup code will be required to match up the assignments.
+--
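+-- For example (illustrative): suppose block L is first reached from a branch
+-- where vreg v sits in register r1, and a later branch reaches L with v
+-- spilled to stack slot s. The second branch cannot jump to L directly;
+-- instead it is redirected through a fresh fixup block that reloads v from
+-- s into r1 and then jumps on to L, so that L always sees v in r1.
+--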
+module GHC.CmmToAsm.Reg.Linear.JoinToTargets (joinToTargets) where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.State
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Linear.FreeRegs
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import Digraph
+import GHC.Driver.Session
+import Outputable
+import Unique
+import UniqFM
+import UniqSet
+
+-- | For a jump instruction at the end of a block, generate fixup code so its
+-- vregs are in the correct regs for its destination.
+--
+joinToTargets
+ :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs
+ -- that are known to be live on the entry to each block.
+
+ -> BlockId -- ^ id of the current block
+ -> instr -- ^ branch instr on the end of the source block.
+
+ -> RegM freeRegs ([NatBasicBlock instr] -- fresh blocks of fixup code.
+ , instr) -- the original branch
+ -- instruction, but maybe
+ -- patched to jump
+ -- to a fixup block first.
+
+joinToTargets block_live id instr
+
+ -- we only need to worry about jump instructions.
+ | not $ isJumpishInstr instr
+ = return ([], instr)
+
+ | otherwise
+ = joinToTargets' block_live [] id instr (jumpDestsOfInstr instr)
+
+-----
+joinToTargets'
+ :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet -- ^ maps the unique of the blockid to the set of vregs
+ -- that are known to be live on the entry to each block.
+
+ -> [NatBasicBlock instr] -- ^ acc blocks of fixup code.
+
+ -> BlockId -- ^ id of the current block
+ -> instr -- ^ branch instr on the end of the source block.
+
+ -> [BlockId] -- ^ branch destinations still to consider.
+
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+
+-- no more targets to consider. all done.
+joinToTargets' _ new_blocks _ instr []
+ = return (new_blocks, instr)
+
+-- handle a branch target.
+joinToTargets' block_live new_blocks block_id instr (dest:dests)
+ = do
+ -- get the map of where the vregs are stored on entry to each basic block.
+ block_assig <- getBlockAssigR
+
+ -- get the assignment on entry to the branch instruction.
+ assig <- getAssigR
+
+ -- adjust the current assignment to remove any vregs that are not live
+ -- on entry to the destination block.
+ let Just live_set = mapLookup dest block_live
+ let still_live uniq _ = uniq `elemUniqSet_Directly` live_set
+ let adjusted_assig = filterUFM_Directly still_live assig
+
+ -- and free up those registers which are now free.
+ let to_free =
+ [ r | (reg, loc) <- nonDetUFMToList assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , not (elemUniqSet_Directly reg live_set)
+ , r <- regsOfLoc loc ]
+
+ case mapLookup dest block_assig of
+ Nothing
+ -> joinToTargets_first
+ block_live new_blocks block_id instr dest dests
+ block_assig adjusted_assig to_free
+
+ Just (_, dest_assig)
+ -> joinToTargets_again
+ block_live new_blocks block_id instr dest dests
+ adjusted_assig dest_assig
+
+
+-- this is the first time we jumped to this block.
+joinToTargets_first :: (FR freeRegs, Instruction instr, Outputable instr)
+ => BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> BlockId
+ -> instr
+ -> BlockId
+ -> [BlockId]
+ -> BlockAssignment freeRegs
+ -> RegMap Loc
+ -> [RealReg]
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+joinToTargets_first block_live new_blocks block_id instr dest dests
+ block_assig src_assig
+ to_free
+
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ -- free up the regs that are not live on entry to this block.
+ freeregs <- getFreeRegsR
+ let freeregs' = foldl' (flip $ frReleaseReg platform) freeregs to_free
+
+ -- remember the current assignment on entry to this block.
+ setBlockAssigR (mapInsert dest (freeregs', src_assig) block_assig)
+
+ joinToTargets' block_live new_blocks block_id instr dests
+
+
+-- we've jumped to this block before
+joinToTargets_again :: (Instruction instr, FR freeRegs, Outputable instr)
+ => BlockMap RegSet
+ -> [NatBasicBlock instr]
+ -> BlockId
+ -> instr
+ -> BlockId
+ -> [BlockId]
+ -> UniqFM Loc
+ -> UniqFM Loc
+ -> RegM freeRegs ([NatBasicBlock instr], instr)
+joinToTargets_again
+ block_live new_blocks block_id instr dest dests
+ src_assig dest_assig
+
+ -- the assignments already match, no problem.
+ | nonDetUFMToList dest_assig == nonDetUFMToList src_assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ = joinToTargets' block_live new_blocks block_id instr dests
+
+ -- assignments don't match, need fixup code
+ | otherwise
+ = do
+
+ -- make a graph of what things need to be moved where.
+ let graph = makeRegMovementGraph src_assig dest_assig
+
+ -- look for cycles in the graph. This can happen if regs need to be swapped.
+ -- Note that we depend on the fact that this function does a
+ -- bottom up traversal of the tree-like portions of the graph.
+ --
+ -- eg, if we have
+ -- R1 -> R2 -> R3
+ --
+ -- ie move value in R1 to R2 and value in R2 to R3.
+ --
+ -- We need to do the R2 -> R3 move before R1 -> R2.
+ --
+ let sccs = stronglyConnCompFromEdgedVerticesOrdR graph
+
+ -- debugging
+ {-
+ pprTrace
+ ("joinToTargets: making fixup code")
+ (vcat [ text " in block: " <> ppr block_id
+ , text " jmp instruction: " <> ppr instr
+ , text " src assignment: " <> ppr src_assig
+ , text " dest assignment: " <> ppr dest_assig
+ , text " movement graph: " <> ppr graph
+ , text " sccs of graph: " <> ppr sccs
+ , text ""])
+ (return ())
+ -}
+ delta <- getDeltaR
+ fixUpInstrs_ <- mapM (handleComponent delta instr) sccs
+ let fixUpInstrs = concat fixUpInstrs_
+
+ -- make a new basic block containing the fixup code.
+        -- At the end of the current block we will jump to the fixup one,
+ -- then that will jump to our original destination.
+ fixup_block_id <- mkBlockId <$> getUniqueR
+ let block = BasicBlock fixup_block_id
+ $ fixUpInstrs ++ mkJumpInstr dest
+
+ -- if we didn't need any fixups, then don't include the block
+ case fixUpInstrs of
+ [] -> joinToTargets' block_live new_blocks block_id instr dests
+
+ -- patch the original branch instruction so it goes to our
+ -- fixup block instead.
+ _ -> let instr' = patchJumpInstr instr
+ (\bid -> if bid == dest
+ then fixup_block_id
+ else bid) -- no change!
+
+ in do
+ {- --debugging
+ pprTrace "FixUpEdge info:"
+ (
+ text "inBlock:" <> ppr block_id $$
+ text "instr:" <> ppr instr $$
+ text "instr':" <> ppr instr' $$
+ text "fixup_block_id':" <>
+ ppr fixup_block_id $$
+ text "dest:" <> ppr dest
+ ) (return ())
+ -}
+ recordFixupBlock block_id fixup_block_id dest
+ joinToTargets' block_live (block : new_blocks)
+ block_id instr' dests
+
+
+-- | Construct a graph of register\/spill movements.
+--
+-- Cyclic components seem to occur only very rarely.
+--
+-- We cut some corners by not handling memory-to-memory moves.
+-- This shouldn't happen because every temporary gets its own stack slot.
+--
+makeRegMovementGraph :: RegMap Loc -> RegMap Loc -> [Node Loc Unique]
+makeRegMovementGraph adjusted_assig dest_assig
+ = [ node | (vreg, src) <- nonDetUFMToList adjusted_assig
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ -- source reg might not be needed at the dest:
+ , Just loc <- [lookupUFM_Directly dest_assig vreg]
+ , node <- expandNode vreg src loc ]
+
+
+-- | Expand out the destination, so InBoth destinations turn into
+-- a combination of InReg and InMem.
+
+-- The InBoth handling is a little tricky here. If the destination is
+-- InBoth, then we must ensure that the value ends up in both locations.
+-- An InBoth destination must conflict with an InReg or InMem source, so
+-- we expand an InBoth destination as necessary.
+--
+-- An InBoth source is slightly different: we only care about the register
+-- that the source value is in, so that we can move it to the destinations.
+--
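+-- For example (illustrative): a node whose source is (InReg r1) and whose
+-- destination is (InBoth r2 s), with r1 distinct from r2, expands into a
+-- single node carrying the two destinations [InReg r2, InMem s].
+--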
+expandNode
+ :: a
+ -> Loc -- ^ source of move
+ -> Loc -- ^ destination of move
+ -> [Node Loc a ]
+
+expandNode vreg loc@(InReg src) (InBoth dst mem)
+ | src == dst = [DigraphNode vreg loc [InMem mem]]
+ | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]]
+
+expandNode vreg loc@(InMem src) (InBoth dst mem)
+ | src == mem = [DigraphNode vreg loc [InReg dst]]
+ | otherwise = [DigraphNode vreg loc [InReg dst, InMem mem]]
+
+expandNode _ (InBoth _ src) (InMem dst)
+ | src == dst = [] -- guaranteed to be true
+
+expandNode _ (InBoth src _) (InReg dst)
+ | src == dst = []
+
+expandNode vreg (InBoth src _) dst
+ = expandNode vreg (InReg src) dst
+
+expandNode vreg src dst
+ | src == dst = []
+ | otherwise = [DigraphNode vreg src [dst]]
+
+
+-- | Generate fixup code for a particular component in the move graph
+-- This component tells us what values need to be moved to what
+-- destinations. We have eliminated any possibility of single-node
+-- cycles in expandNode above.
+--
+handleComponent
+ :: Instruction instr
+ => Int -> instr -> SCC (Node Loc Unique)
+ -> RegM freeRegs [instr]
+
+-- If the graph is acyclic then we won't get the swapping problem below.
+-- In this case we can just do the moves directly, and avoid having to
+-- go via a spill slot.
+--
+handleComponent delta _ (AcyclicSCC (DigraphNode vreg src dsts))
+ = mapM (makeMove delta vreg src) dsts
+
+
+-- Handle some cyclic moves.
+-- This can happen if we have two regs that need to be swapped.
+-- eg:
+-- vreg source loc dest loc
+-- (vreg1, InReg r1, [InReg r2])
+-- (vreg2, InReg r2, [InReg r1])
+--
+--      To avoid needing a temp register, we just spill all the source regs, then
+--      reload them into their destination regs.
+--
+-- Note that we can not have cycles that involve memory locations as
+--      sources or as a single destination, because memory locations (stack slots)
+-- are allocated exclusively for a virtual register and therefore can not
+-- require a fixup.
+--
+handleComponent delta instr
+ (CyclicSCC ((DigraphNode vreg (InReg sreg) ((InReg dreg: _))) : rest))
+ -- dest list may have more than one element, if the reg is also InMem.
+ = do
+ -- spill the source into its slot
+ (instrSpill, slot)
+ <- spillR (RegReal sreg) vreg
+
+ -- reload into destination reg
+ instrLoad <- loadR (RegReal dreg) slot
+
+ remainingFixUps <- mapM (handleComponent delta instr)
+ (stronglyConnCompFromEdgedVerticesOrdR rest)
+
+ -- make sure to do all the reloads after all the spills,
+ -- so we don't end up clobbering the source values.
+ return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+
+handleComponent _ _ (CyclicSCC _)
+ = panic "Register Allocator: handleComponent cyclic"
+
+
+-- | Move a vreg between these two locations.
+--
+makeMove
+ :: Instruction instr
+ => Int -- ^ current C stack delta.
+ -> Unique -- ^ unique of the vreg that we're moving.
+ -> Loc -- ^ source location.
+ -> Loc -- ^ destination location.
+ -> RegM freeRegs instr -- ^ move instruction.
+
+makeMove delta vreg src dst
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ case (src, dst) of
+ (InReg s, InReg d) ->
+ do recordSpill (SpillJoinRR vreg)
+ return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+ (InMem s, InReg d) ->
+ do recordSpill (SpillJoinRM vreg)
+ return $ mkLoadInstr dflags (RegReal d) delta s
+ (InReg s, InMem d) ->
+ do recordSpill (SpillJoinRM vreg)
+ return $ mkSpillInstr dflags (RegReal s) delta d
+ _ ->
+ -- we don't handle memory to memory moves.
+ -- they shouldn't happen because we don't share
+ -- stack slots between vregs.
+ panic ("makeMove " ++ show vreg ++ " (" ++ show src ++ ") ("
+ ++ show dst ++ ")"
+ ++ " we don't handle mem->mem moves.")
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
new file mode 100644
index 0000000000..ce0a187647
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
@@ -0,0 +1,60 @@
+-- | Free regs map for PowerPC
+module GHC.CmmToAsm.Reg.Linear.PPC where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.PPC.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import Outputable
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+-- The PowerPC has 32 integer and 32 floating point registers.
+-- This is 32-bit PowerPC, so a Word64 would be inefficient - two Word32s are
+-- much better.
+-- Note that when getFreeRegs scans for free registers, it starts at register
+-- 31 and counts down. This is a hack for the PowerPC - the higher-numbered
+-- registers are callee-saves, while the lower regs are caller-saves, so it
+-- makes sense to start at the high end.
+-- Apart from that, the code does nothing PowerPC-specific, so feel free to
+-- reuse it for your favourite platform (if you have 64 registers but only
+-- 32-bit words).
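+--
+-- As a small illustration (writing the words in hex): releasing reg 3 sets
+-- bit 3 of the first word, while releasing reg 35 sets bit 3 of the second
+-- word, since regs numbered above 31 are tracked in the second bitmap:
+--
+--     releaseReg (RealRegSingle 3)  noFreeRegs  ==>  FreeRegs 0x00000008 0x0
+--     releaseReg (RealRegSingle 35) noFreeRegs  ==>  FreeRegs 0x0 0x00000008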
+
+data FreeRegs = FreeRegs !Word32 !Word32
+ deriving( Show ) -- The Show is used in an ASSERT
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle r) (FreeRegs g f)
+ | r > 31 = FreeRegs g (f .|. (1 `shiftL` (r - 32)))
+ | otherwise = FreeRegs (g .|. (1 `shiftL` r)) f
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.PPC.releaseReg: bad reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs cls (FreeRegs g f)
+ | RcDouble <- cls = go f (0x80000000) 63
+ | RcInteger <- cls = go g (0x80000000) 31
+ | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class" (ppr cls)
+ where
+ go _ 0 _ = []
+ go x m i | x .&. m /= 0 = RealRegSingle i : (go x (m `shiftR` 1) $! i-1)
+ | otherwise = go x (m `shiftR` 1) $! i-1
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs g f)
+ | r > 31 = FreeRegs g (f .&. complement (1 `shiftL` (r - 32)))
+ | otherwise = FreeRegs (g .&. complement (1 `shiftL` r)) f
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.PPC.allocateReg: bad reg"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
new file mode 100644
index 0000000000..7fa85f0913
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
@@ -0,0 +1,188 @@
+{-# LANGUAGE CPP #-}
+
+-- | Free regs map for SPARC
+module GHC.CmmToAsm.Reg.Linear.SPARC where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import Outputable
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+
+--------------------------------------------------------------------------------
+-- SPARC is like PPC, except for twinning of floating point regs.
+-- When we allocate a double reg we must take an even numbered
+-- float reg, as well as the one after it.
+
+
+-- Holds bitmaps showing what registers are currently allocated.
+-- The float and double reg bitmaps overlap, but we only alloc
+-- float regs into the float map, and double regs into the double map.
+--
+-- Free regs have a bit set in the corresponding bitmap.
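+--
+-- As an illustrative example: allocating a double held in the register pair
+-- (34, 35) clears bits 2 and 3 in the float bitmap and bit 2 in the double
+-- bitmap, because the double aliases the two float regs it is built from.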
+--
+data FreeRegs
+ = FreeRegs
+ !Word32 -- int reg bitmap regs 0..31
+ !Word32 -- float reg bitmap regs 32..63
+ !Word32 -- double reg bitmap regs 32..63
+
+instance Show FreeRegs where
+ show = showFreeRegs
+
+-- | A reg map where no regs are free to be allocated.
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0 0 0
+
+
+-- | The initial set of free regs.
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip $ releaseReg platform) noFreeRegs allocatableRegs
+
+
+-- | Get all the free registers of this class.
+getFreeRegs :: RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs cls (FreeRegs g f d)
+ | RcInteger <- cls = map RealRegSingle $ go 1 g 1 0
+ | RcFloat <- cls = map RealRegSingle $ go 1 f 1 32
+ | RcDouble <- cls = map (\i -> RealRegPair i (i+1)) $ go 2 d 1 32
+#if __GLASGOW_HASKELL__ <= 810
+ | otherwise = pprPanic "RegAllocLinear.getFreeRegs: Bad register class " (ppr cls)
+#endif
+ where
+ go _ _ 0 _
+ = []
+
+ go step bitmap mask ix
+ | bitmap .&. mask /= 0
+ = ix : (go step bitmap (mask `shiftL` step) $! ix + step)
+
+ | otherwise
+ = go step bitmap (mask `shiftL` step) $! ix + step
+
+
+-- | Grab a register.
+allocateReg :: Platform -> RealReg -> FreeRegs -> FreeRegs
+allocateReg platform
+ reg@(RealRegSingle r)
+ (FreeRegs g f d)
+
+ -- can't allocate pinned regs
+ | not $ freeReg platform r
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating pinned reg" (ppr reg)
+
+ -- a general purpose reg
+ | r <= 31
+ = let mask = complement (bitMask r)
+ in FreeRegs
+ (g .&. mask)
+ f
+ d
+
+ -- a float reg
+ | r >= 32, r <= 63
+ = let mask = complement (bitMask (r - 32))
+
+ -- the mask of the double this FP reg aliases
+ maskLow = if r `mod` 2 == 0
+ then complement (bitMask (r - 32))
+ else complement (bitMask (r - 32 - 1))
+ in FreeRegs
+ g
+ (f .&. mask)
+ (d .&. maskLow)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating bad reg" (ppr reg)
+
+allocateReg _
+ reg@(RealRegPair r1 r2)
+ (FreeRegs g f d)
+
+ | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0
+ , r2 >= 32, r2 <= 63
+ = let mask1 = complement (bitMask (r1 - 32))
+ mask2 = complement (bitMask (r2 - 32))
+ in
+ FreeRegs
+ g
+ ((f .&. mask1) .&. mask2)
+ (d .&. mask1)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.allocateReg: not allocating bad reg" (ppr reg)
+
+
+
+-- | Release a register from allocation.
+-- The register liveness information says that most regs die after a C call,
+-- but we still don't want to allocate to some of them.
+--
+releaseReg :: Platform -> RealReg -> FreeRegs -> FreeRegs
+releaseReg platform
+ reg@(RealRegSingle r)
+ regs@(FreeRegs g f d)
+
+ -- don't release pinned reg
+ | not $ freeReg platform r
+ = regs
+
+ -- a general purpose reg
+ | r <= 31
+ = let mask = bitMask r
+ in FreeRegs (g .|. mask) f d
+
+ -- a float reg
+ | r >= 32, r <= 63
+ = let mask = bitMask (r - 32)
+
+ -- the mask of the double this FP reg aliases
+ maskLow = if r `mod` 2 == 0
+ then bitMask (r - 32)
+ else bitMask (r - 32 - 1)
+ in FreeRegs
+ g
+ (f .|. mask)
+ (d .|. maskLow)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg)
+
+releaseReg _
+ reg@(RealRegPair r1 r2)
+ (FreeRegs g f d)
+
+ | r1 >= 32, r1 <= 63, r1 `mod` 2 == 0
+ , r2 >= 32, r2 <= 63
+ = let mask1 = bitMask (r1 - 32)
+ mask2 = bitMask (r2 - 32)
+ in
+ FreeRegs
+ g
+ ((f .|. mask1) .|. mask2)
+ (d .|. mask1)
+
+ | otherwise
+ = pprPanic "SPARC.FreeRegs.releaseReg: not releasing bad reg" (ppr reg)
+
+
+
+bitMask :: Int -> Word32
+bitMask n = 1 `shiftL` n
+
+
+showFreeRegs :: FreeRegs -> String
+showFreeRegs regs
+ = "FreeRegs\n"
+ ++ " integer: " ++ (show $ getFreeRegs RcInteger regs) ++ "\n"
+ ++ " float: " ++ (show $ getFreeRegs RcFloat regs) ++ "\n"
+ ++ " double: " ++ (show $ getFreeRegs RcDouble regs) ++ "\n"
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
new file mode 100644
index 0000000000..630b101fc7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/StackMap.hs
@@ -0,0 +1,61 @@
+
+-- | The assignment of virtual registers to stack slots
+
+-- We have lots of stack slots. Memory-to-memory moves are a pain on most
+-- architectures. Therefore, we avoid having to generate memory-to-memory moves
+-- by simply giving every virtual register its own stack slot.
+
+-- The StackMap keeps track of virtual register to stack slot
+-- associations and of which stack slots are still free. Once a stack slot
+-- has been associated with a virtual register it is never "freed" or removed
+-- from the StackMap again; it remains associated until we are done with the
+-- current CmmProc.
+--
+module GHC.CmmToAsm.Reg.Linear.StackMap (
+ StackSlot,
+ StackMap(..),
+ emptyStackMap,
+ getStackSlotFor,
+ getStackUse
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Driver.Session
+import UniqFM
+import Unique
+
+
+-- | Identifier for a stack slot.
+type StackSlot = Int
+
+data StackMap
+ = StackMap
+ { -- | The next free stack slot to allocate.
+ stackMapNextFreeSlot :: !Int
+
+ -- | Assignment of vregs to stack slots.
+ , stackMapAssignment :: UniqFM StackSlot }
+
+
+-- | An empty stack map, with all slots available.
+emptyStackMap :: DynFlags -> StackMap
+emptyStackMap _ = StackMap 0 emptyUFM
+
+
+-- | If this vreg unique already has a stack assignment then return the slot number,
+-- otherwise allocate a new slot, and update the map.
+--
+getStackSlotFor :: StackMap -> Unique -> (StackMap, Int)
+
+getStackSlotFor fs@(StackMap _ reserved) reg
+ | Just slot <- lookupUFM reserved reg = (fs, slot)
+
+getStackSlotFor (StackMap freeSlot reserved) reg =
+ (StackMap (freeSlot+1) (addToUFM reserved reg freeSlot), freeSlot)
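+
+-- For example (a sketch, with u some vreg unique and dflags in scope): the
+-- first lookup for u allocates slot 0, and a second lookup for the same
+-- unique returns slot 0 again without consuming a fresh slot:
+--
+--     let (m1, s1) = getStackSlotFor (emptyStackMap dflags) u   -- s1 == 0
+--         (m2, s2) = getStackSlotFor m1 u                       -- s2 == 0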
+
+-- | Return the number of stack slots that were allocated
+getStackUse :: StackMap -> Int
+getStackUse (StackMap freeSlot _) = freeSlot
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
new file mode 100644
index 0000000000..a167cc7e00
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -0,0 +1,184 @@
+{-# LANGUAGE CPP, PatternSynonyms, DeriveFunctor #-}
+
+#if !defined(GHC_LOADED_INTO_GHCI)
+{-# LANGUAGE UnboxedTuples #-}
+#endif
+
+-- | State monad for the linear register allocator.
+
+-- Here we keep all the state that the register allocator keeps track
+-- of as it walks the instructions in a basic block.
+
+module GHC.CmmToAsm.Reg.Linear.State (
+ RA_State(..),
+ RegM,
+ runR,
+
+ spillR,
+ loadR,
+
+ getFreeRegsR,
+ setFreeRegsR,
+
+ getAssigR,
+ setAssigR,
+
+ getBlockAssigR,
+ setBlockAssigR,
+
+ setDeltaR,
+ getDeltaR,
+
+ getUniqueR,
+
+ recordSpill,
+ recordFixupBlock
+)
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.Stats
+import GHC.CmmToAsm.Reg.Linear.StackMap
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.Cmm.BlockId
+
+import GHC.Driver.Session
+import Unique
+import UniqSupply
+
+import Control.Monad (ap)
+
+-- Avoids using unboxed tuples when loading into GHCi
+#if !defined(GHC_LOADED_INTO_GHCI)
+
+type RA_Result freeRegs a = (# RA_State freeRegs, a #)
+
+pattern RA_Result :: a -> b -> (# a, b #)
+pattern RA_Result a b = (# a, b #)
+{-# COMPLETE RA_Result #-}
+#else
+
+data RA_Result freeRegs a = RA_Result {-# UNPACK #-} !(RA_State freeRegs) !a
+ deriving (Functor)
+
+#endif
+
+-- | The register allocator monad type.
+newtype RegM freeRegs a
+ = RegM { unReg :: RA_State freeRegs -> RA_Result freeRegs a }
+ deriving (Functor)
+
+instance Applicative (RegM freeRegs) where
+ pure a = RegM $ \s -> RA_Result s a
+ (<*>) = ap
+
+instance Monad (RegM freeRegs) where
+ m >>= k = RegM $ \s -> case unReg m s of { RA_Result s a -> unReg (k a) s }
+
+instance HasDynFlags (RegM a) where
+ getDynFlags = RegM $ \s -> RA_Result s (ra_DynFlags s)
+
+
+-- | Run a computation in the RegM register allocator monad.
+runR :: DynFlags
+ -> BlockAssignment freeRegs
+ -> freeRegs
+ -> RegMap Loc
+ -> StackMap
+ -> UniqSupply
+ -> RegM freeRegs a
+ -> (BlockAssignment freeRegs, StackMap, RegAllocStats, a)
+
+runR dflags block_assig freeregs assig stack us thing =
+ case unReg thing
+ (RA_State
+ { ra_blockassig = block_assig
+ , ra_freeregs = freeregs
+ , ra_assig = assig
+ , ra_delta = 0{-???-}
+ , ra_stack = stack
+ , ra_us = us
+ , ra_spills = []
+ , ra_DynFlags = dflags
+ , ra_fixups = [] })
+ of
+ RA_Result state returned_thing
+ -> (ra_blockassig state, ra_stack state, makeRAStats state, returned_thing)
+
+
+-- | Make register allocator stats from its final state.
+makeRAStats :: RA_State freeRegs -> RegAllocStats
+makeRAStats state
+ = RegAllocStats
+ { ra_spillInstrs = binSpillReasons (ra_spills state)
+ , ra_fixupList = ra_fixups state }
+
+
+spillR :: Instruction instr
+ => Reg -> Unique -> RegM freeRegs (instr, Int)
+
+spillR reg temp = RegM $ \ s@RA_State{ra_delta=delta, ra_stack=stack0} ->
+ let dflags = ra_DynFlags s
+ (stack1,slot) = getStackSlotFor stack0 temp
+ instr = mkSpillInstr dflags reg delta slot
+ in
+ RA_Result s{ra_stack=stack1} (instr,slot)
+
+
+loadR :: Instruction instr
+ => Reg -> Int -> RegM freeRegs instr
+
+loadR reg slot = RegM $ \ s@RA_State{ra_delta=delta} ->
+ let dflags = ra_DynFlags s
+ in RA_Result s (mkLoadInstr dflags reg delta slot)
+
+getFreeRegsR :: RegM freeRegs freeRegs
+getFreeRegsR = RegM $ \ s@RA_State{ra_freeregs = freeregs} ->
+ RA_Result s freeregs
+
+setFreeRegsR :: freeRegs -> RegM freeRegs ()
+setFreeRegsR regs = RegM $ \ s ->
+ RA_Result s{ra_freeregs = regs} ()
+
+getAssigR :: RegM freeRegs (RegMap Loc)
+getAssigR = RegM $ \ s@RA_State{ra_assig = assig} ->
+ RA_Result s assig
+
+setAssigR :: RegMap Loc -> RegM freeRegs ()
+setAssigR assig = RegM $ \ s ->
+ RA_Result s{ra_assig=assig} ()
+
+getBlockAssigR :: RegM freeRegs (BlockAssignment freeRegs)
+getBlockAssigR = RegM $ \ s@RA_State{ra_blockassig = assig} ->
+ RA_Result s assig
+
+setBlockAssigR :: BlockAssignment freeRegs -> RegM freeRegs ()
+setBlockAssigR assig = RegM $ \ s ->
+ RA_Result s{ra_blockassig = assig} ()
+
+setDeltaR :: Int -> RegM freeRegs ()
+setDeltaR n = RegM $ \ s ->
+ RA_Result s{ra_delta = n} ()
+
+getDeltaR :: RegM freeRegs Int
+getDeltaR = RegM $ \s -> RA_Result s (ra_delta s)
+
+getUniqueR :: RegM freeRegs Unique
+getUniqueR = RegM $ \s ->
+ case takeUniqFromSupply (ra_us s) of
+ (uniq, us) -> RA_Result s{ra_us = us} uniq
+
+
+-- | Record that a spill instruction was inserted, for profiling.
+recordSpill :: SpillReason -> RegM freeRegs ()
+recordSpill spill
+ = RegM $ \s -> RA_Result (s { ra_spills = spill : ra_spills s }) ()
+
+-- | Record a created fixup block
+recordFixupBlock :: BlockId -> BlockId -> BlockId -> RegM freeRegs ()
+recordFixupBlock from between to
+ = RegM $ \s -> RA_Result (s { ra_fixups = (from,between,to) : ra_fixups s }) ()
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs
new file mode 100644
index 0000000000..1176b220a3
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/Stats.hs
@@ -0,0 +1,87 @@
+module GHC.CmmToAsm.Reg.Linear.Stats (
+ binSpillReasons,
+ countRegRegMovesNat,
+ pprStats
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Reg.Linear.Base
+import GHC.CmmToAsm.Reg.Liveness
+import GHC.CmmToAsm.Instr
+
+import UniqFM
+import Outputable
+
+import State
+
+-- | Build a map of how many times each reg was alloced, clobbered, loaded etc.
+binSpillReasons
+ :: [SpillReason] -> UniqFM [Int]
+
+binSpillReasons reasons
+ = addListToUFM_C
+ (zipWith (+))
+ emptyUFM
+ (map (\reason -> case reason of
+ SpillAlloc r -> (r, [1, 0, 0, 0, 0])
+ SpillClobber r -> (r, [0, 1, 0, 0, 0])
+ SpillLoad r -> (r, [0, 0, 1, 0, 0])
+ SpillJoinRR r -> (r, [0, 0, 0, 1, 0])
+ SpillJoinRM r -> (r, [0, 0, 0, 0, 1])) reasons)
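+
+-- For instance (illustrative): one SpillAlloc r and two SpillLoad r events
+-- bin to r -> [1, 0, 2, 0, 0], where the vector is
+-- (allocs, clobbers, loads, joinRR, joinRM).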
+
+
+-- | Count reg-reg moves remaining in this code.
+countRegRegMovesNat
+ :: Instruction instr
+ => NatCmmDecl statics instr -> Int
+
+countRegRegMovesNat cmm
+ = execState (mapGenBlockTopM countBlock cmm) 0
+ where
+ countBlock b@(BasicBlock _ instrs)
+ = do mapM_ countInstr instrs
+ return b
+
+ countInstr instr
+ | Just _ <- takeRegRegMoveInstr instr
+ = do modify (+ 1)
+ return instr
+
+ | otherwise
+ = return instr
+
+
+-- | Pretty print some RegAllocStats
+pprStats
+ :: Instruction instr
+ => [NatCmmDecl statics instr] -> [RegAllocStats] -> SDoc
+
+pprStats code statss
+ = let -- sum up all the instrs inserted by the spiller
+ spills = foldl' (plusUFM_C (zipWith (+)))
+ emptyUFM
+ $ map ra_spillInstrs statss
+
+ spillTotals = foldl' (zipWith (+))
+ [0, 0, 0, 0, 0]
+ $ nonDetEltsUFM spills
+ -- See Note [Unique Determinism and code generation]
+
+ -- count how many reg-reg-moves remain in the code
+ moves = sum $ map countRegRegMovesNat code
+
+ pprSpill (reg, spills)
+ = parens $ (hcat $ punctuate (text ", ") (doubleQuotes (ppr reg) : map ppr spills))
+
+ in ( text "-- spills-added-total"
+ $$ text "-- (allocs, clobbers, loads, joinRR, joinRM, reg_reg_moves_remaining)"
+ $$ (parens $ (hcat $ punctuate (text ", ") (map ppr spillTotals ++ [ppr moves])))
+ $$ text ""
+ $$ text "-- spills-added"
+ $$ text "-- (reg_name, allocs, clobbers, loads, joinRR, joinRM)"
+ $$ (pprUFMWithKeys spills (vcat . map pprSpill))
+ $$ text "")
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
new file mode 100644
index 0000000000..ce103bd6b2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
@@ -0,0 +1,52 @@
+
+-- | Free regs map for i386
+module GHC.CmmToAsm.Reg.Linear.X86 where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import Panic
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+newtype FreeRegs = FreeRegs Word32
+ deriving Show
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle n) (FreeRegs f)
+ = FreeRegs (f .|. (1 `shiftL` n))
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.X86.FreeRegs.releaseReg: no reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs platform cls (FreeRegs f) = go f 0
+
+ where go 0 _ = []
+ go n m
+ | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls
+ = RealRegSingle m : (go (n `shiftR` 1) $! (m+1))
+
+ | otherwise
+ = go (n `shiftR` 1) $! (m+1)
+ -- ToDo: there's no point looking through all the integer registers
+ -- in order to find a floating-point one.
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs f)
+ = FreeRegs (f .&. complement (1 `shiftL` r))
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.X86.FreeRegs.allocateReg: no reg"
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
new file mode 100644
index 0000000000..322ddd6bdd
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
@@ -0,0 +1,53 @@
+
+-- | Free regs map for x86_64
+module GHC.CmmToAsm.Reg.Linear.X86_64 where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import Panic
+import GHC.Platform
+
+import Data.Word
+import Data.Bits
+
+newtype FreeRegs = FreeRegs Word64
+ deriving Show
+
+noFreeRegs :: FreeRegs
+noFreeRegs = FreeRegs 0
+
+releaseReg :: RealReg -> FreeRegs -> FreeRegs
+releaseReg (RealRegSingle n) (FreeRegs f)
+ = FreeRegs (f .|. (1 `shiftL` n))
+
+releaseReg _ _
+ = panic "RegAlloc.Linear.X86_64.FreeRegs.releaseReg: no reg"
+
+initFreeRegs :: Platform -> FreeRegs
+initFreeRegs platform
+ = foldl' (flip releaseReg) noFreeRegs (allocatableRegs platform)
+
+getFreeRegs :: Platform -> RegClass -> FreeRegs -> [RealReg] -- lazily
+getFreeRegs platform cls (FreeRegs f) = go f 0
+
+ where go 0 _ = []
+ go n m
+ | n .&. 1 /= 0 && classOfRealReg platform (RealRegSingle m) == cls
+ = RealRegSingle m : (go (n `shiftR` 1) $! (m+1))
+
+ | otherwise
+ = go (n `shiftR` 1) $! (m+1)
+ -- ToDo: there's no point looking through all the integer registers
+ -- in order to find a floating-point one.
+
+allocateReg :: RealReg -> FreeRegs -> FreeRegs
+allocateReg (RealRegSingle r) (FreeRegs f)
+ = FreeRegs (f .&. complement (1 `shiftL` r))
+
+allocateReg _ _
+ = panic "RegAlloc.Linear.X86_64.FreeRegs.allocateReg: no reg"
+
+
diff --git a/compiler/GHC/CmmToAsm/Reg/Liveness.hs b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
new file mode 100644
index 0000000000..03b8123f93
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Liveness.hs
@@ -0,0 +1,1025 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies #-}
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- The register liveness determinator
+--
+-- (c) The University of Glasgow 2004-2013
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.Reg.Liveness (
+ RegSet,
+ RegMap, emptyRegMap,
+ BlockMap, mapEmpty,
+ LiveCmmDecl,
+ InstrSR (..),
+ LiveInstr (..),
+ Liveness (..),
+ LiveInfo (..),
+ LiveBasicBlock,
+
+ mapBlockTop, mapBlockTopM, mapSCCM,
+ mapGenBlockTop, mapGenBlockTopM,
+ stripLive,
+ stripLiveBlock,
+ slurpConflicts,
+ slurpReloadCoalesce,
+ eraseDeltasLive,
+ patchEraseLive,
+ patchRegsLiveInstr,
+ reverseBlocksInTops,
+ regLiveness,
+ cmmTopLiveness
+ ) where
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Instr
+
+import GHC.Cmm.BlockId
+import GHC.CmmToAsm.CFG
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm hiding (RegSet, emptyRegSet)
+
+import Digraph
+import GHC.Driver.Session
+import MonadUtils
+import Outputable
+import GHC.Platform
+import UniqSet
+import UniqFM
+import UniqSupply
+import Bag
+import State
+
+import Data.List
+import Data.Maybe
+import Data.IntSet (IntSet)
+
+-----------------------------------------------------------------------------
+type RegSet = UniqSet Reg
+
+type RegMap a = UniqFM a
+
+emptyRegMap :: UniqFM a
+emptyRegMap = emptyUFM
+
+emptyRegSet :: RegSet
+emptyRegSet = emptyUniqSet
+
+type BlockMap a = LabelMap a
+
+
+-- | A top level thing which carries liveness information.
+type LiveCmmDecl statics instr
+ = GenCmmDecl
+ statics
+ LiveInfo
+ [SCC (LiveBasicBlock instr)]
+
+
+-- | The register allocator also wants to use SPILL/RELOAD meta instructions,
+-- so we'll keep those here.
+data InstrSR instr
+ -- | A real machine instruction
+ = Instr instr
+
+ -- | spill this reg to a stack slot
+ | SPILL Reg Int
+
+ -- | reload this reg from a stack slot
+ | RELOAD Int Reg
+
+instance Instruction instr => Instruction (InstrSR instr) where
+ regUsageOfInstr platform i
+ = case i of
+ Instr instr -> regUsageOfInstr platform instr
+ SPILL reg _ -> RU [reg] []
+ RELOAD _ reg -> RU [] [reg]
+
+ patchRegsOfInstr i f
+ = case i of
+ Instr instr -> Instr (patchRegsOfInstr instr f)
+ SPILL reg slot -> SPILL (f reg) slot
+ RELOAD slot reg -> RELOAD slot (f reg)
+
+ isJumpishInstr i
+ = case i of
+ Instr instr -> isJumpishInstr instr
+ _ -> False
+
+ jumpDestsOfInstr i
+ = case i of
+ Instr instr -> jumpDestsOfInstr instr
+ _ -> []
+
+ patchJumpInstr i f
+ = case i of
+ Instr instr -> Instr (patchJumpInstr instr f)
+ _ -> i
+
+ mkSpillInstr = error "mkSpillInstr[InstrSR]: Not making SPILL meta-instr"
+ mkLoadInstr = error "mkLoadInstr[InstrSR]: Not making LOAD meta-instr"
+
+ takeDeltaInstr i
+ = case i of
+ Instr instr -> takeDeltaInstr instr
+ _ -> Nothing
+
+ isMetaInstr i
+ = case i of
+ Instr instr -> isMetaInstr instr
+ _ -> False
+
+ mkRegRegMoveInstr platform r1 r2
+ = Instr (mkRegRegMoveInstr platform r1 r2)
+
+ takeRegRegMoveInstr i
+ = case i of
+ Instr instr -> takeRegRegMoveInstr instr
+ _ -> Nothing
+
+ mkJumpInstr target = map Instr (mkJumpInstr target)
+
+ mkStackAllocInstr platform amount =
+ Instr <$> mkStackAllocInstr platform amount
+
+ mkStackDeallocInstr platform amount =
+ Instr <$> mkStackDeallocInstr platform amount
+
+
+-- | An instruction with liveness information.
+data LiveInstr instr
+ = LiveInstr (InstrSR instr) (Maybe Liveness)
+
+-- | Liveness information.
+-- The regs which die are ones which are no longer live in the *next* instruction
+-- in this sequence.
+-- (NB. if the instruction is a jump, these registers might still be live
+-- at the jump target(s) - you have to check the liveness at the destination
+-- block to find out).
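+--
+-- For example (illustrative): for an instruction that reads r2 for the last
+-- time and writes r3 for the first (and only) time, with r3 never used
+-- afterwards, we would record liveBorn = {r3}, liveDieRead = {r2} and
+-- liveDieWrite = {r3}.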
+
+data Liveness
+ = Liveness
+ { liveBorn :: RegSet -- ^ registers born in this instruction (written to for first time).
+ , liveDieRead :: RegSet -- ^ registers that died because they were read for the last time.
+ , liveDieWrite :: RegSet } -- ^ registers that died because they were clobbered by something.
+
+
+-- | Stash regs live on entry to each basic block in the info part of the cmm code.
+data LiveInfo
+ = LiveInfo
+ (LabelMap RawCmmStatics) -- cmm info table static stuff
+ [BlockId] -- entry points (first one is the
+ -- entry point for the proc).
+ (BlockMap RegSet) -- argument locals live on entry to this block
+ (BlockMap IntSet) -- stack slots live on entry to this block
+
+
+-- | A basic block with liveness information.
+type LiveBasicBlock instr
+ = GenBasicBlock (LiveInstr instr)
+
+
+instance Outputable instr
+ => Outputable (InstrSR instr) where
+
+ ppr (Instr realInstr)
+ = ppr realInstr
+
+ ppr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char ' ',
+ ppr reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+ ppr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char ' ',
+ text "SLOT" <> parens (int slot),
+ comma,
+ ppr reg]
+
+instance Outputable instr
+ => Outputable (LiveInstr instr) where
+
+ ppr (LiveInstr instr Nothing)
+ = ppr instr
+
+ ppr (LiveInstr instr (Just live))
+ = ppr instr
+ $$ (nest 8
+ $ vcat
+ [ pprRegs (text "# born: ") (liveBorn live)
+ , pprRegs (text "# r_dying: ") (liveDieRead live)
+ , pprRegs (text "# w_dying: ") (liveDieWrite live) ]
+ $+$ space)
+
+ where pprRegs :: SDoc -> RegSet -> SDoc
+ pprRegs name regs
+ | isEmptyUniqSet regs = empty
+ | otherwise = name <>
+ (pprUFM (getUniqSet regs) (hcat . punctuate space . map ppr))
+
+instance Outputable LiveInfo where
+ ppr (LiveInfo mb_static entryIds liveVRegsOnEntry liveSlotsOnEntry)
+ = (ppr mb_static)
+ $$ text "# entryIds = " <> ppr entryIds
+ $$ text "# liveVRegsOnEntry = " <> ppr liveVRegsOnEntry
+ $$ text "# liveSlotsOnEntry = " <> text (show liveSlotsOnEntry)
+
+
+
+-- | map a function across all the basic blocks in this code
+--
+mapBlockTop
+ :: (LiveBasicBlock instr -> LiveBasicBlock instr)
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+mapBlockTop f cmm
+ = evalState (mapBlockTopM (\x -> return $ f x) cmm) ()
+
+
+-- | map a function across all the basic blocks in this code (monadic version)
+--
+mapBlockTopM
+ :: Monad m
+ => (LiveBasicBlock instr -> m (LiveBasicBlock instr))
+ -> LiveCmmDecl statics instr -> m (LiveCmmDecl statics instr)
+
+mapBlockTopM _ cmm@(CmmData{})
+ = return cmm
+
+mapBlockTopM f (CmmProc header label live sccs)
+ = do sccs' <- mapM (mapSCCM f) sccs
+ return $ CmmProc header label live sccs'
+
+mapSCCM :: Monad m => (a -> m b) -> SCC a -> m (SCC b)
+mapSCCM f (AcyclicSCC x)
+ = do x' <- f x
+ return $ AcyclicSCC x'
+
+mapSCCM f (CyclicSCC xs)
+ = do xs' <- mapM f xs
+ return $ CyclicSCC xs'
+
+
+-- map a function across all the basic blocks in this code
+mapGenBlockTop
+ :: (GenBasicBlock i -> GenBasicBlock i)
+ -> (GenCmmDecl d h (ListGraph i) -> GenCmmDecl d h (ListGraph i))
+
+mapGenBlockTop f cmm
+ = evalState (mapGenBlockTopM (\x -> return $ f x) cmm) ()
+
+
+-- | map a function across all the basic blocks in this code (monadic version)
+mapGenBlockTopM
+ :: Monad m
+ => (GenBasicBlock i -> m (GenBasicBlock i))
+ -> (GenCmmDecl d h (ListGraph i) -> m (GenCmmDecl d h (ListGraph i)))
+
+mapGenBlockTopM _ cmm@(CmmData{})
+ = return cmm
+
+mapGenBlockTopM f (CmmProc header label live (ListGraph blocks))
+ = do blocks' <- mapM f blocks
+ return $ CmmProc header label live (ListGraph blocks')
+
+
+-- | Slurp out the list of register conflicts and reg-reg moves from this top level thing.
+-- Slurping of conflicts and moves is wrapped up together so we don't have
+-- to make two passes over the same code when we want to build the graph.
+--
+slurpConflicts
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> (Bag (UniqSet Reg), Bag (Reg, Reg))
+
+slurpConflicts live
+ = slurpCmm (emptyBag, emptyBag) live
+
+ where slurpCmm rs CmmData{} = rs
+ slurpCmm rs (CmmProc info _ _ sccs)
+ = foldl' (slurpSCC info) rs sccs
+
+ slurpSCC info rs (AcyclicSCC b)
+ = slurpBlock info rs b
+
+ slurpSCC info rs (CyclicSCC bs)
+ = foldl' (slurpBlock info) rs bs
+
+ slurpBlock info rs (BasicBlock blockId instrs)
+ | LiveInfo _ _ blockLive _ <- info
+ , Just rsLiveEntry <- mapLookup blockId blockLive
+ , (conflicts, moves) <- slurpLIs rsLiveEntry rs instrs
+ = (consBag rsLiveEntry conflicts, moves)
+
+ | otherwise
+ = panic "Liveness.slurpConflicts: bad block"
+
+ slurpLIs rsLive (conflicts, moves) []
+ = (consBag rsLive conflicts, moves)
+
+ slurpLIs rsLive rs (LiveInstr _ Nothing : lis)
+ = slurpLIs rsLive rs lis
+
+ slurpLIs rsLiveEntry (conflicts, moves) (LiveInstr instr (Just live) : lis)
+ = let
+ -- regs that die because they are read for the last time at the start of an instruction
+ -- are not live across it.
+ rsLiveAcross = rsLiveEntry `minusUniqSet` (liveDieRead live)
+
+ -- regs live on entry to the next instruction.
+ -- be careful of orphans, make sure to delete dying regs _after_ unioning
+ -- in the ones that are born here.
+ rsLiveNext = (rsLiveAcross `unionUniqSets` (liveBorn live))
+ `minusUniqSet` (liveDieWrite live)
+
+ -- orphan vregs are the ones that die in the same instruction they are born in.
+ -- these are likely to be results that are never used, but we still
+ -- need to assign a hreg to them..
+ rsOrphans = intersectUniqSets
+ (liveBorn live)
+ (unionUniqSets (liveDieWrite live) (liveDieRead live))
+
+ --
+ rsConflicts = unionUniqSets rsLiveNext rsOrphans
+
+ in case takeRegRegMoveInstr instr of
+ Just rr -> slurpLIs rsLiveNext
+ ( consBag rsConflicts conflicts
+ , consBag rr moves) lis
+
+ Nothing -> slurpLIs rsLiveNext
+ ( consBag rsConflicts conflicts
+ , moves) lis
+
+
+-- | For spill\/reloads
+--
+-- SPILL v1, slot1
+-- ...
+-- RELOAD slot1, v2
+--
+-- If we can arrange that v1 and v2 are allocated to the same hreg it's more likely
+-- the spill\/reload instrs can be cleaned and replaced by a nop reg-reg move.
+--
+--
+slurpReloadCoalesce
+ :: forall statics instr. Instruction instr
+ => LiveCmmDecl statics instr
+ -> Bag (Reg, Reg)
+
+slurpReloadCoalesce live
+ = slurpCmm emptyBag live
+
+ where
+ slurpCmm :: Bag (Reg, Reg)
+ -> GenCmmDecl t t1 [SCC (LiveBasicBlock instr)]
+ -> Bag (Reg, Reg)
+ slurpCmm cs CmmData{} = cs
+ slurpCmm cs (CmmProc _ _ _ sccs)
+ = slurpComp cs (flattenSCCs sccs)
+
+ slurpComp :: Bag (Reg, Reg)
+ -> [LiveBasicBlock instr]
+ -> Bag (Reg, Reg)
+ slurpComp cs blocks
+ = let (moveBags, _) = runState (slurpCompM blocks) emptyUFM
+ in unionManyBags (cs : moveBags)
+
+ slurpCompM :: [LiveBasicBlock instr]
+ -> State (UniqFM [UniqFM Reg]) [Bag (Reg, Reg)]
+ slurpCompM blocks
+ = do -- run the analysis once to record the mapping across jumps.
+ mapM_ (slurpBlock False) blocks
+
+ -- run it a second time while using the information from the last pass.
+ -- We /could/ run this many more times to deal with graphical control
+ -- flow and propagating info across multiple jumps, but it's probably
+ -- not worth the trouble.
+ mapM (slurpBlock True) blocks
+
+ slurpBlock :: Bool -> LiveBasicBlock instr
+ -> State (UniqFM [UniqFM Reg]) (Bag (Reg, Reg))
+ slurpBlock propagate (BasicBlock blockId instrs)
+ = do -- grab the slot map for entry to this block
+ slotMap <- if propagate
+ then getSlotMap blockId
+ else return emptyUFM
+
+ (_, mMoves) <- mapAccumLM slurpLI slotMap instrs
+ return $ listToBag $ catMaybes mMoves
+
+ slurpLI :: UniqFM Reg -- current slotMap
+ -> LiveInstr instr
+ -> State (UniqFM [UniqFM Reg]) -- blockId -> [slot -> reg]
+ -- for tracking slotMaps across jumps
+
+ ( UniqFM Reg -- new slotMap
+ , Maybe (Reg, Reg)) -- maybe a new coalesce edge
+
+ slurpLI slotMap li
+
+ -- remember what reg was stored into the slot
+ | LiveInstr (SPILL reg slot) _ <- li
+ , slotMap' <- addToUFM slotMap slot reg
+ = return (slotMap', Nothing)
+
+ -- add an edge between this reg and the last one stored into the slot
+ | LiveInstr (RELOAD slot reg) _ <- li
+ = case lookupUFM slotMap slot of
+ Just reg2
+ | reg /= reg2 -> return (slotMap, Just (reg, reg2))
+ | otherwise -> return (slotMap, Nothing)
+
+ Nothing -> return (slotMap, Nothing)
+
+ -- if we hit a jump, remember the current slotMap
+ | LiveInstr (Instr instr) _ <- li
+ , targets <- jumpDestsOfInstr instr
+ , not $ null targets
+ = do mapM_ (accSlotMap slotMap) targets
+ return (slotMap, Nothing)
+
+ | otherwise
+ = return (slotMap, Nothing)
+
+ -- record a slotmap for an in edge to this block
+ accSlotMap slotMap blockId
+ = modify (\s -> addToUFM_C (++) s blockId [slotMap])
+
+ -- work out the slot map on entry to this block
+ -- if we have slot maps for multiple in-edges then we need to merge them.
+ getSlotMap blockId
+ = do map <- get
+ let slotMaps = fromMaybe [] (lookupUFM map blockId)
+ return $ foldr mergeSlotMaps emptyUFM slotMaps
+
+ mergeSlotMaps :: UniqFM Reg -> UniqFM Reg -> UniqFM Reg
+ mergeSlotMaps map1 map2
+ = listToUFM
+ $ [ (k, r1)
+ | (k, r1) <- nonDetUFMToList map1
+ -- This is non-deterministic but we do not
+ -- currently support deterministic code-generation.
+ -- See Note [Unique Determinism and code generation]
+ , case lookupUFM map2 k of
+ Nothing -> False
+ Just r2 -> r1 == r2 ]
+
+
+-- | Strip away liveness information, yielding NatCmmDecl
+stripLive
+ :: (Outputable statics, Outputable instr, Instruction instr)
+ => DynFlags
+ -> LiveCmmDecl statics instr
+ -> NatCmmDecl statics instr
+
+stripLive dflags live
+ = stripCmm live
+
+ where stripCmm :: (Outputable statics, Outputable instr, Instruction instr)
+ => LiveCmmDecl statics instr -> NatCmmDecl statics instr
+ stripCmm (CmmData sec ds) = CmmData sec ds
+ stripCmm (CmmProc (LiveInfo info (first_id:_) _ _) label live sccs)
+ = let final_blocks = flattenSCCs sccs
+
+ -- make sure the block that was first in the input list
+ -- stays at the front of the output. This is the entry point
+ -- of the proc, and it needs to come first.
+ ((first':_), rest')
+ = partition ((== first_id) . blockId) final_blocks
+
+ in CmmProc info label live
+ (ListGraph $ map (stripLiveBlock dflags) $ first' : rest')
+
+ -- If the proc has blocks but we don't know what the first one was, then we're dead.
+ stripCmm proc
+ = pprPanic "RegAlloc.Liveness.stripLive: no first_id on proc" (ppr proc)
+
+-- | Strip away liveness information from a basic block,
+-- and make real spill instructions out of SPILL, RELOAD pseudos along the way.
+
+stripLiveBlock
+ :: Instruction instr
+ => DynFlags
+ -> LiveBasicBlock instr
+ -> NatBasicBlock instr
+
+stripLiveBlock dflags (BasicBlock i lis)
+ = BasicBlock i instrs'
+
+ where (instrs', _)
+ = runState (spillNat [] lis) 0
+
+ spillNat acc []
+ = return (reverse acc)
+
+ spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
+ = do delta <- get
+ spillNat (mkSpillInstr dflags reg delta slot : acc) instrs
+
+ spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
+ = do delta <- get
+ spillNat (mkLoadInstr dflags reg delta slot : acc) instrs
+
+ spillNat acc (LiveInstr (Instr instr) _ : instrs)
+ | Just i <- takeDeltaInstr instr
+ = do put i
+ spillNat acc instrs
+
+ spillNat acc (LiveInstr (Instr instr) _ : instrs)
+ = spillNat (instr : acc) instrs
+
+
+-- | Erase Delta instructions.
+
+eraseDeltasLive
+ :: Instruction instr
+ => LiveCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+eraseDeltasLive cmm
+ = mapBlockTop eraseBlock cmm
+ where
+ eraseBlock (BasicBlock id lis)
+ = BasicBlock id
+ $ filter (\(LiveInstr i _) -> not $ isJust $ takeDeltaInstr i)
+ $ lis
+
+
+-- | Patch the registers in this code according to this register mapping.
+-- also erase reg -> reg moves when the reg is the same.
+-- also erase reg -> reg moves when the destination dies in this instr.
+patchEraseLive
+ :: Instruction instr
+ => (Reg -> Reg)
+ -> LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+
+patchEraseLive patchF cmm
+ = patchCmm cmm
+ where
+ patchCmm cmm@CmmData{} = cmm
+
+ patchCmm (CmmProc info label live sccs)
+ | LiveInfo static id blockMap mLiveSlots <- info
+ = let
+ patchRegSet set = mkUniqSet $ map patchF $ nonDetEltsUFM set
+ -- See Note [Unique Determinism and code generation]
+ blockMap' = mapMap (patchRegSet . getUniqSet) blockMap
+
+ info' = LiveInfo static id blockMap' mLiveSlots
+ in CmmProc info' label live $ map patchSCC sccs
+
+ patchSCC (AcyclicSCC b) = AcyclicSCC (patchBlock b)
+ patchSCC (CyclicSCC bs) = CyclicSCC (map patchBlock bs)
+
+ patchBlock (BasicBlock id lis)
+ = BasicBlock id $ patchInstrs lis
+
+ patchInstrs [] = []
+ patchInstrs (li : lis)
+
+ | LiveInstr i (Just live) <- li'
+ , Just (r1, r2) <- takeRegRegMoveInstr i
+ , eatMe r1 r2 live
+ = patchInstrs lis
+
+ | otherwise
+ = li' : patchInstrs lis
+
+ where li' = patchRegsLiveInstr patchF li
+
+ eatMe r1 r2 live
+ -- source and destination regs are the same
+ | r1 == r2 = True
+
+ -- destination reg is never used
+ | elementOfUniqSet r2 (liveBorn live)
+ , elementOfUniqSet r2 (liveDieRead live) || elementOfUniqSet r2 (liveDieWrite live)
+ = True
+
+ | otherwise = False
+
+
+-- | Patch registers in this LiveInstr, including the liveness information.
+--
+patchRegsLiveInstr
+ :: Instruction instr
+ => (Reg -> Reg)
+ -> LiveInstr instr -> LiveInstr instr
+
+patchRegsLiveInstr patchF li
+ = case li of
+ LiveInstr instr Nothing
+ -> LiveInstr (patchRegsOfInstr instr patchF) Nothing
+
+ LiveInstr instr (Just live)
+ -> LiveInstr
+ (patchRegsOfInstr instr patchF)
+ (Just live
+ { -- WARNING: have to go via lists here because patchF changes the uniq in the Reg
+ liveBorn = mapUniqSet patchF $ liveBorn live
+ , liveDieRead = mapUniqSet patchF $ liveDieRead live
+ , liveDieWrite = mapUniqSet patchF $ liveDieWrite live })
+ -- See Note [Unique Determinism and code generation]
+
+
+--------------------------------------------------------------------------------
+-- | Convert a NatCmmDecl to a LiveCmmDecl, with liveness information
+
+cmmTopLiveness
+ :: (Outputable instr, Instruction instr)
+ => Maybe CFG -> Platform
+ -> NatCmmDecl statics instr
+ -> UniqSM (LiveCmmDecl statics instr)
+cmmTopLiveness cfg platform cmm
+ = regLiveness platform $ natCmmTopToLive cfg cmm
+
+natCmmTopToLive
+ :: (Instruction instr, Outputable instr)
+ => Maybe CFG -> NatCmmDecl statics instr
+ -> LiveCmmDecl statics instr
+
+natCmmTopToLive _ (CmmData i d)
+ = CmmData i d
+
+natCmmTopToLive _ (CmmProc info lbl live (ListGraph []))
+ = CmmProc (LiveInfo info [] mapEmpty mapEmpty) lbl live []
+
+natCmmTopToLive mCfg proc@(CmmProc info lbl live (ListGraph blocks@(first : _)))
+ = CmmProc (LiveInfo info' (first_id : entry_ids) mapEmpty mapEmpty)
+ lbl live sccsLive
+ where
+ first_id = blockId first
+ all_entry_ids = entryBlocks proc
+ sccs = sccBlocks blocks all_entry_ids mCfg
+ sccsLive = map (fmap (\(BasicBlock l instrs) ->
+ BasicBlock l (map (\i -> LiveInstr (Instr i) Nothing) instrs)))
+ $ sccs
+
+ entry_ids = filter (reachable_node) .
+ filter (/= first_id) $ all_entry_ids
+ info' = mapFilterWithKey (\node _ -> reachable_node node) info
+ reachable_node
+ | Just cfg <- mCfg
+ = hasNode cfg
+ | otherwise
+ = const True
+
+--
+-- Compute the liveness graph of the set of basic blocks. Important:
+-- we also discard any unreachable code here, starting from the entry
+-- points (the first block in the list, and any blocks with info
+-- tables). Unreachable code arises when code blocks are orphaned in
+-- earlier optimisation passes, and may confuse the register allocator
+-- by referring to registers that are not initialised. It's easy to
+-- discard the unreachable code as part of the SCC pass, so that's
+-- exactly what we do. (#7574)
+--
+sccBlocks
+ :: forall instr . Instruction instr
+ => [NatBasicBlock instr]
+ -> [BlockId]
+ -> Maybe CFG
+ -> [SCC (NatBasicBlock instr)]
+
+sccBlocks blocks entries mcfg = map (fmap node_payload) sccs
+ where
+ nodes :: [ Node BlockId (NatBasicBlock instr) ]
+ nodes = [ DigraphNode block id (getOutEdges instrs)
+ | block@(BasicBlock id instrs) <- blocks ]
+
+ g1 = graphFromEdgedVerticesUniq nodes
+
+ reachable :: LabelSet
+ reachable
+ | Just cfg <- mcfg
+ -- Our CFG only contains reachable nodes by construction at this point.
+ = setFromList $ getCfgNodes cfg
+ | otherwise
+ = setFromList $ [ node_key node | node <- reachablesG g1 roots ]
+
+ g2 = graphFromEdgedVerticesUniq [ node | node <- nodes
+ , node_key node
+ `setMember` reachable ]
+
+ sccs = stronglyConnCompG g2
+
+ getOutEdges :: Instruction instr => [instr] -> [BlockId]
+ getOutEdges instrs = concatMap jumpDestsOfInstr instrs
+
+ -- This is truly ugly, but I don't see a good alternative.
+ -- Digraph just has the wrong API. We want to identify nodes
+ -- by their keys (BlockId), but Digraph requires the whole
+ -- node: (NatBasicBlock, BlockId, [BlockId]). This takes
+ -- advantage of the fact that Digraph only looks at the key,
+ -- even though it asks for the whole triple.
+ roots = [DigraphNode (panic "sccBlocks") b (panic "sccBlocks")
+ | b <- entries ]
+
+--------------------------------------------------------------------------------
+-- Annotate code with register liveness information
+--
+
+regLiveness
+ :: (Outputable instr, Instruction instr)
+ => Platform
+ -> LiveCmmDecl statics instr
+ -> UniqSM (LiveCmmDecl statics instr)
+
+regLiveness _ (CmmData i d)
+ = return $ CmmData i d
+
+regLiveness _ (CmmProc info lbl live [])
+ | LiveInfo static mFirst _ _ <- info
+ = return $ CmmProc
+ (LiveInfo static mFirst mapEmpty mapEmpty)
+ lbl live []
+
+regLiveness platform (CmmProc info lbl live sccs)
+ | LiveInfo static mFirst _ liveSlotsOnEntry <- info
+ = let (ann_sccs, block_live) = computeLiveness platform sccs
+
+ in return $ CmmProc (LiveInfo static mFirst block_live liveSlotsOnEntry)
+ lbl live ann_sccs
+
+
+-- -----------------------------------------------------------------------------
+-- | Check ordering of Blocks
+-- The computeLiveness function requires SCCs to be in reverse
+-- dependent order. If they're not the liveness information will be
+-- wrong, and we'll get a bad allocation. Better to check for this
+-- precondition explicitly or some other poor sucker will waste a
+-- day staring at bad assembly code..
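+--
+-- For example (a sketch): the ordering [A, B] passes the check only if A
+-- jumps to nothing but A itself, while B may jump to A or B; that is,
+-- control only flows from later blocks in the list to earlier ones.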
+--
+checkIsReverseDependent
+ :: Instruction instr
+ => [SCC (LiveBasicBlock instr)] -- ^ SCCs of blocks that we're about to run the liveness determinator on.
+ -> Maybe BlockId -- ^ The first BlockId that fails the test (if any)
+
+checkIsReverseDependent sccs'
+ = go emptyUniqSet sccs'
+
+ where go _ []
+ = Nothing
+
+ go blocksSeen (AcyclicSCC block : sccs)
+ = let dests = slurpJumpDestsOfBlock block
+ blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet [blockId block]
+ badDests = dests `minusUniqSet` blocksSeen'
+ in case nonDetEltsUniqSet badDests of
+ -- See Note [Unique Determinism and code generation]
+ [] -> go blocksSeen' sccs
+ bad : _ -> Just bad
+
+ go blocksSeen (CyclicSCC blocks : sccs)
+ = let dests = unionManyUniqSets $ map slurpJumpDestsOfBlock blocks
+ blocksSeen' = unionUniqSets blocksSeen $ mkUniqSet $ map blockId blocks
+ badDests = dests `minusUniqSet` blocksSeen'
+ in case nonDetEltsUniqSet badDests of
+ -- See Note [Unique Determinism and code generation]
+ [] -> go blocksSeen' sccs
+ bad : _ -> Just bad
+
+ slurpJumpDestsOfBlock (BasicBlock _ instrs)
+ = unionManyUniqSets
+ $ map (mkUniqSet . jumpDestsOfInstr)
+ [ i | LiveInstr i _ <- instrs]
+
+
+-- | If we've already computed liveness info for this code we have to reverse
+-- the SCCs in each top to get them back into the right order so we can do it again.
+reverseBlocksInTops :: LiveCmmDecl statics instr -> LiveCmmDecl statics instr
+reverseBlocksInTops top
+ = case top of
+ CmmData{} -> top
+ CmmProc info lbl live sccs -> CmmProc info lbl live (reverse sccs)
+
+
+-- | Computing liveness
+--
+-- On entry, the SCCs must be in "reverse" order: later blocks may transfer
+-- control to earlier ones only, else `panic`.
+--
+-- The SCCs returned are in the *opposite* order, which is exactly what we
+-- want for the next pass.
+--
+computeLiveness
+ :: (Outputable instr, Instruction instr)
+ => Platform
+ -> [SCC (LiveBasicBlock instr)]
+ -> ([SCC (LiveBasicBlock instr)], -- instructions annotated with list of registers
+ -- which are "dead after this instruction".
+ BlockMap RegSet) -- blocks annotated with set of live registers
+ -- on entry to the block.
+
+computeLiveness platform sccs
+ = case checkIsReverseDependent sccs of
+ Nothing -> livenessSCCs platform mapEmpty [] sccs
+ Just bad -> pprPanic "RegAlloc.Liveness.computeLiveness"
+ (vcat [ text "SCCs aren't in reverse dependent order"
+ , text "bad blockId" <+> ppr bad
+ , ppr sccs])
+
+livenessSCCs
+ :: Instruction instr
+ => Platform
+ -> BlockMap RegSet
+ -> [SCC (LiveBasicBlock instr)] -- accum
+ -> [SCC (LiveBasicBlock instr)]
+ -> ( [SCC (LiveBasicBlock instr)]
+ , BlockMap RegSet)
+
+livenessSCCs _ blockmap done []
+ = (done, blockmap)
+
+livenessSCCs platform blockmap done (AcyclicSCC block : sccs)
+ = let (blockmap', block') = livenessBlock platform blockmap block
+ in livenessSCCs platform blockmap' (AcyclicSCC block' : done) sccs
+
+livenessSCCs platform blockmap done
+ (CyclicSCC blocks : sccs) =
+ livenessSCCs platform blockmap' (CyclicSCC blocks':done) sccs
+ where (blockmap', blocks')
+ = iterateUntilUnchanged linearLiveness equalBlockMaps
+ blockmap blocks
+
+ iterateUntilUnchanged
+ :: (a -> b -> (a,c)) -> (a -> a -> Bool)
+ -> a -> b
+ -> (a,c)
+
+ iterateUntilUnchanged f eq a b
+ = head $
+ concatMap tail $
+ groupBy (\(a1, _) (a2, _) -> eq a1 a2) $
+ iterate (\(a, _) -> f a b) $
+ (a, panic "RegLiveness.livenessSCCs")
+
+
+ linearLiveness
+ :: Instruction instr
+ => BlockMap RegSet -> [LiveBasicBlock instr]
+ -> (BlockMap RegSet, [LiveBasicBlock instr])
+
+ linearLiveness = mapAccumL (livenessBlock platform)
+
+ -- probably the least efficient way to compare two
+ -- BlockMaps for equality.
+ equalBlockMaps a b
+ = a' == b'
+ where a' = map f $ mapToList a
+ b' = map f $ mapToList b
+ f (key,elt) = (key, nonDetEltsUniqSet elt)
+ -- See Note [Unique Determinism and code generation]
+
+
+
+-- | Annotate a basic block with register liveness information.
+--
+livenessBlock
+ :: Instruction instr
+ => Platform
+ -> BlockMap RegSet
+ -> LiveBasicBlock instr
+ -> (BlockMap RegSet, LiveBasicBlock instr)
+
+livenessBlock platform blockmap (BasicBlock block_id instrs)
+ = let
+ (regsLiveOnEntry, instrs1)
+ = livenessBack platform emptyUniqSet blockmap [] (reverse instrs)
+ blockmap' = mapInsert block_id regsLiveOnEntry blockmap
+
+ instrs2 = livenessForward platform regsLiveOnEntry instrs1
+
+ output = BasicBlock block_id instrs2
+
+ in ( blockmap', output)
+
+-- | Calculate liveness going forwards,
+-- filling in when regs are born
+
+livenessForward
+ :: Instruction instr
+ => Platform
+ -> RegSet -- regs live on this instr
+ -> [LiveInstr instr] -> [LiveInstr instr]
+
+livenessForward _ _ [] = []
+livenessForward platform rsLiveEntry (li@(LiveInstr instr mLive) : lis)
+ | Just live <- mLive
+ = let
+ RU _ written = regUsageOfInstr platform instr
+ -- Regs that are written to but weren't live on entry to this instruction
+ -- are recorded as being born here.
+ rsBorn = mkUniqSet
+ $ filter (\r -> not $ elementOfUniqSet r rsLiveEntry) written
+
+ rsLiveNext = (rsLiveEntry `unionUniqSets` rsBorn)
+ `minusUniqSet` (liveDieRead live)
+ `minusUniqSet` (liveDieWrite live)
+
+ in LiveInstr instr (Just live { liveBorn = rsBorn })
+ : livenessForward platform rsLiveNext lis
+
+ | otherwise
+ = li : livenessForward platform rsLiveEntry lis
+
+
+-- | Calculate liveness going backwards,
+-- filling in when regs die, and what regs are live across each instruction
+
+livenessBack
+ :: Instruction instr
+ => Platform
+ -> RegSet -- regs live on this instr
+ -> BlockMap RegSet -- regs live on entry to other BBs
+ -> [LiveInstr instr] -- instructions (accum)
+ -> [LiveInstr instr] -- instructions
+ -> (RegSet, [LiveInstr instr])
+
+livenessBack _ liveregs _ done [] = (liveregs, done)
+
+livenessBack platform liveregs blockmap acc (instr : instrs)
+ = let (liveregs', instr') = liveness1 platform liveregs blockmap instr
+ in livenessBack platform liveregs' blockmap (instr' : acc) instrs
+
+
+-- don't bother tagging comments or deltas with liveness
+liveness1
+ :: Instruction instr
+ => Platform
+ -> RegSet
+ -> BlockMap RegSet
+ -> LiveInstr instr
+ -> (RegSet, LiveInstr instr)
+
+liveness1 _ liveregs _ (LiveInstr instr _)
+ | isMetaInstr instr
+ = (liveregs, LiveInstr instr Nothing)
+
+liveness1 platform liveregs blockmap (LiveInstr instr _)
+
+ | not_a_branch
+ = (liveregs1, LiveInstr instr
+ (Just $ Liveness
+ { liveBorn = emptyUniqSet
+ , liveDieRead = mkUniqSet r_dying
+ , liveDieWrite = mkUniqSet w_dying }))
+
+ | otherwise
+ = (liveregs_br, LiveInstr instr
+ (Just $ Liveness
+ { liveBorn = emptyUniqSet
+ , liveDieRead = mkUniqSet r_dying_br
+ , liveDieWrite = mkUniqSet w_dying }))
+
+ where
+ !(RU read written) = regUsageOfInstr platform instr
+
+ -- registers that were written here are dead going backwards.
+ -- registers that were read here are live going backwards.
+ liveregs1 = (liveregs `delListFromUniqSet` written)
+ `addListToUniqSet` read
+
+ -- registers that are not live beyond this point, are recorded
+ -- as dying here.
+ r_dying = [ reg | reg <- read, reg `notElem` written,
+ not (elementOfUniqSet reg liveregs) ]
+
+ w_dying = [ reg | reg <- written,
+ not (elementOfUniqSet reg liveregs) ]
+
+ -- union in the live regs from all the jump destinations of this
+ -- instruction.
+ targets = jumpDestsOfInstr instr -- where we go from here
+ not_a_branch = null targets
+
+ targetLiveRegs target
+ = case mapLookup target blockmap of
+ Just ra -> ra
+ Nothing -> emptyRegSet
+
+ live_from_branch = unionManyUniqSets (map targetLiveRegs targets)
+
+ liveregs_br = liveregs1 `unionUniqSets` live_from_branch
+
+ -- registers that are live only in the branch targets should
+ -- be listed as dying here.
+ live_branch_only = live_from_branch `minusUniqSet` liveregs
+ r_dying_br = nonDetEltsUniqSet (mkUniqSet r_dying `unionUniqSets`
+ live_branch_only)
+ -- See Note [Unique Determinism and code generation]
diff --git a/compiler/GHC/CmmToAsm/Reg/Target.hs b/compiler/GHC/CmmToAsm/Reg/Target.hs
new file mode 100644
index 0000000000..a45d70c826
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/Reg/Target.hs
@@ -0,0 +1,135 @@
+{-# LANGUAGE CPP #-}
+-- | Hard wired things related to registers.
+-- This module prevents the native code generator from being able to
+-- emit code for non-host architectures.
+--
+-- TODO: Do a better job of the overloading, and eliminate this module.
+-- We'd probably do better with a Register type class, and hook this to
+-- Instruction somehow.
+--
+-- TODO: We should also make arch specific versions of RegAlloc.Graph.TrivColorable
+module GHC.CmmToAsm.Reg.Target (
+ targetVirtualRegSqueeze,
+ targetRealRegSqueeze,
+ targetClassOfRealReg,
+ targetMkVirtualReg,
+ targetRegDotColor,
+ targetClassOfReg
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import Outputable
+import Unique
+import GHC.Platform
+
+import qualified GHC.CmmToAsm.X86.Regs as X86
+import qualified GHC.CmmToAsm.X86.RegInfo as X86
+import qualified GHC.CmmToAsm.PPC.Regs as PPC
+import qualified GHC.CmmToAsm.SPARC.Regs as SPARC
+
+targetVirtualRegSqueeze :: Platform -> RegClass -> VirtualReg -> Int
+targetVirtualRegSqueeze platform
+ = case platformArch platform of
+ ArchX86 -> X86.virtualRegSqueeze
+ ArchX86_64 -> X86.virtualRegSqueeze
+ ArchPPC -> PPC.virtualRegSqueeze
+ ArchS390X -> panic "targetVirtualRegSqueeze ArchS390X"
+ ArchSPARC -> SPARC.virtualRegSqueeze
+ ArchSPARC64 -> panic "targetVirtualRegSqueeze ArchSPARC64"
+ ArchPPC_64 _ -> PPC.virtualRegSqueeze
+ ArchARM _ _ _ -> panic "targetVirtualRegSqueeze ArchARM"
+ ArchARM64 -> panic "targetVirtualRegSqueeze ArchARM64"
+ ArchAlpha -> panic "targetVirtualRegSqueeze ArchAlpha"
+ ArchMipseb -> panic "targetVirtualRegSqueeze ArchMipseb"
+ ArchMipsel -> panic "targetVirtualRegSqueeze ArchMipsel"
+ ArchJavaScript-> panic "targetVirtualRegSqueeze ArchJavaScript"
+ ArchUnknown -> panic "targetVirtualRegSqueeze ArchUnknown"
+
+
+targetRealRegSqueeze :: Platform -> RegClass -> RealReg -> Int
+targetRealRegSqueeze platform
+ = case platformArch platform of
+ ArchX86 -> X86.realRegSqueeze
+ ArchX86_64 -> X86.realRegSqueeze
+ ArchPPC -> PPC.realRegSqueeze
+ ArchS390X -> panic "targetRealRegSqueeze ArchS390X"
+ ArchSPARC -> SPARC.realRegSqueeze
+ ArchSPARC64 -> panic "targetRealRegSqueeze ArchSPARC64"
+ ArchPPC_64 _ -> PPC.realRegSqueeze
+ ArchARM _ _ _ -> panic "targetRealRegSqueeze ArchARM"
+ ArchARM64 -> panic "targetRealRegSqueeze ArchARM64"
+ ArchAlpha -> panic "targetRealRegSqueeze ArchAlpha"
+ ArchMipseb -> panic "targetRealRegSqueeze ArchMipseb"
+ ArchMipsel -> panic "targetRealRegSqueeze ArchMipsel"
+ ArchJavaScript-> panic "targetRealRegSqueeze ArchJavaScript"
+ ArchUnknown -> panic "targetRealRegSqueeze ArchUnknown"
+
+targetClassOfRealReg :: Platform -> RealReg -> RegClass
+targetClassOfRealReg platform
+ = case platformArch platform of
+ ArchX86 -> X86.classOfRealReg platform
+ ArchX86_64 -> X86.classOfRealReg platform
+ ArchPPC -> PPC.classOfRealReg
+ ArchS390X -> panic "targetClassOfRealReg ArchS390X"
+ ArchSPARC -> SPARC.classOfRealReg
+ ArchSPARC64 -> panic "targetClassOfRealReg ArchSPARC64"
+ ArchPPC_64 _ -> PPC.classOfRealReg
+ ArchARM _ _ _ -> panic "targetClassOfRealReg ArchARM"
+ ArchARM64 -> panic "targetClassOfRealReg ArchARM64"
+ ArchAlpha -> panic "targetClassOfRealReg ArchAlpha"
+ ArchMipseb -> panic "targetClassOfRealReg ArchMipseb"
+ ArchMipsel -> panic "targetClassOfRealReg ArchMipsel"
+ ArchJavaScript-> panic "targetClassOfRealReg ArchJavaScript"
+ ArchUnknown -> panic "targetClassOfRealReg ArchUnknown"
+
+targetMkVirtualReg :: Platform -> Unique -> Format -> VirtualReg
+targetMkVirtualReg platform
+ = case platformArch platform of
+ ArchX86 -> X86.mkVirtualReg
+ ArchX86_64 -> X86.mkVirtualReg
+ ArchPPC -> PPC.mkVirtualReg
+ ArchS390X -> panic "targetMkVirtualReg ArchS390X"
+ ArchSPARC -> SPARC.mkVirtualReg
+ ArchSPARC64 -> panic "targetMkVirtualReg ArchSPARC64"
+ ArchPPC_64 _ -> PPC.mkVirtualReg
+ ArchARM _ _ _ -> panic "targetMkVirtualReg ArchARM"
+ ArchARM64 -> panic "targetMkVirtualReg ArchARM64"
+ ArchAlpha -> panic "targetMkVirtualReg ArchAlpha"
+ ArchMipseb -> panic "targetMkVirtualReg ArchMipseb"
+ ArchMipsel -> panic "targetMkVirtualReg ArchMipsel"
+ ArchJavaScript-> panic "targetMkVirtualReg ArchJavaScript"
+ ArchUnknown -> panic "targetMkVirtualReg ArchUnknown"
+
+targetRegDotColor :: Platform -> RealReg -> SDoc
+targetRegDotColor platform
+ = case platformArch platform of
+ ArchX86 -> X86.regDotColor platform
+ ArchX86_64 -> X86.regDotColor platform
+ ArchPPC -> PPC.regDotColor
+ ArchS390X -> panic "targetRegDotColor ArchS390X"
+ ArchSPARC -> SPARC.regDotColor
+ ArchSPARC64 -> panic "targetRegDotColor ArchSPARC64"
+ ArchPPC_64 _ -> PPC.regDotColor
+ ArchARM _ _ _ -> panic "targetRegDotColor ArchARM"
+ ArchARM64 -> panic "targetRegDotColor ArchARM64"
+ ArchAlpha -> panic "targetRegDotColor ArchAlpha"
+ ArchMipseb -> panic "targetRegDotColor ArchMipseb"
+ ArchMipsel -> panic "targetRegDotColor ArchMipsel"
+ ArchJavaScript-> panic "targetRegDotColor ArchJavaScript"
+ ArchUnknown -> panic "targetRegDotColor ArchUnknown"
+
+
+targetClassOfReg :: Platform -> Reg -> RegClass
+targetClassOfReg platform reg
+ = case reg of
+ RegVirtual vr -> classOfVirtualReg vr
+ RegReal rr -> targetClassOfRealReg platform rr
diff --git a/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs
new file mode 100644
index 0000000000..6cc660bba9
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/AddrMode.hs
@@ -0,0 +1,44 @@
+
+module GHC.CmmToAsm.SPARC.AddrMode (
+ AddrMode(..),
+ addrOffset
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Base
+import GHC.Platform.Reg
+
+-- addressing modes ------------------------------------------------------------
+
+-- | Represents a memory address in an instruction.
+-- Being a RISC machine, SPARC has very regular addressing modes.
+--
+data AddrMode
+ = AddrRegReg Reg Reg -- addr = r1 + r2
+ | AddrRegImm Reg Imm -- addr = r1 + imm
+
+
+-- | Add an integer offset to the address in an AddrMode.
+--
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ AddrRegImm r (ImmInt n)
+ | fits13Bits n2 -> Just (AddrRegImm r (ImmInt n2))
+ | otherwise -> Nothing
+ where n2 = n + off
+
+ AddrRegImm r (ImmInteger n)
+ | fits13Bits n2 -> Just (AddrRegImm r (ImmInt (fromInteger n2)))
+ | otherwise -> Nothing
+ where n2 = n + toInteger off
+
+ AddrRegReg r (RegReal (RealRegSingle 0))
+ | fits13Bits off -> Just (AddrRegImm r (ImmInt off))
+ | otherwise -> Nothing
+
+ _ -> Nothing
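+
+-- A worked example (not part of the original source): for a register plus a
+-- small immediate we can simply fold the offset in,
+--      addrOffset (AddrRegImm r (ImmInt 100)) 8 == Just (AddrRegImm r (ImmInt 108))
+-- whereas for a register-plus-register mode there is nowhere to put the extra
+-- offset, so the result is Nothing, unless the second register is %g0 (always
+-- zero), in which case we can switch to an immediate mode.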
diff --git a/compiler/GHC/CmmToAsm/SPARC/Base.hs b/compiler/GHC/CmmToAsm/SPARC/Base.hs
new file mode 100644
index 0000000000..86a897dacb
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Base.hs
@@ -0,0 +1,77 @@
+
+-- | Bits and pieces at the bottom of the module dependency tree.
+-- We also import the required constants here, so we know what we're using.
+--
+-- In the interests of cross-compilation, we want to free ourselves
+-- from the autoconf-generated modules like main/Constants.
+
+module GHC.CmmToAsm.SPARC.Base (
+ wordLength,
+ wordLengthInBits,
+ spillAreaLength,
+ spillSlotSize,
+ extraStackArgsHere,
+ fits13Bits,
+ is32BitInteger,
+ largeOffsetError
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Driver.Session
+import Panic
+
+import Data.Int
+
+
+-- On 32 bit SPARC, pointers are 32 bits.
+wordLength :: Int
+wordLength = 4
+
+wordLengthInBits :: Int
+wordLengthInBits
+ = wordLength * 8
+
+-- Size of the available spill area
+spillAreaLength :: DynFlags -> Int
+spillAreaLength
+ = rESERVED_C_STACK_BYTES
+
+-- | We need 8 bytes because our largest registers are 64 bit.
+spillSlotSize :: Int
+spillSlotSize = 8
+
+
+-- | We (allegedly) put the first six C-call arguments in registers;
+-- where do we start putting the rest of them?
+extraStackArgsHere :: Int
+extraStackArgsHere = 23
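+
+-- (23 words * 4 bytes per word = byte offset 92; that is, overflow arguments
+-- start at [%sp+92], matching the calling convention described in CodeGen.hs.)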
+
+
+{-# SPECIALIZE fits13Bits :: Int -> Bool, Integer -> Bool #-}
+-- | Check whether an offset is representable with 13 bits.
+fits13Bits :: Integral a => a -> Bool
+fits13Bits x = x >= -4096 && x < 4096
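+-- For example: fits13Bits 4095 and fits13Bits (-4096) are True, while
+-- fits13Bits 4096 and fits13Bits (-4097) are False.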
+
+-- | Check whether an integer will fit in 32 bits.
+-- A CmmInt is intended to be truncated to the appropriate
+-- number of bits, so here we truncate it to Int64. This is
+-- important because e.g. -1 as a CmmInt might be either
+-- -1 or 18446744073709551615.
+--
+is32BitInteger :: Integer -> Bool
+is32BitInteger i
+ = i64 <= 0x7fffffff && i64 >= -0x80000000
+ where i64 = fromIntegral i :: Int64
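+-- For example, is32BitInteger 0xffffffffffffffff is True (the Int64 truncation
+-- gives -1), while is32BitInteger 0x80000000 is False (2^31 is just outside
+-- the signed 32-bit range).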
+
+
+-- | Sadness.
+largeOffsetError :: (Show a) => a -> b
+largeOffsetError i
+ = panic ("ERROR: SPARC native-code generator cannot handle large offset ("
+ ++ show i ++ ");\nprobably because of large constant data structures;" ++
+ "\nworkaround: use -fllvm on this module.\n")
+
+
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs
new file mode 100644
index 0000000000..2580ea4014
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen.hs
@@ -0,0 +1,700 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2013
+--
+-----------------------------------------------------------------------------
+
+{-# LANGUAGE GADTs #-}
+module GHC.CmmToAsm.SPARC.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.SPARC.CodeGen.Sanity
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.CodeGen.CondCode
+import GHC.CmmToAsm.SPARC.CodeGen.Gen64
+import GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Monad ( NatM, getNewRegNat, getNewLabelNat )
+
+-- Our intermediate code:
+import GHC.Cmm.BlockId
+import GHC.Cmm
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Graph
+import GHC.CmmToAsm.PIC
+import GHC.Platform.Reg
+import GHC.Cmm.CLabel
+import GHC.CmmToAsm.CPrim
+
+-- The rest:
+import BasicTypes
+import GHC.Driver.Session
+import FastString
+import OrdList
+import Outputable
+import GHC.Platform
+
+import Control.Monad ( mapAndUnzipM )
+
+-- | Top level code generation
+cmmTopCodeGen :: RawCmmDecl
+ -> NatM [NatCmmDecl RawCmmStatics Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph)
+ = do let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ let tops = proc : concat statics
+
+ return tops
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec dat] -- no translation, we just use CmmStatic
+
+
+-- | Do code generation on a single block of CMM code.
+-- Code generation may introduce new basic block boundaries, which
+-- are indicated by the NEWBLOCK instruction. We must split up the
+-- instruction stream into basic blocks again. We also extract
+-- LDATAs here.
+basicBlockCodeGen :: CmmBlock
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl RawCmmStatics Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ mid_instrs <- stmtsToInstrs stmts
+ tail_instrs <- stmtToInstrs tail
+ let instrs = mid_instrs `appOL` tail_instrs
+ let
+ (top,other_blocks,statics)
+ = foldrOL mkBlocks ([],[],[]) instrs
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+
+ -- do intra-block sanity checking
+ blocksChecked
+ = map (checkBlock block)
+ $ BasicBlock id top : other_blocks
+
+ return (blocksChecked, statics)
+
+
+-- | Convert some Cmm statements to SPARC instructions.
+stmtsToInstrs :: [CmmNode e x] -> NatM InstrBlock
+stmtsToInstrs stmts
+ = do instrss <- mapM stmtToInstrs stmts
+ return (concatOL instrss)
+
+
+stmtToInstrs :: CmmNode e x -> NatM InstrBlock
+stmtToInstrs stmt = do
+ dflags <- getDynFlags
+ case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+ CmmUnwind {} -> return nilOL
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall target result_regs args
+
+ CmmBranch id -> genBranch id
+ CmmCondBranch arg true false _ -> do
+ b1 <- genCondJump true arg
+ b2 <- genBranch false
+ return (b1 `appOL` b2)
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg } -> genJump arg
+
+ _
+ -> panic "stmtToInstrs: statement should have been cps'd away"
+
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_IntCode pk addr src = do
+ (srcReg, code) <- getSomeReg src
+ Amode dstAddr addr_code <- getAmode addr
+ return $ code `appOL` addr_code `snocOL` ST pk srcReg dstAddr
+
+
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode _ reg src = do
+ dflags <- getDynFlags
+ r <- getRegister src
+ let dst = getRegisterReg (targetPlatform dflags) reg
+ return $ case r of
+ Any _ code -> code dst
+ Fixed _ freg fcode -> fcode `snocOL` OR False g0 (RIReg freg) dst
+
+
+
+-- Floating point assignment to memory
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_FltCode pk addr src = do
+ dflags <- getDynFlags
+ Amode dst__2 code1 <- getAmode addr
+ (src__2, code2) <- getSomeReg src
+ tmp1 <- getNewRegNat pk
+ let
+ pk__2 = cmmExprType dflags src
+ code__2 = code1 `appOL` code2 `appOL`
+ if formatToWidth pk == typeWidth pk__2
+ then unitOL (ST pk src__2 dst__2)
+ else toOL [ FxTOy (cmmTypeFormat pk__2) pk src__2 tmp1
+ , ST pk tmp1 dst__2]
+ return code__2
+
+-- Floating point assignment to a register/temporary
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode pk dstCmmReg srcCmmExpr = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ srcRegister <- getRegister srcCmmExpr
+ let dstReg = getRegisterReg platform dstCmmReg
+
+ return $ case srcRegister of
+ Any _ code -> code dstReg
+ Fixed _ srcFixedReg srcCode -> srcCode `snocOL` FMOV pk srcFixedReg dstReg
+
+
+
+
+genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
+
+genJump (CmmLit (CmmLabel lbl))
+ = return (toOL [CALL (Left target) 0 True, NOP])
+ where
+ target = ImmCLbl lbl
+
+genJump tree
+ = do
+ (target, code) <- getSomeReg tree
+ return (code `snocOL` JMP (AddrRegReg target g0) `snocOL` NOP)
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+
+genBranch :: BlockId -> NatM InstrBlock
+genBranch = return . toOL . mkJumpInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+
+SPARC: First, we have to ensure that the condition codes are set
+according to the supplied comparison operation. We generate slightly
+different code for floating point comparisons, because a floating
+point operation cannot directly precede a @BF@. We assume the worst
+and fill that slot with a @NOP@.
+
+SPARC: Do not fill the delay slots here; you will confuse the register
+allocator.
+-}
+
+
+genCondJump
+ :: BlockId -- the branch target
+ -> CmmExpr -- the condition on which to branch
+ -> NatM InstrBlock
+
+
+
+genCondJump bid bool = do
+ CondCode is_float cond code <- getCondCode bool
+ return (
+ code `appOL`
+ toOL (
+ if is_float
+ then [NOP, BF cond False bid, NOP]
+ else [BI cond False bid, NOP]
+ )
+ )
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+genSwitch dflags expr targets
+ | positionIndependent dflags
+ = error "MachCodeGen: sparc genSwitch PIC not finished\n"
+
+ | otherwise
+ = do (e_reg, e_code) <- getSomeReg (cmmOffset dflags expr offset)
+
+ base_reg <- getNewRegNat II32
+ offset_reg <- getNewRegNat II32
+ dst <- getNewRegNat II32
+
+ label <- getNewLabelNat
+
+ return $ e_code `appOL`
+ toOL
+ [ -- load base of jump table
+ SETHI (HI (ImmCLbl label)) base_reg
+ , OR False base_reg (RIImm $ LO $ ImmCLbl label) base_reg
+
+ -- the addrs in the table are 32 bits wide..
+ , SLL e_reg (RIImm $ ImmInt 2) offset_reg
+
+ -- load and jump to the destination
+ , LD II32 (AddrRegReg base_reg offset_reg) dst
+ , JMP_TBL (AddrRegImm dst (ImmInt 0)) ids label
+ , NOP ]
+ where (offset, ids) = switchTargetsToTable targets
+
+generateJumpTableForInstr :: DynFlags -> Instr
+ -> Maybe (NatCmmDecl RawCmmStatics Instr)
+generateJumpTableForInstr dflags (JMP_TBL _ ids label) =
+ let jumpTable = map (jumpTableEntry dflags) ids
+ in Just (CmmData (Section ReadOnlyData label) (RawCmmStatics label jumpTable))
+generateJumpTableForInstr _ _ = Nothing
+
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+{-
+ Now the biggest nightmare---calls. Most of the nastiness is buried in
+ @get_arg@, which moves the arguments to the correct registers/stack
+ locations. Apart from that, the code is easy.
+
+ The SPARC calling convention is an absolute
+ nightmare. The first 6x32 bits of arguments are mapped into
+ %o0 through %o5, and the remaining arguments are dumped to the
+ stack, beginning at [%sp+92]. (Note that %o6 == %sp.)
+
+ If we have to put args on the stack, move %o6==%sp down by
+ the number of words to go on the stack, to ensure there's enough space.
+
+ According to Fraser and Hanson's lcc book, page 478, fig 17.2,
+ 16 words above the stack pointer is a word for the address of
+ a structure return value. I use this as a temporary location
+ for moving values from float to int regs. Certainly it isn't
+ safe to put anything in the 16 words starting at %sp, since
+ this area can get trashed at any time due to window overflows
+ caused by signal handlers.
+
+ A final complication (if the above isn't enough) is that
+ we can't blithely calculate the arguments one by one into
+ %o0 .. %o5. Consider the following nested calls:
+
+ fff a (fff b c)
+
+ Naive code moves a into %o0, and (fff b c) into %o1. Unfortunately
+ the inner call will itself use %o0, which trashes the value put there
+ in preparation for the outer call. Upshot: we need to calculate the
+ args into temporary regs, and move those to arg regs or onto the
+ stack only immediately prior to the call proper. Sigh.
+-}
+
+genCCall
+ :: ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+
+
+
+-- On SPARC under TSO (Total Store Ordering), writes earlier in the instruction
+-- stream are guaranteed to become visible before later writes (unlike on PowerPC).
+-- Ref: Section 8.4 of the SPARC V9 Architecture manual.
+--
+-- In the SPARC case we don't need a barrier.
+--
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ nilOL
+genCCall (PrimTarget MO_WriteBarrier) _ _
+ = return $ nilOL
+
+genCCall (PrimTarget (MO_Prefetch_Data _)) _ _
+ = return $ nilOL
+
+genCCall target dest_regs args
+ = do -- work out the arguments, and assign them to integer regs
+ argcode_and_vregs <- mapM arg_to_int_vregs args
+ let (argcodes, vregss) = unzip argcode_and_vregs
+ let vregs = concat vregss
+
+ let n_argRegs = length allArgRegs
+ let n_argRegs_used = min (length vregs) n_argRegs
+
+
+ -- deal with static vs dynamic call targets
+ callinsns <- case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) _ ->
+ return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False))
+
+ ForeignTarget expr _
+ -> do (dyn_c, dyn_rs) <- arg_to_int_vregs expr
+ let dyn_r = case dyn_rs of
+ [dyn_r'] -> dyn_r'
+ _ -> panic "SPARC.CodeGen.genCCall: arg_to_int"
+ return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False)
+
+ PrimTarget mop
+ -> do res <- outOfLineMachOp mop
+ lblOrMopExpr <- case res of
+ Left lbl -> do
+ return (unitOL (CALL (Left (litToImm (CmmLabel lbl))) n_argRegs_used False))
+
+ Right mopExpr -> do
+ (dyn_c, dyn_rs) <- arg_to_int_vregs mopExpr
+ let dyn_r = case dyn_rs of
+ [dyn_r'] -> dyn_r'
+ _ -> panic "SPARC.CodeGen.genCCall: arg_to_int"
+ return (dyn_c `snocOL` CALL (Right dyn_r) n_argRegs_used False)
+
+ return lblOrMopExpr
+
+ let argcode = concatOL argcodes
+
+ let (move_sp_down, move_sp_up)
+ = let diff = length vregs - n_argRegs
+ nn = if odd diff then diff + 1 else diff -- keep 8-byte alignment
+ in if nn <= 0
+ then (nilOL, nilOL)
+ else (unitOL (moveSp (-1*nn)), unitOL (moveSp (1*nn)))
+
+ let transfer_code
+ = toOL (move_final vregs allArgRegs extraStackArgsHere)
+
+ dflags <- getDynFlags
+ return
+ $ argcode `appOL`
+ move_sp_down `appOL`
+ transfer_code `appOL`
+ callinsns `appOL`
+ unitOL NOP `appOL`
+ move_sp_up `appOL`
+ assign_code (targetPlatform dflags) dest_regs
+
+
+-- | Generate code to calculate an argument, and move it into one
+-- or two integer vregs.
+arg_to_int_vregs :: CmmExpr -> NatM (OrdList Instr, [Reg])
+arg_to_int_vregs arg = do dflags <- getDynFlags
+ arg_to_int_vregs' dflags arg
+
+arg_to_int_vregs' :: DynFlags -> CmmExpr -> NatM (OrdList Instr, [Reg])
+arg_to_int_vregs' dflags arg
+
+ -- If the expr produces a 64 bit int, then we can just use iselExpr64
+ | isWord64 (cmmExprType dflags arg)
+ = do (ChildCode64 code r_lo) <- iselExpr64 arg
+ let r_hi = getHiVRegFromLo r_lo
+ return (code, [r_hi, r_lo])
+
+ | otherwise
+ = do (src, code) <- getSomeReg arg
+ let pk = cmmExprType dflags arg
+
+ case cmmTypeFormat pk of
+
+ -- Load a 64 bit float return value into two integer regs.
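+                 -- (Each 32-bit half is bounced through the scratch word at
+                 -- spRel 16, the struct-return slot 16 words above %sp noted
+                 -- in the calling-convention comment above, because SPARC has
+                 -- no direct FP-to-integer register move.)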
+ FF64 -> do
+ v1 <- getNewRegNat II32
+ v2 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ FMOV FF64 src f0 `snocOL`
+ ST FF32 f0 (spRel 16) `snocOL`
+ LD II32 (spRel 16) v1 `snocOL`
+ ST FF32 f1 (spRel 16) `snocOL`
+ LD II32 (spRel 16) v2
+
+ return (code2, [v1,v2])
+
+ -- Load a 32 bit float return value into an integer reg
+ FF32 -> do
+ v1 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ ST FF32 src (spRel 16) `snocOL`
+ LD II32 (spRel 16) v1
+
+ return (code2, [v1])
+
+ -- Move an integer return value into its destination reg.
+ _ -> do
+ v1 <- getNewRegNat II32
+
+ let code2 =
+ code `snocOL`
+ OR False g0 (RIReg src) v1
+
+ return (code2, [v1])
+
+
+-- | Move args from the integer vregs into which they have been
+-- marshalled, into %o0 .. %o5, and the rest onto the stack.
+--
+move_final :: [Reg] -> [Reg] -> Int -> [Instr]
+
+-- all args done
+move_final [] _ _
+ = []
+
+-- out of aregs; move to stack
+move_final (v:vs) [] offset
+ = ST II32 v (spRel offset)
+ : move_final vs [] (offset+1)
+
+-- move into an arg (%o[0..5]) reg
+move_final (v:vs) (a:az) offset
+ = OR False g0 (RIReg v) a
+ : move_final vs az offset
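+
+-- A sketch (not in the original source), assuming allArgRegs is %o0..%o5 and
+-- spRel counts in 4-byte words: with eight argument vregs v1..v8,
+--      move_final [v1..v8] allArgRegs extraStackArgsHere
+-- copies v1..v6 into %o0..%o5 and stores v7 and v8 at spRel 23 and spRel 24,
+-- i.e. [%sp+92] and [%sp+96].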
+
+
+-- | Assign results returned from the call into their
+-- destination regs.
+--
+assign_code :: Platform -> [LocalReg] -> OrdList Instr
+
+assign_code _ [] = nilOL
+
+assign_code platform [dest]
+ = let rep = localRegType dest
+ width = typeWidth rep
+ r_dest = getRegisterReg platform (CmmLocal dest)
+
+ result
+ | isFloatType rep
+ , W32 <- width
+ = unitOL $ FMOV FF32 (regSingle $ fReg 0) r_dest
+
+ | isFloatType rep
+ , W64 <- width
+ = unitOL $ FMOV FF64 (regSingle $ fReg 0) r_dest
+
+ | not $ isFloatType rep
+ , W32 <- width
+ = unitOL $ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest
+
+ | not $ isFloatType rep
+ , W64 <- width
+ , r_dest_hi <- getHiVRegFromLo r_dest
+ = toOL [ mkRegRegMoveInstr platform (regSingle $ oReg 0) r_dest_hi
+ , mkRegRegMoveInstr platform (regSingle $ oReg 1) r_dest]
+
+ | otherwise
+ = panic "SPARC.CodeGen.GenCCall: no match"
+
+ in result
+
+assign_code _ _
+ = panic "SPARC.CodeGen.GenCCall: no match"
+
+
+
+-- | Generate a call to implement an out-of-line floating point operation
+outOfLineMachOp
+ :: CallishMachOp
+ -> NatM (Either CLabel CmmExpr)
+
+outOfLineMachOp mop
+ = do let functionName
+ = outOfLineMachOp_table mop
+
+ dflags <- getDynFlags
+ mopExpr <- cmmMakeDynamicReference dflags CallReference
+ $ mkForeignLabel functionName Nothing ForeignLabelInExternalPackage IsFunction
+
+ let mopLabelOrExpr
+ = case mopExpr of
+ CmmLit (CmmLabel lbl) -> Left lbl
+ _ -> Right mopExpr
+
+ return mopLabelOrExpr
+
+
+-- | Decide what C function to use to implement a CallishMachOp
+--
+outOfLineMachOp_table
+ :: CallishMachOp
+ -> FastString
+
+outOfLineMachOp_table mop
+ = case mop of
+ MO_F32_Exp -> fsLit "expf"
+ MO_F32_ExpM1 -> fsLit "expm1f"
+ MO_F32_Log -> fsLit "logf"
+ MO_F32_Log1P -> fsLit "log1pf"
+ MO_F32_Sqrt -> fsLit "sqrtf"
+ MO_F32_Fabs -> unsupported
+ MO_F32_Pwr -> fsLit "powf"
+
+ MO_F32_Sin -> fsLit "sinf"
+ MO_F32_Cos -> fsLit "cosf"
+ MO_F32_Tan -> fsLit "tanf"
+
+ MO_F32_Asin -> fsLit "asinf"
+ MO_F32_Acos -> fsLit "acosf"
+ MO_F32_Atan -> fsLit "atanf"
+
+ MO_F32_Sinh -> fsLit "sinhf"
+ MO_F32_Cosh -> fsLit "coshf"
+ MO_F32_Tanh -> fsLit "tanhf"
+
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
+ MO_F64_Exp -> fsLit "exp"
+ MO_F64_ExpM1 -> fsLit "expm1"
+ MO_F64_Log -> fsLit "log"
+ MO_F64_Log1P -> fsLit "log1p"
+ MO_F64_Sqrt -> fsLit "sqrt"
+ MO_F64_Fabs -> unsupported
+ MO_F64_Pwr -> fsLit "pow"
+
+ MO_F64_Sin -> fsLit "sin"
+ MO_F64_Cos -> fsLit "cos"
+ MO_F64_Tan -> fsLit "tan"
+
+ MO_F64_Asin -> fsLit "asin"
+ MO_F64_Acos -> fsLit "acos"
+ MO_F64_Atan -> fsLit "atan"
+
+ MO_F64_Sinh -> fsLit "sinh"
+ MO_F64_Cosh -> fsLit "cosh"
+ MO_F64_Tanh -> fsLit "tanh"
+
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
+ MO_UF_Conv w -> fsLit $ word2FloatLabel w
+
+ MO_Memcpy _ -> fsLit "memcpy"
+ MO_Memset _ -> fsLit "memset"
+ MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
+
+ MO_BSwap w -> fsLit $ bSwapLabel w
+ MO_BRev w -> fsLit $ bRevLabel w
+ MO_PopCnt w -> fsLit $ popCntLabel w
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz w -> fsLit $ ctzLabel w
+ MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
+ MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
+ MO_AtomicRead w -> fsLit $ atomicReadLabel w
+ MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ (MO_Prefetch_Data _) -> unsupported
+ where unsupported = panic ("outOfLineCmmOp: " ++ show mop
+ ++ " not supported here")
+
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs
new file mode 100644
index 0000000000..d6c9d7b360
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Amode.hs
@@ -0,0 +1,74 @@
+module GHC.CmmToAsm.SPARC.CodeGen.Amode (
+ getAmode
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+
+import OrdList
+
+
+-- | Generate code to reference a memory address.
+getAmode
+ :: CmmExpr -- ^ expr producing an address
+ -> NatM Amode
+
+getAmode tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getAmode (mangleIndexTree dflags tree)
+
+getAmode (CmmMachOp (MO_Sub _) [x, CmmLit (CmmInt i _)])
+ | fits13Bits (-i)
+ = do
+ (reg, code) <- getSomeReg x
+ let
+ off = ImmInt (-(fromInteger i))
+ return (Amode (AddrRegImm reg off) code)
+
+
+getAmode (CmmMachOp (MO_Add _) [x, CmmLit (CmmInt i _)])
+ | fits13Bits i
+ = do
+ (reg, code) <- getSomeReg x
+ let
+ off = ImmInt (fromInteger i)
+ return (Amode (AddrRegImm reg off) code)
+
+getAmode (CmmMachOp (MO_Add _) [x, y])
+ = do
+ (regX, codeX) <- getSomeReg x
+ (regY, codeY) <- getSomeReg y
+ let
+ code = codeX `appOL` codeY
+ return (Amode (AddrRegReg regX regY) code)
+
+getAmode (CmmLit lit)
+ = do
+ let imm__2 = litToImm lit
+ tmp1 <- getNewRegNat II32
+ tmp2 <- getNewRegNat II32
+
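+       -- SETHI materialises the high 22 bits of the constant and the OR fills
+       -- in the low 10 bits; SPARC has no single 32-bit load-immediate instruction.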
+ let code = toOL [ SETHI (HI imm__2) tmp1
+ , OR False tmp1 (RIImm (LO imm__2)) tmp2]
+
+ return (Amode (AddrRegReg tmp2 g0) code)
+
+getAmode other
+ = do
+ (reg, code) <- getSomeReg other
+ let
+ off = ImmInt 0
+ return (Amode (AddrRegImm reg off) code)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs
new file mode 100644
index 0000000000..cf249303e4
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Base.hs
@@ -0,0 +1,119 @@
+module GHC.CmmToAsm.SPARC.CodeGen.Base (
+ InstrBlock,
+ CondCode(..),
+ ChildCode64(..),
+ Amode(..),
+
+ Register(..),
+ setFormatOfRegister,
+
+ getRegisterReg,
+ mangleIndexTree
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Platform.Regs
+import GHC.Driver.Session
+import GHC.Cmm
+import GHC.Cmm.Ppr.Expr () -- For Outputable instances
+import GHC.Platform
+
+import Outputable
+import OrdList
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
+
+
+-- | Condition codes passed up the tree.
+--
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+
+-- | a.k.a "Register64"
+-- Reg is the lower 32-bit temporary which contains the result.
+-- Use getHiVRegFromLo to find the other VRegUnique.
+--
+--      The rule of this simplified insn selection game is therefore that
+--      the returned Reg may be modified.
+--
+data ChildCode64
+ = ChildCode64
+ InstrBlock
+ Reg
+
+
+-- | Holds code that references a memory address.
+data Amode
+ = Amode
+ -- the AddrMode we can use in the instruction
+ -- that does the real load\/store.
+ AddrMode
+
+ -- other setup code we have to run first before we can use the
+ -- above AddrMode.
+ InstrBlock
+
+
+
+--------------------------------------------------------------------------------
+-- | Code to produce a result into a register.
+-- If the result must go in a specific register, it comes out as Fixed.
+-- Otherwise, the parent can decide which register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
+
+
+-- | Change the format field in a Register.
+setFormatOfRegister
+ :: Register -> Format -> Register
+
+setFormatOfRegister reg format
+ = case reg of
+ Fixed _ reg code -> Fixed format reg code
+ Any _ codefn -> Any format codefn
+
+
+--------------------------------------------------------------------------------
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal reg
+ Nothing -> pprPanic
+ "SPARC.CodeGen.Base.getRegisterReg: global is in memory"
+ (ppr $ CmmGlobal mid)
+
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
+mangleIndexTree :: DynFlags -> CmmExpr -> CmmExpr
+
+mangleIndexTree dflags (CmmRegOff reg off)
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+mangleIndexTree _ _
+ = panic "SPARC.CodeGen.Base.mangleIndexTree: no match"
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs
new file mode 100644
index 0000000000..e501d799f2
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/CondCode.hs
@@ -0,0 +1,110 @@
+module GHC.CmmToAsm.SPARC.CodeGen.CondCode (
+ getCondCode,
+ condIntCode,
+ condFltCode
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm
+
+import OrdList
+import Outputable
+
+
+getCondCode :: CmmExpr -> NatM CondCode
+getCondCode (CmmMachOp mop [x, y])
+ =
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ MO_F_Lt W32 -> condFltCode LTT x y
+ MO_F_Le W32 -> condFltCode LE x y
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode LTT x y
+ MO_F_Le W64 -> condFltCode LE x y
+
+ MO_Eq _ -> condIntCode EQQ x y
+ MO_Ne _ -> condIntCode NE x y
+
+ MO_S_Gt _ -> condIntCode GTT x y
+ MO_S_Ge _ -> condIntCode GE x y
+ MO_S_Lt _ -> condIntCode LTT x y
+ MO_S_Le _ -> condIntCode LE x y
+
+ MO_U_Gt _ -> condIntCode GU x y
+ MO_U_Ge _ -> condIntCode GEU x y
+ MO_U_Lt _ -> condIntCode LU x y
+ MO_U_Le _ -> condIntCode LEU x y
+
+ _ -> pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr (CmmMachOp mop [x,y]))
+
+getCondCode other = pprPanic "SPARC.CodeGen.CondCode.getCondCode" (ppr other)
+
+
+
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond x (CmmLit (CmmInt y _))
+ | fits13Bits y
+ = do
+ (src1, code) <- getSomeReg x
+ let
+ src2 = ImmInt (fromInteger y)
+ code' = code `snocOL` SUB False True src1 (RIImm src2) g0
+ return (CondCode False cond code')
+
+condIntCode cond x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 = code1 `appOL` code2 `snocOL`
+ SUB False True src1 (RIReg src2) g0
+ return (CondCode False cond code__2)
+
+
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condFltCode cond x y = do
+ dflags <- getDynFlags
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ tmp <- getNewRegNat FF64
+ let
+ promote x = FxTOy FF32 FF64 x tmp
+
+ pk1 = cmmExprType dflags x
+ pk2 = cmmExprType dflags y
+
+ code__2 =
+ if pk1 `cmmEqType` pk2 then
+ code1 `appOL` code2 `snocOL`
+ FCMP True (cmmTypeFormat pk1) src1 src2
+ else if typeWidth pk1 == W32 then
+ code1 `snocOL` promote src1 `appOL` code2 `snocOL`
+ FCMP True FF64 tmp src2
+ else
+ code1 `appOL` code2 `snocOL` promote src2 `snocOL`
+ FCMP True FF64 src1 tmp
+ return (CondCode True cond code__2)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs
new file mode 100644
index 0000000000..77732cf70c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Expand.hs
@@ -0,0 +1,156 @@
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-- | Expand out synthetic instructions into single machine instrs.
+module GHC.CmmToAsm.SPARC.CodeGen.Expand (
+ expandTop
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.Cmm
+
+
+import Outputable
+import OrdList
+
+-- | Expand out synthetic instructions in this top level thing
+expandTop :: NatCmmDecl RawCmmStatics Instr -> NatCmmDecl RawCmmStatics Instr
+expandTop top@(CmmData{})
+ = top
+
+expandTop (CmmProc info lbl live (ListGraph blocks))
+ = CmmProc info lbl live (ListGraph $ map expandBlock blocks)
+
+
+-- | Expand out synthetic instructions in this block
+expandBlock :: NatBasicBlock Instr -> NatBasicBlock Instr
+
+expandBlock (BasicBlock label instrs)
+ = let instrs_ol = expandBlockInstrs instrs
+ instrs' = fromOL instrs_ol
+ in BasicBlock label instrs'
+
+
+-- | Expand out some instructions
+expandBlockInstrs :: [Instr] -> OrdList Instr
+expandBlockInstrs [] = nilOL
+
+expandBlockInstrs (ii:is)
+ = let ii_doubleRegs = remapRegPair ii
+ is_misaligned = expandMisalignedDoubles ii_doubleRegs
+
+ in is_misaligned `appOL` expandBlockInstrs is
+
+
+
+-- | In the SPARC instruction set the FP register pairs that are used
+-- to hold 64 bit floats are referred to by just the first reg
+-- of the pair. Remap our internal reg pairs to the appropriate reg.
+--
+-- For example:
+-- ldd [%l1], (%f0 | %f1)
+--
+-- gets mapped to
+--          ldd [%l1], %f0
+--
+remapRegPair :: Instr -> Instr
+remapRegPair instr
+ = let patchF reg
+ = case reg of
+ RegReal (RealRegSingle _)
+ -> reg
+
+ RegReal (RealRegPair r1 r2)
+
+ -- sanity checking
+ | r1 >= 32
+ , r1 <= 63
+ , r1 `mod` 2 == 0
+ , r2 == r1 + 1
+ -> RegReal (RealRegSingle r1)
+
+ | otherwise
+ -> pprPanic "SPARC.CodeGen.Expand: not remapping dodgy looking reg pair " (ppr reg)
+
+ RegVirtual _
+ -> pprPanic "SPARC.CodeGen.Expand: not remapping virtual reg " (ppr reg)
+
+ in patchRegsOfInstr instr patchF
+
+
+
+
+-- Expand out 64 bit load/stores into individual instructions to handle
+-- possible double alignment problems.
+--
+-- TODO: It'd be better to use a scratch reg instead of the add/sub thing.
+-- We might be able to do this faster if we use the UA2007 instr set
+-- instead of restricting ourselves to SPARC V9.
+--
+expandMisalignedDoubles :: Instr -> OrdList Instr
+expandMisalignedDoubles instr
+
+ -- Translate to:
+ -- add g1,g2,g1
+ -- ld [g1],%fn
+ -- ld [g1+4],%f(n+1)
+ -- sub g1,g2,g1 -- to restore g1
+ | LD FF64 (AddrRegReg r1 r2) fReg <- instr
+ = toOL [ ADD False False r1 (RIReg r2) r1
+ , LD FF32 (AddrRegReg r1 g0) fReg
+ , LD FF32 (AddrRegImm r1 (ImmInt 4)) (fRegHi fReg)
+ , SUB False False r1 (RIReg r2) r1 ]
+
+ -- Translate to
+ -- ld [addr],%fn
+ -- ld [addr+4],%f(n+1)
+ | LD FF64 addr fReg <- instr
+ = let Just addr' = addrOffset addr 4
+ in toOL [ LD FF32 addr fReg
+ , LD FF32 addr' (fRegHi fReg) ]
+
+ -- Translate to:
+ -- add g1,g2,g1
+ -- st %fn,[g1]
+ -- st %f(n+1),[g1+4]
+ -- sub g1,g2,g1 -- to restore g1
+ | ST FF64 fReg (AddrRegReg r1 r2) <- instr
+ = toOL [ ADD False False r1 (RIReg r2) r1
+ , ST FF32 fReg (AddrRegReg r1 g0)
+ , ST FF32 (fRegHi fReg) (AddrRegImm r1 (ImmInt 4))
+ , SUB False False r1 (RIReg r2) r1 ]
+
+ -- Translate to
+        --      st  %fn,[addr]
+        --      st  %f(n+1),[addr+4]
+ | ST FF64 fReg addr <- instr
+ = let Just addr' = addrOffset addr 4
+ in toOL [ ST FF32 fReg addr
+ , ST FF32 (fRegHi fReg) addr' ]
+
+ -- some other instr
+ | otherwise
+ = unitOL instr
+
+
+
+-- | The high partner for this float reg.
+fRegHi :: Reg -> Reg
+fRegHi (RegReal (RealRegSingle r1))
+ | r1 >= 32
+ , r1 <= 63
+ , r1 `mod` 2 == 0
+ = (RegReal $ RealRegSingle (r1 + 1))
+
+-- Can't take high partner for non-low reg.
+fRegHi reg
+ = pprPanic "SPARC.CodeGen.Expand: can't take fRegHi from " (ppr reg)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs
new file mode 100644
index 0000000000..ee67bd4a9d
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs
@@ -0,0 +1,692 @@
+-- | Evaluation of 32 bit values.
+module GHC.CmmToAsm.SPARC.CodeGen.Gen32 (
+ getSomeReg,
+ getRegister
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.CodeGen.CondCode
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.CodeGen.Gen64
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+import Control.Monad (liftM)
+import GHC.Driver.Session
+import OrdList
+import Outputable
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+
+
+-- | Make code to evaluate a 32 bit expression.
+--
+getRegister :: CmmExpr -> NatM Register
+
+getRegister (CmmReg reg)
+ = do dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (Fixed (cmmTypeFormat (cmmRegType dflags reg))
+ (getRegisterReg platform reg) nilOL)
+
+getRegister tree@(CmmRegOff _ _)
+ = do dflags <- getDynFlags
+ getRegister (mangleIndexTree dflags tree)
+
+getRegister (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister (CmmMachOp (MO_UU_Conv W64 W32) [x]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+
+-- Load a literal float into a float register.
+-- The actual literal is stored in a new data area, and we load it
+-- at runtime.
+getRegister (CmmLit (CmmFloat f W32)) = do
+
+ -- a label for the new data area
+ lbl <- getNewLabelNat
+ tmp <- getNewRegNat II32
+
+ let code dst = toOL [
+ -- the data area
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmFloat f W32)],
+
+ -- load the literal
+ SETHI (HI (ImmCLbl lbl)) tmp,
+ LD II32 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst]
+
+ return (Any FF32 code)
+
+getRegister (CmmLit (CmmFloat d W64)) = do
+ lbl <- getNewLabelNat
+ tmp <- getNewRegNat II32
+ let code dst = toOL [
+ LDATA (Section ReadOnlyData lbl) $ RawCmmStatics lbl
+ [CmmStaticLit (CmmFloat d W64)],
+ SETHI (HI (ImmCLbl lbl)) tmp,
+ LD II64 (AddrRegImm tmp (LO (ImmCLbl lbl))) dst]
+ return (Any FF64 code)
+
+
+-- Unary machine ops
+getRegister (CmmMachOp mop [x])
+ = case mop of
+ -- Floating point negation -------------------------
+ MO_F_Neg W32 -> trivialUFCode FF32 (FNEG FF32) x
+ MO_F_Neg W64 -> trivialUFCode FF64 (FNEG FF64) x
+
+
+ -- Integer negation --------------------------------
+ MO_S_Neg rep -> trivialUCode (intFormat rep) (SUB False False g0) x
+ MO_Not rep -> trivialUCode (intFormat rep) (XNOR False g0) x
+
+
+ -- Float word size conversion ----------------------
+ MO_FF_Conv W64 W32 -> coerceDbl2Flt x
+ MO_FF_Conv W32 W64 -> coerceFlt2Dbl x
+
+
+ -- Float <-> Signed Int conversion -----------------
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+
+ -- Unsigned integer word size conversions ----------
+
+ -- If it's the same size, then nothing needs to be done.
+ MO_UU_Conv from to
+ | from == to -> conversionNop (intFormat to) x
+
+ -- To narrow an unsigned word, mask out the high bits to simulate what would
+ -- happen if we copied the value into a smaller register.
+ MO_UU_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_UU_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+
+          -- For narrowing 32 bit to 16 bit, don't use a literal mask value as in the
+          -- W16->W8 case, because the only way we can load it is via SETHI, which needs 2 ops.
+ -- Do some shifts to chop out the high bits instead.
+ MO_UU_Conv W32 W16
+ -> do tmpReg <- getNewRegNat II32
+ (xReg, xCode) <- getSomeReg x
+ let code dst
+ = xCode
+ `appOL` toOL
+ [ SLL xReg (RIImm $ ImmInt 16) tmpReg
+ , SRL tmpReg (RIImm $ ImmInt 16) dst]
+
+ return $ Any II32 code
+
+ -- trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16))
+
+ -- To widen an unsigned word we don't have to do anything.
+ -- Just leave it in the same register and mark the result as the new size.
+ MO_UU_Conv W8 W16 -> conversionNop (intFormat W16) x
+ MO_UU_Conv W8 W32 -> conversionNop (intFormat W32) x
+ MO_UU_Conv W16 W32 -> conversionNop (intFormat W32) x
+
+
+ -- Signed integer word size conversions ------------
+
+ -- Mask out high bits when narrowing them
+ MO_SS_Conv W16 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_SS_Conv W32 W8 -> trivialCode W8 (AND False) x (CmmLit (CmmInt 255 W8))
+ MO_SS_Conv W32 W16 -> trivialCode W16 (AND False) x (CmmLit (CmmInt 65535 W16))
+
+ -- Sign extend signed words when widening them.
+ MO_SS_Conv W8 W16 -> integerExtend W8 W16 x
+ MO_SS_Conv W8 W32 -> integerExtend W8 W32 x
+ MO_SS_Conv W16 W32 -> integerExtend W16 W32 x
+
+ _ -> panic ("Unknown unary mach op: " ++ show mop)
+
+
+-- Binary machine ops
+getRegister (CmmMachOp mop [x, y])
+ = case mop of
+ MO_Eq _ -> condIntReg EQQ x y
+ MO_Ne _ -> condIntReg NE x y
+
+ MO_S_Gt _ -> condIntReg GTT x y
+ MO_S_Ge _ -> condIntReg GE x y
+ MO_S_Lt _ -> condIntReg LTT x y
+ MO_S_Le _ -> condIntReg LE x y
+
+ MO_U_Gt W32 -> condIntReg GU x y
+ MO_U_Ge W32 -> condIntReg GEU x y
+ MO_U_Lt W32 -> condIntReg LU x y
+ MO_U_Le W32 -> condIntReg LEU x y
+
+ MO_U_Gt W16 -> condIntReg GU x y
+ MO_U_Ge W16 -> condIntReg GEU x y
+ MO_U_Lt W16 -> condIntReg LU x y
+ MO_U_Le W16 -> condIntReg LEU x y
+
+ MO_Add W32 -> trivialCode W32 (ADD False False) x y
+ MO_Sub W32 -> trivialCode W32 (SUB False False) x y
+
+ MO_S_MulMayOflo rep -> imulMayOflo rep x y
+
+ MO_S_Quot W32 -> idiv True False x y
+ MO_U_Quot W32 -> idiv False False x y
+
+ MO_S_Rem W32 -> irem True x y
+ MO_U_Rem W32 -> irem False x y
+
+ MO_F_Eq _ -> condFltReg EQQ x y
+ MO_F_Ne _ -> condFltReg NE x y
+
+ MO_F_Gt _ -> condFltReg GTT x y
+ MO_F_Ge _ -> condFltReg GE x y
+ MO_F_Lt _ -> condFltReg LTT x y
+ MO_F_Le _ -> condFltReg LE x y
+
+ MO_F_Add w -> trivialFCode w FADD x y
+ MO_F_Sub w -> trivialFCode w FSUB x y
+ MO_F_Mul w -> trivialFCode w FMUL x y
+ MO_F_Quot w -> trivialFCode w FDIV x y
+
+ MO_And rep -> trivialCode rep (AND False) x y
+ MO_Or rep -> trivialCode rep (OR False) x y
+ MO_Xor rep -> trivialCode rep (XOR False) x y
+
+ MO_Mul rep -> trivialCode rep (SMUL False) x y
+
+ MO_Shl rep -> trivialCode rep SLL x y
+ MO_U_Shr rep -> trivialCode rep SRL x y
+ MO_S_Shr rep -> trivialCode rep SRA x y
+
+ _ -> pprPanic "getRegister(sparc) - binary CmmMachOp (1)" (pprMachOp mop)
+
+getRegister (CmmLoad mem pk) = do
+ Amode src code <- getAmode mem
+ let
+ code__2 dst = code `snocOL` LD (cmmTypeFormat pk) src dst
+ return (Any (cmmTypeFormat pk) code__2)
+
+getRegister (CmmLit (CmmInt i _))
+ | fits13Bits i
+ = let
+ src = ImmInt (fromInteger i)
+ code dst = unitOL (OR False g0 (RIImm src) dst)
+ in
+ return (Any II32 code)
+
+getRegister (CmmLit lit)
+ = let imm = litToImm lit
+ code dst = toOL [
+ SETHI (HI imm) dst,
+ OR False dst (RIImm (LO imm)) dst]
+ in return (Any II32 code)
+
+
+getRegister _
+ = panic "SPARC.CodeGen.Gen32.getRegister: no match"
+
+
+-- | sign extend and widen
+integerExtend
+ :: Width -- ^ width of source expression
+ -> Width -- ^ width of result
+ -> CmmExpr -- ^ source expression
+ -> NatM Register
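+-- For example, sign-extending an 8-bit value to 32 bits shifts left by 24 and
+-- then arithmetic-shifts right by 24: 0x000000ff becomes 0xff000000 and then
+-- 0xffffffff (i.e. -1), while 0x0000007f round-trips back to 0x0000007f.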
+
+integerExtend from to expr
+ = do -- load the expr into some register
+ (reg, e_code) <- getSomeReg expr
+ tmp <- getNewRegNat II32
+ let bitCount
+ = case (from, to) of
+ (W8, W32) -> 24
+ (W16, W32) -> 16
+ (W8, W16) -> 24
+ _ -> panic "SPARC.CodeGen.Gen32: no match"
+ let code dst
+ = e_code
+
+ -- local shift word left to load the sign bit
+ `snocOL` SLL reg (RIImm (ImmInt bitCount)) tmp
+
+ -- arithmetic shift right to sign extend
+ `snocOL` SRA tmp (RIImm (ImmInt bitCount)) dst
+
+ return (Any (intFormat to) code)
+
+
+-- | For no-op word format conversions we set the resulting value to have the
+-- required format, but don't need to generate any actual code.
+--
+conversionNop
+ :: Format -> CmmExpr -> NatM Register
+
+conversionNop new_rep expr
+ = do e_code <- getRegister expr
+ return (setFormatOfRegister e_code new_rep)
+
+
+
+-- | Generate an integer division instruction.
+idiv :: Bool -> Bool -> CmmExpr -> CmmExpr -> NatM Register
+
+-- For unsigned division with a 32 bit numerator,
+-- we can just clear the Y register.
+idiv False cc x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ WRY g0 g0
+ , UDIV cc a_reg (RIReg b_reg) dst]
+
+ return (Any II32 code)
+
+
+-- For _signed_ division with a 32 bit numerator,
+-- we have to sign extend the numerator into the Y register.
+idiv True cc x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 16)) tmp -- sign extend
+ , SRA tmp (RIImm (ImmInt 16)) tmp
+
+ , WRY tmp g0
+ , SDIV cc a_reg (RIReg b_reg) dst]
+
+ return (Any II32 code)
+
+
+-- | Do an integer remainder.
+--
+-- NOTE: The SPARC v8 architecture manual says that integer division
+-- instructions _may_ generate a remainder, depending on the implementation.
+-- If so, it is _recommended_ that the remainder is placed in the Y register.
+--
+-- The UltraSPARC 2007 manual says Y is _undefined_ after division.
+--
+-- The SPARC T2 doesn't store the remainder, not sure about the others.
+-- It's probably best not to worry about it, and just generate our own
+-- remainders.
+--
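+-- In other words, we synthesise the remainder ourselves as
+--      rem = a - (a `quot` b) * b
+-- using a divide, a multiply and a subtract.
+--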
+irem :: Bool -> CmmExpr -> CmmExpr -> NatM Register
+
+-- For unsigned operands:
+-- Division is between a 64 bit numerator and a 32 bit denominator,
+-- so we still have to clear the Y register.
+irem False x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp_reg <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ WRY g0 g0
+ , UDIV False a_reg (RIReg b_reg) tmp_reg
+ , UMUL False tmp_reg (RIReg b_reg) tmp_reg
+ , SUB False False a_reg (RIReg tmp_reg) dst]
+
+ return (Any II32 code)
+
+
+
+-- For signed operands:
+-- Make sure to sign extend into the Y register, or the remainder
+-- will have the wrong sign when the numerator is negative.
+--
+-- TODO: When sign extending, GCC only shifts the a_reg right by 17 bits,
+--       not the full 32. Not sure why this is; perhaps something to do with overflow?
+-- If anyone cares enough about the speed of signed remainder they
+-- can work it out themselves (then tell me). -- BL 2009/01/20
+irem True x y
+ = do
+ (a_reg, a_code) <- getSomeReg x
+ (b_reg, b_code) <- getSomeReg y
+
+ tmp1_reg <- getNewRegNat II32
+ tmp2_reg <- getNewRegNat II32
+
+ let code dst
+ = a_code
+ `appOL` b_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend
+ , SRA tmp1_reg (RIImm (ImmInt 16)) tmp1_reg -- sign extend
+ , WRY tmp1_reg g0
+
+ , SDIV False a_reg (RIReg b_reg) tmp2_reg
+ , SMUL False tmp2_reg (RIReg b_reg) tmp2_reg
+ , SUB False False a_reg (RIReg tmp2_reg) dst]
+
+ return (Any II32 code)
+
+
+imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
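+-- A note on the check below (not from the original source): SMUL leaves the
+-- low 32 bits of the product in res_lo and the high 32 bits in the Y register
+-- (read back with RDY). If the multiply did not overflow a signed 32-bit
+-- result, the high word equals the arithmetic right shift of the low word by
+-- 31, so the final SUB yields zero exactly when there was no overflow.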
+imulMayOflo rep a b
+ = do
+ (a_reg, a_code) <- getSomeReg a
+ (b_reg, b_code) <- getSomeReg b
+ res_lo <- getNewRegNat II32
+ res_hi <- getNewRegNat II32
+
+ let shift_amt = case rep of
+ W32 -> 31
+ W64 -> 63
+ _ -> panic "shift_amt"
+
+ let code dst = a_code `appOL` b_code `appOL`
+ toOL [
+ SMUL False a_reg (RIReg b_reg) res_lo,
+ RDY res_hi,
+ SRA res_lo (RIImm (ImmInt shift_amt)) res_lo,
+ SUB False False res_lo (RIReg res_hi) dst
+ ]
+ return (Any II32 code)
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching an StInt as the argument, because genericOpt will already
+-- have handled the constant-folding.
+
+trivialCode
+ :: Width
+ -> (Reg -> RI -> Reg -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialCode _ instr x (CmmLit (CmmInt y _))
+ | fits13Bits y
+ = do
+ (src1, code) <- getSomeReg x
+ let
+ src2 = ImmInt (fromInteger y)
+ code__2 dst = code `snocOL` instr src1 (RIImm src2) dst
+ return (Any II32 code__2)
+
+
+trivialCode _ instr x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `snocOL`
+ instr src1 (RIReg src2) dst
+ return (Any II32 code__2)
+
+
+trivialFCode
+ :: Width
+ -> (Format -> Reg -> Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+trivialFCode pk instr x y = do
+ dflags <- getDynFlags
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ tmp <- getNewRegNat FF64
+ let
+ promote x = FxTOy FF32 FF64 x tmp
+
+ pk1 = cmmExprType dflags x
+ pk2 = cmmExprType dflags y
+
+ code__2 dst =
+ if pk1 `cmmEqType` pk2 then
+ code1 `appOL` code2 `snocOL`
+ instr (floatFormat pk) src1 src2 dst
+ else if typeWidth pk1 == W32 then
+ code1 `snocOL` promote src1 `appOL` code2 `snocOL`
+ instr FF64 tmp src2 dst
+ else
+ code1 `appOL` code2 `snocOL` promote src2 `snocOL`
+ instr FF64 src1 tmp dst
+ return (Any (cmmTypeFormat $ if pk1 `cmmEqType` pk2 then pk1 else cmmFloat W64)
+ code__2)
+
+
+
+trivialUCode
+ :: Format
+ -> (RI -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+
+trivialUCode format instr x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `snocOL` instr (RIReg src) dst
+ return (Any format code__2)
+
+
+trivialUFCode
+ :: Format
+ -> (Reg -> Reg -> Instr)
+ -> CmmExpr
+ -> NatM Register
+
+trivialUFCode pk instr x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `snocOL` instr src dst
+ return (Any pk code__2)
+
+
+
+
+-- Coercions -------------------------------------------------------------------
+
+-- | Coerce a integer value to floating point
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP width1 width2 x = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ ST (intFormat width1) src (spRel (-2)),
+ LD (intFormat width1) (spRel (-2)) dst,
+ FxTOy (intFormat width1) (floatFormat width2) dst dst]
+ return (Any (floatFormat $ width2) code__2)
+
+
+
+-- | Coerce a floating point value to integer
+--
+-- NOTE: On sparc v9 there are no instructions to move a value from an
+-- FP register directly to an int register, so we have to use a load/store.
+--
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int width1 width2 x
+ = do let fformat1 = floatFormat width1
+ fformat2 = floatFormat width2
+
+ iformat2 = intFormat width2
+
+ (fsrc, code) <- getSomeReg x
+ fdst <- getNewRegNat fformat2
+
+ let code2 dst
+ = code
+ `appOL` toOL
+ -- convert float to int format, leaving it in a float reg.
+ [ FxTOy fformat1 iformat2 fsrc fdst
+
+ -- store the int into mem, then load it back to move
+ -- it into an actual int reg.
+ , ST fformat2 fdst (spRel (-2))
+ , LD iformat2 (spRel (-2)) dst]
+
+ return (Any iformat2 code2)
+
+
+-- | Coerce a double precision floating point value to single precision.
+coerceDbl2Flt :: CmmExpr -> NatM Register
+coerceDbl2Flt x = do
+ (src, code) <- getSomeReg x
+ return (Any FF32 (\dst -> code `snocOL` FxTOy FF64 FF32 src dst))
+
+
+-- | Coerce a single precision floating point value to double precision
+coerceFlt2Dbl :: CmmExpr -> NatM Register
+coerceFlt2Dbl x = do
+ (src, code) <- getSomeReg x
+ return (Any FF64 (\dst -> code `snocOL` FxTOy FF32 FF64 src dst))
+
+
+
+
+-- Condition Codes -------------------------------------------------------------
+--
+-- Evaluate a comparison, and get the result into a register.
+--
+-- Do not fill the delay slots here; you will confuse the register allocator.
+--
+condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
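+-- A note on the subcc/subx idiom used below (not from the original source),
+-- reading SUB's two Bool flags as (use carry, set condition codes):
+-- in the EQQ-against-zero case,
+--      SUB False True g0 (RIReg src) g0
+-- computes 0 - src and sets the carry (borrow) bit exactly when src /= 0, and
+--      SUB True False g0 (RIImm (ImmInt (-1))) dst
+-- then computes 0 - (-1) - carry = 1 - carry, so dst is 1 iff src == 0.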
+condIntReg EQQ x (CmmLit (CmmInt 0 _)) = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ SUB False True g0 (RIReg src) g0,
+ SUB True False g0 (RIImm (ImmInt (-1))) dst]
+ return (Any II32 code__2)
+
+condIntReg EQQ x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `appOL` toOL [
+ XOR False src1 (RIReg src2) dst,
+ SUB False True g0 (RIReg dst) g0,
+ SUB True False g0 (RIImm (ImmInt (-1))) dst]
+ return (Any II32 code__2)
+
+condIntReg NE x (CmmLit (CmmInt 0 _)) = do
+ (src, code) <- getSomeReg x
+ let
+ code__2 dst = code `appOL` toOL [
+ SUB False True g0 (RIReg src) g0,
+ ADD True False g0 (RIImm (ImmInt 0)) dst]
+ return (Any II32 code__2)
+
+condIntReg NE x y = do
+ (src1, code1) <- getSomeReg x
+ (src2, code2) <- getSomeReg y
+ let
+ code__2 dst = code1 `appOL` code2 `appOL` toOL [
+ XOR False src1 (RIReg src2) dst,
+ SUB False True g0 (RIReg dst) g0,
+ ADD True False g0 (RIImm (ImmInt 0)) dst]
+ return (Any II32 code__2)
+
+condIntReg cond x y = do
+ bid1 <- liftM (\a -> seq a a) getBlockIdNat
+ bid2 <- liftM (\a -> seq a a) getBlockIdNat
+ CondCode _ cond cond_code <- condIntCode cond x y
+ let
+ code__2 dst
+ = cond_code
+ `appOL` toOL
+ [ BI cond False bid1
+ , NOP
+
+ , OR False g0 (RIImm (ImmInt 0)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid1
+ , OR False g0 (RIImm (ImmInt 1)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid2]
+
+ return (Any II32 code__2)
+
+
+condFltReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg cond x y = do
+ bid1 <- liftM (\a -> seq a a) getBlockIdNat
+ bid2 <- liftM (\a -> seq a a) getBlockIdNat
+
+ CondCode _ cond cond_code <- condFltCode cond x y
+ let
+ code__2 dst
+ = cond_code
+ `appOL` toOL
+ [ NOP
+ , BF cond False bid1
+ , NOP
+
+ , OR False g0 (RIImm (ImmInt 0)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid1
+ , OR False g0 (RIImm (ImmInt 1)) dst
+ , BI ALWAYS False bid2
+ , NOP
+
+ , NEWBLOCK bid2 ]
+
+ return (Any II32 code__2)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot
new file mode 100644
index 0000000000..cf9553a63c
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen32.hs-boot
@@ -0,0 +1,16 @@
+
+module GHC.CmmToAsm.SPARC.CodeGen.Gen32 (
+ getSomeReg,
+ getRegister
+)
+
+where
+
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.Monad
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getRegister :: CmmExpr -> NatM Register
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs
new file mode 100644
index 0000000000..43807ec027
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Gen64.hs
@@ -0,0 +1,216 @@
+-- | Evaluation of 64 bit values on 32 bit platforms.
+module GHC.CmmToAsm.SPARC.CodeGen.Gen64 (
+ assignMem_I64Code,
+ assignReg_I64Code,
+ iselExpr64
+)
+
+where
+
+import GhcPrelude
+
+import {-# SOURCE #-} GHC.CmmToAsm.SPARC.CodeGen.Gen32
+import GHC.CmmToAsm.SPARC.CodeGen.Base
+import GHC.CmmToAsm.SPARC.CodeGen.Amode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.Instr
+-- GHC.CmmToAsm.SPARC.Ppr()
+import GHC.CmmToAsm.Monad
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import GHC.Cmm
+
+import GHC.Driver.Session
+import OrdList
+import Outputable
+
+-- | Code to assign a 64 bit value to memory.
+assignMem_I64Code
+ :: CmmExpr -- ^ expr producing the destination address
+ -> CmmExpr -- ^ expr producing the source value.
+ -> NatM InstrBlock
+
+assignMem_I64Code addrTree valueTree
+ = do
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+
+ (src, acode) <- getSomeReg addrTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Big-endian store
+ mov_hi = ST II32 rhi (AddrRegImm src (ImmInt 0))
+ mov_lo = ST II32 rlo (AddrRegImm src (ImmInt 4))
+
+ code = vcode `appOL` acode `snocOL` mov_hi `snocOL` mov_lo
+
+{- pprTrace "assignMem_I64Code"
+ (vcat [ text "addrTree: " <+> ppr addrTree
+ , text "valueTree: " <+> ppr valueTree
+ , text "vcode:"
+ , vcat $ map ppr $ fromOL vcode
+ , text ""
+ , text "acode:"
+ , vcat $ map ppr $ fromOL acode ])
+ $ -}
+ return code
+
+
+-- | Code to assign a 64 bit value to a register.
+assignReg_I64Code
+ :: CmmReg -- ^ the destination register
+ -> CmmExpr -- ^ expr producing the source value
+ -> NatM InstrBlock
+
+assignReg_I64Code (CmmLocal (LocalReg u_dst pk)) valueTree
+ = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst (cmmTypeFormat pk)
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = mkMOV r_src_lo r_dst_lo
+ mov_hi = mkMOV r_src_hi r_dst_hi
+ mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg
+
+ return (vcode `snocOL` mov_hi `snocOL` mov_lo)
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(sparc): invalid lvalue"
+
+
+
+
+-- | Get the value of an expression into a 64 bit register.
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+
+-- Load a 64 bit word
+iselExpr64 (CmmLoad addrTree ty)
+ | isWord64 ty
+ = do Amode amode addr_code <- getAmode addrTree
+ let result
+
+ | AddrRegReg r1 r2 <- amode
+ = do rlo <- getNewRegNat II32
+ tmp <- getNewRegNat II32
+ let rhi = getHiVRegFromLo rlo
+
+ return $ ChildCode64
+ ( addr_code
+ `appOL` toOL
+ [ ADD False False r1 (RIReg r2) tmp
+ , LD II32 (AddrRegImm tmp (ImmInt 0)) rhi
+ , LD II32 (AddrRegImm tmp (ImmInt 4)) rlo ])
+ rlo
+
+ | AddrRegImm r1 (ImmInt i) <- amode
+ = do rlo <- getNewRegNat II32
+ let rhi = getHiVRegFromLo rlo
+
+ return $ ChildCode64
+ ( addr_code
+ `appOL` toOL
+ [ LD II32 (AddrRegImm r1 (ImmInt $ 0 + i)) rhi
+ , LD II32 (AddrRegImm r1 (ImmInt $ 4 + i)) rlo ])
+ rlo
+
+ | otherwise
+ = panic "SPARC.CodeGen.Gen64: no match"
+
+ result
+
+
+-- Add a literal to a 64 bit integer
+iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)])
+ = do ChildCode64 code1 r1_lo <- iselExpr64 e1
+ let r1_hi = getHiVRegFromLo r1_lo
+
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ let code = code1
+ `appOL` toOL
+ [ ADD False True r1_lo (RIImm (ImmInteger i)) r_dst_lo
+ , ADD True False r1_hi (RIReg g0) r_dst_hi ]
+
+ return $ ChildCode64 code r_dst_lo
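   A sketch of the pattern used here and in the general case below: the low
   words are added with the carry-setting variant, and the high words then add
   the carry back in.

       ADD False True  r1_lo (RIImm (ImmInteger i)) r_dst_lo   -- addcc lo, i, lo'   (sets carry)
       ADD True  False r1_hi (RIReg g0)             r_dst_hi   -- addx  hi, %g0, hi' (adds carry in)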
+
+
+-- Addition of II64
+iselExpr64 (CmmMachOp (MO_Add _) [e1, e2])
+ = do ChildCode64 code1 r1_lo <- iselExpr64 e1
+ let r1_hi = getHiVRegFromLo r1_lo
+
+ ChildCode64 code2 r2_lo <- iselExpr64 e2
+ let r2_hi = getHiVRegFromLo r2_lo
+
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ let code = code1
+ `appOL` code2
+ `appOL` toOL
+ [ ADD False True r1_lo (RIReg r2_lo) r_dst_lo
+ , ADD True False r1_hi (RIReg r2_hi) r_dst_hi ]
+
+ return $ ChildCode64 code r_dst_lo
+
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg uq ty)))
+ | isWord64 ty
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_lo = RegVirtual $ mkVirtualReg uq II32
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = mkMOV r_src_lo r_dst_lo
+ mov_hi = mkMOV r_src_hi r_dst_hi
+ mkMOV sreg dreg = OR False g0 (RIReg sreg) dreg
+ return (
+ ChildCode64 (toOL [mov_hi, mov_lo]) r_dst_lo
+ )
+
+-- Convert something into II64
+iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr])
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ -- compute expr and load it into r_dst_lo
+ (a_reg, a_code) <- getSomeReg expr
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code = a_code
+ `appOL` toOL
+ [ mkRegRegMoveInstr platform g0 r_dst_hi -- clear high 32 bits
+ , mkRegRegMoveInstr platform a_reg r_dst_lo ]
+
+ return $ ChildCode64 code r_dst_lo
+
+-- only W32 supported for now
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr])
+ = do
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+
+ -- compute expr and load it into r_dst_lo
+ (a_reg, a_code) <- getSomeReg expr
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code = a_code
+ `appOL` toOL
+ [ SRA a_reg (RIImm (ImmInt 31)) r_dst_hi
+ , mkRegRegMoveInstr platform a_reg r_dst_lo ]
+
+ return $ ChildCode64 code r_dst_lo
+
+
+iselExpr64 expr
+ = pprPanic "iselExpr64(sparc)" (ppr expr)
diff --git a/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs
new file mode 100644
index 0000000000..f8648c4549
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/CodeGen/Sanity.hs
@@ -0,0 +1,69 @@
+-- | One ounce of sanity checking is worth 10000000000000000 ounces
+-- of staring blindly at assembly code trying to find the problem..
+module GHC.CmmToAsm.SPARC.CodeGen.Sanity (
+ checkBlock
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Ppr () -- For Outputable instances
+import GHC.CmmToAsm.Instr
+
+import GHC.Cmm
+
+import Outputable
+
+
+-- | Enforce intra-block invariants.
+--
+checkBlock :: CmmBlock
+ -> NatBasicBlock Instr
+ -> NatBasicBlock Instr
+
+checkBlock cmm block@(BasicBlock _ instrs)
+ | checkBlockInstrs instrs
+ = block
+
+ | otherwise
+ = pprPanic
+ ("SPARC.CodeGen: bad block\n")
+ ( vcat [ text " -- cmm -----------------\n"
+ , ppr cmm
+ , text " -- native code ---------\n"
+ , ppr block ])
+
+
+checkBlockInstrs :: [Instr] -> Bool
+checkBlockInstrs ii
+
+ -- An unconditional jump ends the block.
+ -- There must be an unconditional jump in the block, otherwise
+ -- the register liveness determinator will get the liveness
+ -- information wrong.
+ --
+ -- If the block ends with a cmm call that never returns
+ -- then there can be unreachable instructions after the jump,
+ -- but we don't mind here.
+ --
+ | instr : NOP : _ <- ii
+ , isUnconditionalJump instr
+ = True
+
+ -- All jumps must have a NOP in their branch delay slot.
+ -- The liveness determinator and register allocators aren't smart
+ -- enough to handle branch delay slots.
+ --
+ | instr : NOP : is <- ii
+ , isJumpishInstr instr
+ = checkBlockInstrs is
+
+ -- keep checking
+ | _:i2:is <- ii
+ = checkBlockInstrs (i2:is)
+
+ -- this block is no good
+ | otherwise
+ = False
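   For instance, with hypothetical block ids l1 and l2, a block shaped like

       [ ADD ..., BI GE False l1, NOP, BI ALWAYS False l2, NOP ]

   is accepted (each jump carries its delay-slot NOP and an unconditional jump
   is reached), whereas

       [ ADD ..., BI GE False l1 ]

   is rejected: the branch has no NOP in its delay slot and nothing
   unconditional ends the block.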
diff --git a/compiler/GHC/CmmToAsm/SPARC/Cond.hs b/compiler/GHC/CmmToAsm/SPARC/Cond.hs
new file mode 100644
index 0000000000..89b64b7c3a
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Cond.hs
@@ -0,0 +1,54 @@
+module GHC.CmmToAsm.SPARC.Cond (
+ Cond(..),
+ condUnsigned,
+ condToSigned,
+ condToUnsigned
+)
+
+where
+
+import GhcPrelude
+
+-- | Branch condition codes.
+data Cond
+ = ALWAYS
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ | NEG
+ | NEVER
+ | POS
+ | VC
+ | VS
+ deriving Eq
+
+
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
diff --git a/compiler/GHC/CmmToAsm/SPARC/Imm.hs b/compiler/GHC/CmmToAsm/SPARC/Imm.hs
new file mode 100644
index 0000000000..71b0257ac5
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Imm.hs
@@ -0,0 +1,67 @@
+module GHC.CmmToAsm.SPARC.Imm (
+ -- immediate values
+ Imm(..),
+ strImmLit,
+ litToImm
+)
+
+where
+
+import GhcPrelude
+
+import GHC.Cmm
+import GHC.Cmm.CLabel
+
+import Outputable
+
+-- | An immediate value.
+-- Not all of these are directly representable by the machine.
+-- Things like ImmLit are slurped out and put in a data segment instead.
+--
+data Imm
+ = ImmInt Int
+
+ -- Sigh.
+ | ImmInteger Integer
+
+ -- AbstractC Label (with baggage)
+ | ImmCLbl CLabel
+
+ -- Simple string
+ | ImmLit SDoc
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+
+ | LO Imm
+ | HI Imm
+
+
+-- | Create a ImmLit containing this string.
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+-- | Convert a CmmLit to an Imm.
+-- Narrow to the width: a CmmInt might be out of
+-- range, but we assume that ImmInteger only contains
+-- in-range values. A signed value should be fine here.
+--
+litToImm :: CmmLit -> Imm
+litToImm lit
+ = case lit of
+ CmmInt i w -> ImmInteger (narrowS w i)
+ CmmFloat f W32 -> ImmFloat f
+ CmmFloat f W64 -> ImmDouble f
+ CmmLabel l -> ImmCLbl l
+ CmmLabelOff l off -> ImmIndex l off
+
+ CmmLabelDiffOff l1 l2 off _
+ -> ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+
+ _ -> panic "SPARC.Regs.litToImm: no match"
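   A small illustration of the narrowing, assuming narrowS truncates to the
   given width and sign-extends:

       litToImm (CmmInt 300 W8)    ==>  ImmInteger 44     -- 300 wrapped into a signed byte
       litToImm (CmmInt (-1) W32)  ==>  ImmInteger (-1)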
diff --git a/compiler/GHC/CmmToAsm/SPARC/Instr.hs b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
new file mode 100644
index 0000000000..ec74d3723b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Instr.hs
@@ -0,0 +1,481 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+#include "HsVersions.h"
+
+module GHC.CmmToAsm.SPARC.Instr (
+ RI(..),
+ riZero,
+
+ fpRelEA,
+ moveSp,
+
+ isUnconditionalJump,
+
+ Instr(..),
+ maxSpillSlots
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Stack
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Reg.Target
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+
+import GHC.Cmm.CLabel
+import GHC.Platform.Regs
+import GHC.Cmm.BlockId
+import GHC.Driver.Session
+import GHC.Cmm
+import FastString
+import Outputable
+import GHC.Platform
+
+
+-- | Register or immediate
+data RI
+ = RIReg Reg
+ | RIImm Imm
+
+-- | Check if a RI represents a zero value.
+-- - a literal zero
+-- - register %g0, which is always zero.
+--
+riZero :: RI -> Bool
+riZero (RIImm (ImmInt 0)) = True
+riZero (RIImm (ImmInteger 0)) = True
+riZero (RIReg (RegReal (RealRegSingle 0))) = True
+riZero _ = False
+
+
+-- | Calculate the effective address which would be used by the
+-- corresponding fpRel sequence.
+fpRelEA :: Int -> Reg -> Instr
+fpRelEA n dst
+ = ADD False False fp (RIImm (ImmInt (n * wordLength))) dst
+
+
+-- | Code to shift the stack pointer by n words.
+moveSp :: Int -> Instr
+moveSp n
+ = ADD False False sp (RIImm (ImmInt (n * wordLength))) sp
+
+-- | An instruction that will cause the one after it never to be executed
+isUnconditionalJump :: Instr -> Bool
+isUnconditionalJump ii
+ = case ii of
+ CALL{} -> True
+ JMP{} -> True
+ JMP_TBL{} -> True
+ BI ALWAYS _ _ -> True
+ BF ALWAYS _ _ -> True
+ _ -> False
+
+
+-- | instance for sparc instruction set
+instance Instruction Instr where
+ regUsageOfInstr = sparc_regUsageOfInstr
+ patchRegsOfInstr = sparc_patchRegsOfInstr
+ isJumpishInstr = sparc_isJumpishInstr
+ jumpDestsOfInstr = sparc_jumpDestsOfInstr
+ patchJumpInstr = sparc_patchJumpInstr
+ mkSpillInstr = sparc_mkSpillInstr
+ mkLoadInstr = sparc_mkLoadInstr
+ takeDeltaInstr = sparc_takeDeltaInstr
+ isMetaInstr = sparc_isMetaInstr
+ mkRegRegMoveInstr = sparc_mkRegRegMoveInstr
+ takeRegRegMoveInstr = sparc_takeRegRegMoveInstr
+ mkJumpInstr = sparc_mkJumpInstr
+ mkStackAllocInstr = panic "no sparc_mkStackAllocInstr"
+ mkStackDeallocInstr = panic "no sparc_mkStackDeallocInstr"
+
+
+-- | SPARC instruction set.
+--      Not complete; these are only the ones we need.
+--
+data Instr
+
+ -- meta ops --------------------------------------------------
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- some static data spat out during code generation.
+ -- Will be extracted before pretty-printing.
+ | LDATA Section RawCmmStatics
+
+ -- Start a new basic block. Useful during codegen, removed later.
+ -- Preceding instruction should be a jump, as per the invariants
+ -- for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- specify current stack offset for benefit of subsequent passes.
+ | DELTA Int
+
+ -- real instrs -----------------------------------------------
+ -- Loads and stores.
+ | LD Format AddrMode Reg -- format, src, dst
+ | ST Format Reg AddrMode -- format, src, dst
+
+ -- Int Arithmetic.
+ -- x: add/sub with carry bit.
+ -- In SPARC V9 addx and friends were renamed addc.
+ --
+ -- cc: modify condition codes
+ --
+ | ADD Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
+ | SUB Bool Bool Reg RI Reg -- x?, cc?, src1, src2, dst
+
+ | UMUL Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SMUL Bool Reg RI Reg -- cc?, src1, src2, dst
+
+
+ -- The SPARC divide instructions perform 64bit by 32bit division
+ -- The Y register is xored into the first operand.
+
+ -- On _some implementations_ the Y register is overwritten by
+ -- the remainder, so we have to make sure it is 0 each time.
+
+ -- dst <- ((Y `shiftL` 32) `or` src1) `div` src2
+ | UDIV Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SDIV Bool Reg RI Reg -- cc?, src1, src2, dst
+
+ | RDY Reg -- move contents of Y register to reg
+ | WRY Reg Reg -- Y <- src1 `xor` src2
+
+ -- Logic operations.
+ | AND Bool Reg RI Reg -- cc?, src1, src2, dst
+ | ANDN Bool Reg RI Reg -- cc?, src1, src2, dst
+ | OR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | ORN Bool Reg RI Reg -- cc?, src1, src2, dst
+ | XOR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | XNOR Bool Reg RI Reg -- cc?, src1, src2, dst
+ | SLL Reg RI Reg -- src1, src2, dst
+ | SRL Reg RI Reg -- src1, src2, dst
+ | SRA Reg RI Reg -- src1, src2, dst
+
+ -- Load immediates.
+ | SETHI Imm Reg -- src, dst
+
+ -- Do nothing.
+ -- Implemented by the assembler as SETHI 0, %g0, but worth an alias
+ | NOP
+
+ -- Float Arithmetic.
+ -- Note that we cheat by treating F{ABS,MOV,NEG} of doubles as single
+ -- instructions right up until we spit them out.
+ --
+ | FABS Format Reg Reg -- src dst
+ | FADD Format Reg Reg Reg -- src1, src2, dst
+ | FCMP Bool Format Reg Reg -- exception?, src1, src2, dst
+ | FDIV Format Reg Reg Reg -- src1, src2, dst
+ | FMOV Format Reg Reg -- src, dst
+ | FMUL Format Reg Reg Reg -- src1, src2, dst
+ | FNEG Format Reg Reg -- src, dst
+ | FSQRT Format Reg Reg -- src, dst
+ | FSUB Format Reg Reg Reg -- src1, src2, dst
+ | FxTOy Format Format Reg Reg -- src, dst
+
+ -- Jumping around.
+ | BI Cond Bool BlockId -- cond, annul?, target
+ | BF Cond Bool BlockId -- cond, annul?, target
+
+ | JMP AddrMode -- target
+
+ -- With a tabled jump we know all the possible destinations.
+ -- We also need this info so we can work out what regs are live across the jump.
+ --
+ | JMP_TBL AddrMode [Maybe BlockId] CLabel
+
+ | CALL (Either Imm Reg) Int Bool -- target, args, terminal
+
+
+-- | regUsage returns the sets of src and destination registers used
+-- by a particular instruction. Machine registers that are
+-- pre-allocated to stgRegs are filtered out, because they are
+-- uninteresting from a register allocation standpoint. (We wouldn't
+-- want them to end up on the free list!) As far as we are concerned,
+-- the fixed registers simply don't exist (for allocation purposes,
+-- anyway).
+
+-- regUsage doesn't need to do any trickery for jumps and such. Just
+-- state precisely the regs read and written by that insn. The
+-- consequences of control flow transfers, as far as register
+-- allocation goes, are taken care of by the register allocator.
+--
+sparc_regUsageOfInstr :: Platform -> Instr -> RegUsage
+sparc_regUsageOfInstr platform instr
+ = case instr of
+ LD _ addr reg -> usage (regAddr addr, [reg])
+ ST _ reg addr -> usage (reg : regAddr addr, [])
+ ADD _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SUB _ _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ UMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SMUL _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ UDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SDIV _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ RDY rd -> usage ([], [rd])
+ WRY r1 r2 -> usage ([r1, r2], [])
+ AND _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ ANDN _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ OR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ ORN _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ XOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ XNOR _ r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SLL r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SRL r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SRA r1 ar r2 -> usage (r1 : regRI ar, [r2])
+ SETHI _ reg -> usage ([], [reg])
+ FABS _ r1 r2 -> usage ([r1], [r2])
+ FADD _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FCMP _ _ r1 r2 -> usage ([r1, r2], [])
+ FDIV _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FMOV _ r1 r2 -> usage ([r1], [r2])
+ FMUL _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FNEG _ r1 r2 -> usage ([r1], [r2])
+ FSQRT _ r1 r2 -> usage ([r1], [r2])
+ FSUB _ r1 r2 r3 -> usage ([r1, r2], [r3])
+ FxTOy _ _ r1 r2 -> usage ([r1], [r2])
+
+ JMP addr -> usage (regAddr addr, [])
+ JMP_TBL addr _ _ -> usage (regAddr addr, [])
+
+ CALL (Left _ ) _ True -> noUsage
+ CALL (Left _ ) n False -> usage (argRegs n, callClobberedRegs)
+ CALL (Right reg) _ True -> usage ([reg], [])
+ CALL (Right reg) n False -> usage (reg : (argRegs n), callClobberedRegs)
+ _ -> noUsage
+
+ where
+ usage (src, dst)
+ = RU (filter (interesting platform) src)
+ (filter (interesting platform) dst)
+
+ regAddr (AddrRegReg r1 r2) = [r1, r2]
+ regAddr (AddrRegImm r1 _) = [r1]
+
+ regRI (RIReg r) = [r]
+ regRI _ = []
+
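   For example, before the 'interesting' filter is applied:

       ADD False False r1 (RIReg r2) r3          reads [r1, r2], writes [r3]
       ST  II32 r (AddrRegImm base (ImmInt 8))   reads [r, base], writes []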
+
+-- | Interesting regs are virtuals, or ones that are allocatable
+-- by the register allocator.
+interesting :: Platform -> Reg -> Bool
+interesting platform reg
+ = case reg of
+ RegVirtual _ -> True
+ RegReal (RealRegSingle r1) -> freeReg platform r1
+ RegReal (RealRegPair r1 _) -> freeReg platform r1
+
+
+
+-- | Apply a given mapping to all the register references in this instruction.
+sparc_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+sparc_patchRegsOfInstr instr env = case instr of
+ LD fmt addr reg -> LD fmt (fixAddr addr) (env reg)
+ ST fmt reg addr -> ST fmt (env reg) (fixAddr addr)
+
+ ADD x cc r1 ar r2 -> ADD x cc (env r1) (fixRI ar) (env r2)
+ SUB x cc r1 ar r2 -> SUB x cc (env r1) (fixRI ar) (env r2)
+ UMUL cc r1 ar r2 -> UMUL cc (env r1) (fixRI ar) (env r2)
+ SMUL cc r1 ar r2 -> SMUL cc (env r1) (fixRI ar) (env r2)
+ UDIV cc r1 ar r2 -> UDIV cc (env r1) (fixRI ar) (env r2)
+ SDIV cc r1 ar r2 -> SDIV cc (env r1) (fixRI ar) (env r2)
+ RDY rd -> RDY (env rd)
+ WRY r1 r2 -> WRY (env r1) (env r2)
+ AND b r1 ar r2 -> AND b (env r1) (fixRI ar) (env r2)
+ ANDN b r1 ar r2 -> ANDN b (env r1) (fixRI ar) (env r2)
+ OR b r1 ar r2 -> OR b (env r1) (fixRI ar) (env r2)
+ ORN b r1 ar r2 -> ORN b (env r1) (fixRI ar) (env r2)
+ XOR b r1 ar r2 -> XOR b (env r1) (fixRI ar) (env r2)
+ XNOR b r1 ar r2 -> XNOR b (env r1) (fixRI ar) (env r2)
+ SLL r1 ar r2 -> SLL (env r1) (fixRI ar) (env r2)
+ SRL r1 ar r2 -> SRL (env r1) (fixRI ar) (env r2)
+ SRA r1 ar r2 -> SRA (env r1) (fixRI ar) (env r2)
+
+ SETHI imm reg -> SETHI imm (env reg)
+
+ FABS s r1 r2 -> FABS s (env r1) (env r2)
+ FADD s r1 r2 r3 -> FADD s (env r1) (env r2) (env r3)
+ FCMP e s r1 r2 -> FCMP e s (env r1) (env r2)
+ FDIV s r1 r2 r3 -> FDIV s (env r1) (env r2) (env r3)
+ FMOV s r1 r2 -> FMOV s (env r1) (env r2)
+ FMUL s r1 r2 r3 -> FMUL s (env r1) (env r2) (env r3)
+ FNEG s r1 r2 -> FNEG s (env r1) (env r2)
+ FSQRT s r1 r2 -> FSQRT s (env r1) (env r2)
+ FSUB s r1 r2 r3 -> FSUB s (env r1) (env r2) (env r3)
+ FxTOy s1 s2 r1 r2 -> FxTOy s1 s2 (env r1) (env r2)
+
+ JMP addr -> JMP (fixAddr addr)
+ JMP_TBL addr ids l -> JMP_TBL (fixAddr addr) ids l
+
+ CALL (Left i) n t -> CALL (Left i) n t
+ CALL (Right r) n t -> CALL (Right (env r)) n t
+ _ -> instr
+
+ where
+ fixAddr (AddrRegReg r1 r2) = AddrRegReg (env r1) (env r2)
+ fixAddr (AddrRegImm r1 i) = AddrRegImm (env r1) i
+
+ fixRI (RIReg r) = RIReg (env r)
+ fixRI other = other
+
+
+--------------------------------------------------------------------------------
+sparc_isJumpishInstr :: Instr -> Bool
+sparc_isJumpishInstr instr
+ = case instr of
+ BI{} -> True
+ BF{} -> True
+ JMP{} -> True
+ JMP_TBL{} -> True
+ CALL{} -> True
+ _ -> False
+
+sparc_jumpDestsOfInstr :: Instr -> [BlockId]
+sparc_jumpDestsOfInstr insn
+ = case insn of
+ BI _ _ id -> [id]
+ BF _ _ id -> [id]
+ JMP_TBL _ ids _ -> [id | Just id <- ids]
+ _ -> []
+
+
+sparc_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
+sparc_patchJumpInstr insn patchF
+ = case insn of
+ BI cc annul id -> BI cc annul (patchF id)
+ BF cc annul id -> BF cc annul (patchF id)
+ JMP_TBL n ids l -> JMP_TBL n (map (fmap patchF) ids) l
+ _ -> insn
+
+
+--------------------------------------------------------------------------------
+-- | Make a spill instruction.
+-- On SPARC we spill below the frame pointer, leaving 2 words per spill.
+sparc_mkSpillInstr
+ :: DynFlags
+ -> Reg -- ^ register to spill
+ -> Int -- ^ current stack delta
+ -> Int -- ^ spill slot to use
+ -> Instr
+
+sparc_mkSpillInstr dflags reg _ slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ off_w = 1 + (off `div` 4)
+ fmt = case targetClassOfReg platform reg of
+ RcInteger -> II32
+ RcFloat -> FF32
+ RcDouble -> FF64
+
+ in ST fmt reg (fpRel (negate off_w))
+
+
+-- | Make a spill reload instruction.
+sparc_mkLoadInstr
+ :: DynFlags
+ -> Reg -- ^ register to load into
+ -> Int -- ^ current stack delta
+ -> Int -- ^ spill slot to use
+ -> Instr
+
+sparc_mkLoadInstr dflags reg _ slot
+ = let platform = targetPlatform dflags
+ off = spillSlotToOffset dflags slot
+ off_w = 1 + (off `div` 4)
+ fmt = case targetClassOfReg platform reg of
+ RcInteger -> II32
+ RcFloat -> FF32
+ RcDouble -> FF64
+
+ in LD fmt (fpRel (- off_w)) reg
+
+
+--------------------------------------------------------------------------------
+-- | See if this instruction is telling us the current C stack delta
+sparc_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+sparc_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+sparc_isMetaInstr
+ :: Instr
+ -> Bool
+
+sparc_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+-- | Make a reg-reg move instruction.
+-- On SPARC v8 there are no instructions to move directly between
+-- floating point and integer regs. If we need to do that then we
+-- have to go via memory.
+--
+sparc_mkRegRegMoveInstr
+ :: Platform
+ -> Reg
+ -> Reg
+ -> Instr
+
+sparc_mkRegRegMoveInstr platform src dst
+ | srcClass <- targetClassOfReg platform src
+ , dstClass <- targetClassOfReg platform dst
+ , srcClass == dstClass
+ = case srcClass of
+ RcInteger -> ADD False False src (RIReg g0) dst
+ RcDouble -> FMOV FF64 src dst
+ RcFloat -> FMOV FF32 src dst
+
+ | otherwise
+ = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
+
+
+-- | Check whether an instruction represents a reg-reg move.
+-- The register allocator attempts to eliminate reg->reg moves whenever it can,
+-- by assigning the src and dest temporaries to the same real register.
+--
+sparc_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
+sparc_takeRegRegMoveInstr instr
+ = case instr of
+ ADD False False src (RIReg src2) dst
+ | g0 == src2 -> Just (src, dst)
+
+ FMOV FF64 src dst -> Just (src, dst)
+ FMOV FF32 src dst -> Just (src, dst)
+ _ -> Nothing
+
+
+-- | Make an unconditional branch instruction.
+sparc_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+sparc_mkJumpInstr id
+ = [BI ALWAYS False id
+ , NOP] -- fill the branch delay slot.
diff --git a/compiler/GHC/CmmToAsm/SPARC/Ppr.hs b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs
new file mode 100644
index 0000000000..2f3ea778f7
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Ppr.hs
@@ -0,0 +1,645 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+module GHC.CmmToAsm.SPARC.Ppr (
+ pprNatCmmDecl,
+ pprBasicBlock,
+ pprData,
+ pprInstr,
+ pprFormat,
+ pprImm,
+ pprDataItem
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Cond
+import GHC.CmmToAsm.SPARC.Imm
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.Instr
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Format
+import GHC.CmmToAsm.Ppr
+
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.Ppr() -- For Outputable instances
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.Dataflow.Collections
+
+import Unique ( pprUniqueAlways )
+import Outputable
+import GHC.Platform
+import FastString
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+
+pprNatCmmDecl :: NatCmmDecl RawCmmStatics Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ case topInfoTable proc of
+ Nothing ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ pprLabel lbl $$ -- blocks guaranteed not null, so label needed
+ vcat (map (pprBasicBlock top_info) blocks)
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ (if platformHasSubsectionsViaSymbols platform
+ then pprSectionAlign dspSection $$
+ ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then
+ -- See Note [Subsections Via Symbols] in X86/Ppr.hs
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty)
+
+dspSection :: Section
+dspSection = Section Text $
+ panic "subsections-via-symbols doesn't combine with split-sections"
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $$
+ pprLabel (blockLbl blockid) $$
+ vcat (map pprInstr instrs)
+ where
+ maybe_infotable = case mapLookup blockid info_env of
+ Nothing -> empty
+ Just (RawCmmStatics info_lbl info) ->
+ pprAlignForSection Text $$
+ vcat (map pprData info) $$
+ pprLabel info_lbl
+
+
+pprDatas :: RawCmmStatics -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+pprDatas (RawCmmStatics lbl dats) = vcat (pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+pprData (CmmUninitialised bytes) = text ".skip " <> int bytes
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".global " <> ppr lbl
+
+pprTypeAndSizeDecl :: CLabel -> SDoc
+pprTypeAndSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSLinux && externallyVisibleCLabel lbl
+ then text ".type " <> ppr lbl <> ptext (sLit ", @object")
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeAndSizeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+-- | Pretty print a register.
+pprReg :: Reg -> SDoc
+pprReg reg
+ = case reg of
+ RegVirtual vr
+ -> case vr of
+ VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
+ VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
+ VirtualRegF u -> text "%vF_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vD_" <> pprUniqueAlways u
+
+
+ RegReal rr
+ -> case rr of
+ RealRegSingle r1
+ -> pprReg_ofRegNo r1
+
+ RealRegPair r1 r2
+ -> text "(" <> pprReg_ofRegNo r1
+ <> vbar <> pprReg_ofRegNo r2
+ <> text ")"
+
+
+
+-- | Pretty print a register name, based on this register number.
+-- The definition has been unfolded so we get a jump-table in the
+-- object code. This function is called quite a lot when emitting
+-- the asm file..
+--
+pprReg_ofRegNo :: Int -> SDoc
+pprReg_ofRegNo i
+ = ptext
+ (case i of {
+ 0 -> sLit "%g0"; 1 -> sLit "%g1";
+ 2 -> sLit "%g2"; 3 -> sLit "%g3";
+ 4 -> sLit "%g4"; 5 -> sLit "%g5";
+ 6 -> sLit "%g6"; 7 -> sLit "%g7";
+ 8 -> sLit "%o0"; 9 -> sLit "%o1";
+ 10 -> sLit "%o2"; 11 -> sLit "%o3";
+ 12 -> sLit "%o4"; 13 -> sLit "%o5";
+ 14 -> sLit "%o6"; 15 -> sLit "%o7";
+ 16 -> sLit "%l0"; 17 -> sLit "%l1";
+ 18 -> sLit "%l2"; 19 -> sLit "%l3";
+ 20 -> sLit "%l4"; 21 -> sLit "%l5";
+ 22 -> sLit "%l6"; 23 -> sLit "%l7";
+ 24 -> sLit "%i0"; 25 -> sLit "%i1";
+ 26 -> sLit "%i2"; 27 -> sLit "%i3";
+ 28 -> sLit "%i4"; 29 -> sLit "%i5";
+ 30 -> sLit "%i6"; 31 -> sLit "%i7";
+ 32 -> sLit "%f0"; 33 -> sLit "%f1";
+ 34 -> sLit "%f2"; 35 -> sLit "%f3";
+ 36 -> sLit "%f4"; 37 -> sLit "%f5";
+ 38 -> sLit "%f6"; 39 -> sLit "%f7";
+ 40 -> sLit "%f8"; 41 -> sLit "%f9";
+ 42 -> sLit "%f10"; 43 -> sLit "%f11";
+ 44 -> sLit "%f12"; 45 -> sLit "%f13";
+ 46 -> sLit "%f14"; 47 -> sLit "%f15";
+ 48 -> sLit "%f16"; 49 -> sLit "%f17";
+ 50 -> sLit "%f18"; 51 -> sLit "%f19";
+ 52 -> sLit "%f20"; 53 -> sLit "%f21";
+ 54 -> sLit "%f22"; 55 -> sLit "%f23";
+ 56 -> sLit "%f24"; 57 -> sLit "%f25";
+ 58 -> sLit "%f26"; 59 -> sLit "%f27";
+ 60 -> sLit "%f28"; 61 -> sLit "%f29";
+ 62 -> sLit "%f30"; 63 -> sLit "%f31";
+ _ -> sLit "very naughty sparc register" })
+
+
+-- | Pretty print a format for an instruction suffix.
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext
+ (case x of
+ II8 -> sLit "ub"
+ II16 -> sLit "uh"
+ II32 -> sLit ""
+ II64 -> sLit "d"
+ FF32 -> sLit ""
+ FF64 -> sLit "d")
+
+
+-- | Pretty print a format for an instruction suffix.
+-- eg LD is 32bit on sparc, but LDD is 64 bit.
+pprStFormat :: Format -> SDoc
+pprStFormat x
+ = ptext
+ (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "h"
+ II32 -> sLit ""
+ II64 -> sLit "x"
+ FF32 -> sLit ""
+ FF64 -> sLit "d")
+
+
+
+-- | Pretty print a condition code.
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext
+ (case c of
+ ALWAYS -> sLit ""
+ NEVER -> sLit "n"
+ GEU -> sLit "geu"
+ LU -> sLit "lu"
+ EQQ -> sLit "e"
+ GTT -> sLit "g"
+ GE -> sLit "ge"
+ GU -> sLit "gu"
+ LTT -> sLit "l"
+ LE -> sLit "le"
+ LEU -> sLit "leu"
+ NE -> sLit "ne"
+ NEG -> sLit "neg"
+ POS -> sLit "pos"
+ VC -> sLit "vc"
+ VS -> sLit "vs")
+
+
+-- | Pretty print an address mode.
+pprAddr :: AddrMode -> SDoc
+pprAddr am
+ = case am of
+ AddrRegReg r1 (RegReal (RealRegSingle 0))
+ -> pprReg r1
+
+ AddrRegReg r1 r2
+ -> hcat [ pprReg r1, char '+', pprReg r2 ]
+
+ AddrRegImm r1 (ImmInt i)
+ | i == 0 -> pprReg r1
+ | not (fits13Bits i) -> largeOffsetError i
+ | otherwise -> hcat [ pprReg r1, pp_sign, int i ]
+ where
+ pp_sign = if i > 0 then char '+' else empty
+
+ AddrRegImm r1 (ImmInteger i)
+ | i == 0 -> pprReg r1
+ | not (fits13Bits i) -> largeOffsetError i
+ | otherwise -> hcat [ pprReg r1, pp_sign, integer i ]
+ where
+ pp_sign = if i > 0 then char '+' else empty
+
+ AddrRegImm r1 imm
+ -> hcat [ pprReg r1, char '+', pprImm imm ]
+
+
+-- | Pretty print an immediate value.
+pprImm :: Imm -> SDoc
+pprImm imm
+ = case imm of
+ ImmInt i -> int i
+ ImmInteger i -> integer i
+ ImmCLbl l -> ppr l
+ ImmIndex l i -> ppr l <> char '+' <> int i
+ ImmLit s -> s
+
+ ImmConstantSum a b
+ -> pprImm a <> char '+' <> pprImm b
+
+ ImmConstantDiff a b
+ -> pprImm a <> char '-' <> lparen <> pprImm b <> rparen
+
+ LO i
+ -> hcat [ text "%lo(", pprImm i, rparen ]
+
+ HI i
+ -> hcat [ text "%hi(", pprImm i, rparen ]
+
+ -- these should have been converted to bytes and placed
+ -- in the data section.
+ ImmFloat _ -> text "naughty float immediate"
+ ImmDouble _ -> text "naughty double immediate"
+
+
+-- | Pretty print a section \/ segment header.
+-- On SPARC all the data sections must be at least 8 byte aligned
+-- in case we store doubles in them.
+--
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ ptext (case seg of
+ Text -> sLit ".align 4"
+ Data -> sLit ".align 8"
+ ReadOnlyData -> sLit ".align 8"
+ RelocatableReadOnlyData
+ -> sLit ".align 8"
+ UninitialisedData -> sLit ".align 8"
+ ReadOnlyData16 -> sLit ".align 16"
+ -- TODO: This is copied from the ReadOnlyData case, but it can likely be
+ -- made more efficient.
+ CString -> sLit ".align 8"
+ OtherSection _ -> panic "PprMach.pprSectionHeader: unknown section")
+
+-- | Pretty print a data item.
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit
+ = sdocWithDynFlags $ \dflags ->
+ vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit)
+ where
+ imm = litToImm lit
+
+ ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm]
+ ppr_item II32 _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item FF32 (CmmFloat r _)
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _)
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II16 _ = [text "\t.short\t" <> pprImm imm]
+ ppr_item II64 _ = [text "\t.quad\t" <> pprImm imm]
+ ppr_item _ _ = panic "SPARC.Ppr.pprDataItem: no match"
+
+
+-- | Pretty print an instruction.
+pprInstr :: Instr -> SDoc
+
+-- nuke comments.
+pprInstr (COMMENT _)
+ = empty
+
+pprInstr (DELTA d)
+ = pprInstr (COMMENT (mkFastString ("\tdelta = " ++ show d)))
+
+-- Newblocks and LData should have been slurped out before producing the .s file.
+pprInstr (NEWBLOCK _)
+ = panic "X86.Ppr.pprInstr: NEWBLOCK"
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+-- 64 bit FP loads are expanded into individual instructions in CodeGen.Expand
+pprInstr (LD FF64 _ reg)
+ | RegReal (RealRegSingle{}) <- reg
+ = panic "SPARC.Ppr: not emitting potentially misaligned LD FF64 instr"
+
+pprInstr (LD format addr reg)
+ = hcat [
+ text "\tld",
+ pprFormat format,
+ char '\t',
+ lbrack,
+ pprAddr addr,
+ pp_rbracket_comma,
+ pprReg reg
+ ]
+
+-- 64 bit FP stores are expanded into individual instructions in CodeGen.Expand
+pprInstr (ST FF64 reg _)
+ | RegReal (RealRegSingle{}) <- reg
+ = panic "SPARC.Ppr: not emitting potentially misaligned ST FF64 instr"
+
+-- no distinction is made between signed and unsigned bytes on stores for the
+-- Sparc opcodes (at least I cannot see any, and gas is nagging me --SOF),
+-- so we call a special-purpose pprFormat for ST..
+pprInstr (ST format reg addr)
+ = hcat [
+ text "\tst",
+ pprStFormat format,
+ char '\t',
+ pprReg reg,
+ pp_comma_lbracket,
+ pprAddr addr,
+ rbrack
+ ]
+
+
+pprInstr (ADD x cc reg1 ri reg2)
+ | not x && not cc && riZero ri
+ = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ]
+
+ | otherwise
+ = pprRegRIReg (if x then sLit "addx" else sLit "add") cc reg1 ri reg2
+
+
+pprInstr (SUB x cc reg1 ri reg2)
+ | not x && cc && reg2 == g0
+ = hcat [ text "\tcmp\t", pprReg reg1, comma, pprRI ri ]
+
+ | not x && not cc && riZero ri
+ = hcat [ text "\tmov\t", pprReg reg1, comma, pprReg reg2 ]
+
+ | otherwise
+ = pprRegRIReg (if x then sLit "subx" else sLit "sub") cc reg1 ri reg2
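   So, for example, the special cases above print some of the arithmetic as the
   usual synthetic instructions:

       SUB False True  r1 (RIReg r2) g0   ==>   cmp  r1, r2
       SUB False False r1 (RIReg g0) r2   ==>   mov  r1, r2
       ADD False False r1 (RIReg g0) r2   ==>   mov  r1, r2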
+
+pprInstr (AND b reg1 ri reg2) = pprRegRIReg (sLit "and") b reg1 ri reg2
+
+pprInstr (ANDN b reg1 ri reg2) = pprRegRIReg (sLit "andn") b reg1 ri reg2
+
+pprInstr (OR b reg1 ri reg2)
+ | not b && reg1 == g0
+ = let doit = hcat [ text "\tmov\t", pprRI ri, comma, pprReg reg2 ]
+ in case ri of
+ RIReg rrr | rrr == reg2 -> empty
+ _ -> doit
+
+ | otherwise
+ = pprRegRIReg (sLit "or") b reg1 ri reg2
+
+pprInstr (ORN b reg1 ri reg2) = pprRegRIReg (sLit "orn") b reg1 ri reg2
+
+pprInstr (XOR b reg1 ri reg2) = pprRegRIReg (sLit "xor") b reg1 ri reg2
+pprInstr (XNOR b reg1 ri reg2) = pprRegRIReg (sLit "xnor") b reg1 ri reg2
+
+pprInstr (SLL reg1 ri reg2) = pprRegRIReg (sLit "sll") False reg1 ri reg2
+pprInstr (SRL reg1 ri reg2) = pprRegRIReg (sLit "srl") False reg1 ri reg2
+pprInstr (SRA reg1 ri reg2) = pprRegRIReg (sLit "sra") False reg1 ri reg2
+
+pprInstr (RDY rd) = text "\trd\t%y," <> pprReg rd
+pprInstr (WRY reg1 reg2)
+ = text "\twr\t"
+ <> pprReg reg1
+ <> char ','
+ <> pprReg reg2
+ <> char ','
+ <> text "%y"
+
+pprInstr (SMUL b reg1 ri reg2) = pprRegRIReg (sLit "smul") b reg1 ri reg2
+pprInstr (UMUL b reg1 ri reg2) = pprRegRIReg (sLit "umul") b reg1 ri reg2
+pprInstr (SDIV b reg1 ri reg2) = pprRegRIReg (sLit "sdiv") b reg1 ri reg2
+pprInstr (UDIV b reg1 ri reg2) = pprRegRIReg (sLit "udiv") b reg1 ri reg2
+
+pprInstr (SETHI imm reg)
+ = hcat [
+ text "\tsethi\t",
+ pprImm imm,
+ comma,
+ pprReg reg
+ ]
+
+pprInstr NOP
+ = text "\tnop"
+
+pprInstr (FABS format reg1 reg2)
+ = pprFormatRegReg (sLit "fabs") format reg1 reg2
+
+pprInstr (FADD format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fadd") format reg1 reg2 reg3
+
+pprInstr (FCMP e format reg1 reg2)
+ = pprFormatRegReg (if e then sLit "fcmpe" else sLit "fcmp")
+ format reg1 reg2
+
+pprInstr (FDIV format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fdiv") format reg1 reg2 reg3
+
+pprInstr (FMOV format reg1 reg2)
+ = pprFormatRegReg (sLit "fmov") format reg1 reg2
+
+pprInstr (FMUL format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fmul") format reg1 reg2 reg3
+
+pprInstr (FNEG format reg1 reg2)
+ = pprFormatRegReg (sLit "fneg") format reg1 reg2
+
+pprInstr (FSQRT format reg1 reg2)
+ = pprFormatRegReg (sLit "fsqrt") format reg1 reg2
+
+pprInstr (FSUB format reg1 reg2 reg3)
+ = pprFormatRegRegReg (sLit "fsub") format reg1 reg2 reg3
+
+pprInstr (FxTOy format1 format2 reg1 reg2)
+ = hcat [
+ text "\tf",
+ ptext
+ (case format1 of
+ II32 -> sLit "ito"
+ FF32 -> sLit "sto"
+ FF64 -> sLit "dto"
+ _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"),
+ ptext
+ (case format2 of
+ II32 -> sLit "i\t"
+ II64 -> sLit "x\t"
+ FF32 -> sLit "s\t"
+ FF64 -> sLit "d\t"
+ _ -> panic "SPARC.Ppr.pprInstr.FxToY: no match"),
+ pprReg reg1, comma, pprReg reg2
+ ]
+
+
+pprInstr (BI cond b blockid)
+ = hcat [
+ text "\tb", pprCond cond,
+ if b then pp_comma_a else empty,
+ char '\t',
+ ppr (blockLbl blockid)
+ ]
+
+pprInstr (BF cond b blockid)
+ = hcat [
+ text "\tfb", pprCond cond,
+ if b then pp_comma_a else empty,
+ char '\t',
+ ppr (blockLbl blockid)
+ ]
+
+pprInstr (JMP addr) = text "\tjmp\t" <> pprAddr addr
+pprInstr (JMP_TBL op _ _) = pprInstr (JMP op)
+
+pprInstr (CALL (Left imm) n _)
+ = hcat [ text "\tcall\t", pprImm imm, comma, int n ]
+
+pprInstr (CALL (Right reg) n _)
+ = hcat [ text "\tcall\t", pprReg reg, comma, int n ]
+
+
+-- | Pretty print a RI
+pprRI :: RI -> SDoc
+pprRI (RIReg r) = pprReg r
+pprRI (RIImm r) = pprImm r
+
+
+-- | Pretty print a two reg instruction.
+pprFormatRegReg :: PtrString -> Format -> Reg -> Reg -> SDoc
+pprFormatRegReg name format reg1 reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ (case format of
+ FF32 -> text "s\t"
+ FF64 -> text "d\t"
+ _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"),
+
+ pprReg reg1,
+ comma,
+ pprReg reg2
+ ]
+
+
+-- | Pretty print a three reg instruction.
+pprFormatRegRegReg :: PtrString -> Format -> Reg -> Reg -> Reg -> SDoc
+pprFormatRegRegReg name format reg1 reg2 reg3
+ = hcat [
+ char '\t',
+ ptext name,
+ (case format of
+ FF32 -> text "s\t"
+ FF64 -> text "d\t"
+ _ -> panic "SPARC.Ppr.pprFormatRegReg: no match"),
+ pprReg reg1,
+ comma,
+ pprReg reg2,
+ comma,
+ pprReg reg3
+ ]
+
+
+-- | Pretty print an instruction of two regs and a ri.
+pprRegRIReg :: PtrString -> Bool -> Reg -> RI -> Reg -> SDoc
+pprRegRIReg name b reg1 ri reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ if b then text "cc\t" else char '\t',
+ pprReg reg1,
+ comma,
+ pprRI ri,
+ comma,
+ pprReg reg2
+ ]
+
+{-
+pprRIReg :: PtrString -> Bool -> RI -> Reg -> SDoc
+pprRIReg name b ri reg1
+ = hcat [
+ char '\t',
+ ptext name,
+ if b then text "cc\t" else char '\t',
+ pprRI ri,
+ comma,
+ pprReg reg1
+ ]
+-}
+
+{-
+pp_ld_lbracket :: SDoc
+pp_ld_lbracket = text "\tld\t["
+-}
+
+pp_rbracket_comma :: SDoc
+pp_rbracket_comma = text "],"
+
+
+pp_comma_lbracket :: SDoc
+pp_comma_lbracket = text ",["
+
+
+pp_comma_a :: SDoc
+pp_comma_a = text ",a"
diff --git a/compiler/GHC/CmmToAsm/SPARC/Regs.hs b/compiler/GHC/CmmToAsm/SPARC/Regs.hs
new file mode 100644
index 0000000000..ba22470912
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Regs.hs
@@ -0,0 +1,259 @@
+-- -----------------------------------------------------------------------------
+--
+-- (c) The University of Glasgow 1994-2004
+--
+-- -----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.SPARC.Regs (
+ -- registers
+ showReg,
+ virtualRegSqueeze,
+ realRegSqueeze,
+ classOfRealReg,
+ allRealRegs,
+
+ -- machine specific info
+ gReg, iReg, lReg, oReg, fReg,
+ fp, sp, g0, g1, g2, o0, o1, f0, f1, f6, f8, f22, f26, f27,
+
+ -- allocatable
+ allocatableRegs,
+
+ -- args
+ argRegs,
+ allArgRegs,
+ callClobberedRegs,
+
+ --
+ mkVirtualReg,
+ regDotColor
+)
+
+where
+
+
+import GhcPrelude
+
+import GHC.Platform.SPARC
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+import GHC.CmmToAsm.Format
+
+import Unique
+import Outputable
+
+{-
+ The SPARC has 64 registers of interest; 32 integer registers and 32
+ floating point registers. The mapping of STG registers to SPARC
+ machine registers is defined in StgRegs.h. We are, of course,
+ prepared for any eventuality.
+
+ The whole fp-register pairing thing on sparcs is a huge nuisance. See
+ includes/stg/MachRegs.h for a description of what's going on
+ here.
+-}
+
+
+-- | Get the standard name for the register with this number.
+showReg :: RegNo -> String
+showReg n
+ | n >= 0 && n < 8 = "%g" ++ show n
+ | n >= 8 && n < 16 = "%o" ++ show (n-8)
+ | n >= 16 && n < 24 = "%l" ++ show (n-16)
+ | n >= 24 && n < 32 = "%i" ++ show (n-24)
+ | n >= 32 && n < 64 = "%f" ++ show (n-32)
+ | otherwise = panic "SPARC.Regs.showReg: unknown sparc register"
+
+
+-- Get the register class of a certain real reg
+classOfRealReg :: RealReg -> RegClass
+classOfRealReg reg
+ = case reg of
+ RealRegSingle i
+ | i < 32 -> RcInteger
+ | otherwise -> RcFloat
+
+ RealRegPair{} -> RcDouble
+
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcFloat
+ -> case vr of
+ VirtualRegF{} -> 1
+ VirtualRegD{} -> 2
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegF{} -> 1
+ VirtualRegD{} -> 1
+ _other -> 0
+
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcFloat
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 2
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < 32 -> 0
+ | otherwise -> 1
+
+ RealRegPair{} -> 1
+
+
+-- | All the allocatable registers in the machine,
+-- including register pairs.
+allRealRegs :: [RealReg]
+allRealRegs
+ = [ (RealRegSingle i) | i <- [0..63] ]
+ ++ [ (RealRegPair i (i+1)) | i <- [32, 34 .. 62 ] ]
+
+
+-- | Get the regno for this sort of reg
+gReg, lReg, iReg, oReg, fReg :: Int -> RegNo
+
+gReg x = x -- global regs
+oReg x = (8 + x) -- output regs
+lReg x = (16 + x) -- local regs
+iReg x = (24 + x) -- input regs
+fReg x = (32 + x) -- float regs
+
+
+-- | Some specific regs used by the code generator.
+g0, g1, g2, fp, sp, o0, o1, f0, f1, f6, f8, f22, f26, f27 :: Reg
+
+f6 = RegReal (RealRegSingle (fReg 6))
+f8 = RegReal (RealRegSingle (fReg 8))
+f22 = RegReal (RealRegSingle (fReg 22))
+f26 = RegReal (RealRegSingle (fReg 26))
+f27 = RegReal (RealRegSingle (fReg 27))
+
+-- g0 is always zero, and writes to it vanish.
+g0 = RegReal (RealRegSingle (gReg 0))
+g1 = RegReal (RealRegSingle (gReg 1))
+g2 = RegReal (RealRegSingle (gReg 2))
+
+-- FP, SP, int and float return (from C) regs.
+fp = RegReal (RealRegSingle (iReg 6))
+sp = RegReal (RealRegSingle (oReg 6))
+o0 = RegReal (RealRegSingle (oReg 0))
+o1 = RegReal (RealRegSingle (oReg 1))
+f0 = RegReal (RealRegSingle (fReg 0))
+f1 = RegReal (RealRegSingle (fReg 1))
+
+-- | Produce the second-half-of-a-double register given the first half.
+{-
+fPair :: Reg -> Maybe Reg
+fPair (RealReg n)
+ | n >= 32 && n `mod` 2 == 0 = Just (RealReg (n+1))
+
+fPair (VirtualRegD u)
+ = Just (VirtualRegHi u)
+
+fPair reg
+ = trace ("MachInstrs.fPair: can't get high half of supposed double reg " ++ showPpr reg)
+ Nothing
+-}
+
+
+-- | All the regs that the register allocator can allocate to,
+-- with the fixed use regs removed.
+--
+allocatableRegs :: [RealReg]
+allocatableRegs
+ = let isFree rr
+ = case rr of
+ RealRegSingle r -> freeReg r
+ RealRegPair r1 r2 -> freeReg r1 && freeReg r2
+ in filter isFree allRealRegs
+
+
+-- | The registers to place arguments for function calls,
+-- for some number of arguments.
+--
+argRegs :: RegNo -> [Reg]
+argRegs r
+ = case r of
+ 0 -> []
+ 1 -> map (RegReal . RealRegSingle . oReg) [0]
+ 2 -> map (RegReal . RealRegSingle . oReg) [0,1]
+ 3 -> map (RegReal . RealRegSingle . oReg) [0,1,2]
+ 4 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3]
+ 5 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4]
+ 6 -> map (RegReal . RealRegSingle . oReg) [0,1,2,3,4,5]
+ _ -> panic "MachRegs.argRegs(sparc): don't know about >6 arguments!"
+
+
+-- | All the regs that could possibly be returned by argRegs
+--
+allArgRegs :: [Reg]
+allArgRegs
+ = map (RegReal . RealRegSingle) [oReg i | i <- [0..5]]
+
+
+-- These are the regs that we cannot assume stay alive over a C call.
+-- TODO: Why can we assume that o6 isn't clobbered? -- BL 2009/02
+--
+callClobberedRegs :: [Reg]
+callClobberedRegs
+ = map (RegReal . RealRegSingle)
+ ( oReg 7 :
+ [oReg i | i <- [0..5]] ++
+ [gReg i | i <- [1..7]] ++
+ [fReg i | i <- [0..31]] )
+
+
+
+-- | Make a virtual reg with this format.
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ | not (isFloatFormat format)
+ = VirtualRegI u
+
+ | otherwise
+ = case format of
+ FF32 -> VirtualRegF u
+ FF64 -> VirtualRegD u
+ _ -> panic "mkVReg"
+
+
+regDotColor :: RealReg -> SDoc
+regDotColor reg
+ = case classOfRealReg reg of
+ RcInteger -> text "blue"
+ RcFloat -> text "red"
+ _other -> text "green"
diff --git a/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs
new file mode 100644
index 0000000000..2d1f77d737
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/ShortcutJump.hs
@@ -0,0 +1,74 @@
+module GHC.CmmToAsm.SPARC.ShortcutJump (
+ JumpDest(..), getJumpDestBlockId,
+ canShortcut,
+ shortcutJump,
+ shortcutStatics,
+ shortBlockId
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.Instr
+import GHC.CmmToAsm.SPARC.Imm
+
+import GHC.Cmm.CLabel
+import GHC.Cmm.BlockId
+import GHC.Cmm
+
+import Panic
+import Outputable
+
+data JumpDest
+ = DestBlockId BlockId
+ | DestImm Imm
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "blk:" <> ppr bid
+ ppr (DestImm _bid) = text "imm:?"
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+getJumpDestBlockId _ = Nothing
+
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut _ = Nothing
+
+
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump _ other = other
+
+
+
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> RawCmmStatics -> RawCmmStatics
+shortcutStatics fn (RawCmmStatics lbl statics)
+ = RawCmmStatics lbl $ map (shortcutStatic fn) statics
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+-- slightly dodgy, we're ignoring the second label, but this
+-- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+
+shortBlockId :: (BlockId -> Maybe JumpDest) -> BlockId -> CLabel
+shortBlockId fn blockid =
+ case fn blockid of
+ Nothing -> blockLbl blockid
+ Just (DestBlockId blockid') -> shortBlockId fn blockid'
+ Just (DestImm (ImmCLbl lbl)) -> lbl
+ _other -> panic "shortBlockId"
diff --git a/compiler/GHC/CmmToAsm/SPARC/Stack.hs b/compiler/GHC/CmmToAsm/SPARC/Stack.hs
new file mode 100644
index 0000000000..d9a0ffd7cf
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/SPARC/Stack.hs
@@ -0,0 +1,59 @@
+module GHC.CmmToAsm.SPARC.Stack (
+ spRel,
+ fpRel,
+ spillSlotToOffset,
+ maxSpillSlots
+)
+
+where
+
+import GhcPrelude
+
+import GHC.CmmToAsm.SPARC.AddrMode
+import GHC.CmmToAsm.SPARC.Regs
+import GHC.CmmToAsm.SPARC.Base
+import GHC.CmmToAsm.SPARC.Imm
+
+import GHC.Driver.Session
+import Outputable
+
+-- | Get an AddrMode relative to the address in sp.
+-- This gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments.
+--
+spRel :: Int -- ^ stack offset in words, positive or negative
+ -> AddrMode
+
+spRel n = AddrRegImm sp (ImmInt (n * wordLength))
+
+
+-- | Get an address relative to the frame pointer.
+-- This doesn't work for offsets greater than 13 bits; we just hope for the best
+--
+fpRel :: Int -> AddrMode
+fpRel n
+ = AddrRegImm fp (ImmInt (n * wordLength))
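   For example, assuming wordLength is 4 bytes on this 32-bit target:

       spRel 2     ==>  AddrRegImm sp (ImmInt 8)      -- [%sp + 8]
       fpRel (-1)  ==>  AddrRegImm fp (ImmInt (-4))   -- [%fp - 4]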
+
+
+-- | Convert a spill slot number to a *byte* offset, with no sign.
+--
+spillSlotToOffset :: DynFlags -> Int -> Int
+spillSlotToOffset dflags slot
+ | slot >= 0 && slot < maxSpillSlots dflags
+ = 64 + spillSlotSize * slot
+
+ | otherwise
+ = pprPanic "spillSlotToOffset:"
+ ( text "invalid spill location: " <> int slot
+ $$ text "maxSpillSlots: " <> int (maxSpillSlots dflags))
+
+
+-- | The maximum number of spill slots available on the C stack.
+-- If we use up all of the slots, then we're screwed.
+--
+-- Why do we reserve 64 bytes, instead of using the whole thing??
+-- -- BL 2009/02/15
+--
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((spillAreaLength dflags - 64) `div` spillSlotSize) - 1
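   A worked example of the offset arithmetic, assuming spillSlotSize is 8 bytes
   (large enough for a double):

       spillSlotToOffset dflags 0   ==>  64
       spillSlotToOffset dflags 3   ==>  64 + 8*3 = 88

   sparc_mkSpillInstr in SPARC.Instr then addresses the slot as
   fpRel (negate (1 + off `div` 4)).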
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
new file mode 100644
index 0000000000..f3b20c19e1
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -0,0 +1,3747 @@
+{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE BangPatterns #-}
+
+#if __GLASGOW_HASKELL__ <= 808
+-- GHC 8.10 deprecates this flag, but GHC 8.8 needs it
+-- The default iteration limit is a bit too low for the definitions
+-- in this module.
+{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
+#endif
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+
+-- This is a big module, but, if you pay attention to
+-- (a) the sectioning, and (b) the type signatures, the
+-- structure should not be too overwhelming.
+
+module GHC.CmmToAsm.X86.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ extractUnwindPoints,
+ invertCondBranches,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Ppr ( )
+import GHC.CmmToAsm.X86.RegInfo
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.CPrim
+import GHC.Cmm.DebugBlock
+ ( DebugBlock(..), UnwindPoint(..), UnwindTable
+ , UnwindExpr(UwReg), toUnwindExpr
+ )
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Monad
+ ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
+ , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
+ , getPicBaseMaybeNat, getDebugBlock, getFileId
+ , addImmediateSuccessorNat, updateCfgNat
+ )
+import GHC.CmmToAsm.CFG
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.Platform
+
+-- Our intermediate code:
+import BasicTypes
+import GHC.Cmm.BlockId
+import Module ( primUnitId )
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.CLabel
+import CoreSyn ( Tickish(..) )
+import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
+
+-- The rest:
+import ForeignCall ( CCallConv(..) )
+import OrdList
+import Outputable
+import FastString
+import GHC.Driver.Session
+import Util
+import UniqSupply ( getUniqueM )
+
+import Control.Monad
+import Data.Bits
+import Data.Foldable (fold)
+import Data.Int
+import Data.Maybe
+import Data.Word
+
+import qualified Data.Map as M
+
+is32BitPlatform :: NatM Bool
+is32BitPlatform = do
+ dflags <- getDynFlags
+ return $ target32Bit (targetPlatform dflags)
+
+sse2Enabled :: NatM Bool
+sse2Enabled = do
+ dflags <- getDynFlags
+ case platformArch (targetPlatform dflags) of
+ -- We assume SSE1 and SSE2 operations are available on both
+ -- x86 and x86_64. Historically we didn't default to SSE2 and
+ -- SSE1 on x86, which results in de facto nondeterminism for how
+ -- rounding behaves in the associated x87 floating point instructions
+ -- because variations in the spill/fpu stack placement of arguments for
+ -- operations would change the precision and final result of what
+ -- would otherwise be the same expressions with respect to single or
+ -- double precision IEEE floating point computations.
+ ArchX86_64 -> return True
+ ArchX86 -> return True
+ _ -> panic "trying to generate x86/x86_64 on the wrong platform"
+
+
+sse4_2Enabled :: NatM Bool
+sse4_2Enabled = do
+ dflags <- getDynFlags
+ return (isSse4_2Enabled dflags)
+
+
+cmmTopCodeGen
+ :: RawCmmDecl
+ -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph) = do
+ let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+ picBaseMb <- getPicBaseMaybeNat
+ dflags <- getDynFlags
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ tops = proc : concat statics
+ os = platformOS $ targetPlatform dflags
+
+ case picBaseMb of
+ Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
+ Nothing -> return tops
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec (mkAlignment 1, dat)] -- no translation, we just use CmmStatic
+
+{- Note [Verifying basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ We want to guarantee a few things about the results
+ of instruction selection.
+
+ Namely that each basic block consists of:
+ * A (potentially empty) sequence of straight line instructions
+ followed by
+ * A (potentially empty) sequence of jump like instructions.
+
+ We can verify this by going through the instructions and
+ making sure that any non-jumpish instruction can't appear
+ after a jumpish instruction.
+
+ There are gotchas however:
+ * CALLs are, strictly speaking, control flow, but we do not
+ care about them here. Hence we treat them as regular instructions.
+
+ It's safe for them to appear inside a basic block
+ because (ignoring side effects inside the call) they result in
+ straight line code.
+
+ * NEWBLOCK marks the start of a new basic block, so it can
+ be followed by any instructions.
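+
+ As an illustrative example, the instruction sequence
+
+ NEWBLOCK, MOV, CALL, JXX, JMP
+
+ is accepted, while
+
+ NEWBLOCK, JXX, MOV
+
+ makes verifyBasicBlock panic, because a non-jumpish MOV appears
+ after a jumpish instruction.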
+-}
+
+-- Verifying basic blocks is cheap, but not cheap enough to enable it
+-- unconditionally, so we only do it in debug builds (when 'debugIsOn' holds).
+verifyBasicBlock :: [Instr] -> ()
+verifyBasicBlock instrs
+ | debugIsOn = go False instrs
+ | otherwise = ()
+ where
+ go _ [] = ()
+ go atEnd (i:instr)
+ = case i of
+ -- Start a new basic block
+ NEWBLOCK {} -> go False instr
+ -- Calls are not viable block terminators
+ CALL {} | atEnd -> faultyBlockWith i
+ | not atEnd -> go atEnd instr
+ -- All instructions ok, check if we reached the end and continue.
+ _ | not atEnd -> go (isJumpishInstr i) instr
+ -- Only jumps allowed at the end of basic blocks.
+ | otherwise -> if isJumpishInstr i
+ then go True instr
+ else faultyBlockWith i
+ faultyBlockWith i
+ = pprPanic "Non control flow instructions after end of basic block."
+ (ppr i <+> text "in:" $$ vcat (map ppr instrs))
+
+basicBlockCodeGen
+ :: CmmBlock
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl (Alignment, RawCmmStatics) Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ -- Generate location directive
+ dbg <- getDebugBlock (entryLabel block)
+ loc_instrs <- case dblSourceTick =<< dbg of
+ Just (SourceNote span name)
+ -> do fileId <- getFileId (srcSpanFile span)
+ let line = srcSpanStartLine span; col = srcSpanStartCol span
+ return $ unitOL $ LOCATION fileId line col name
+ _ -> return nilOL
+ (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+ (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+ let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+ return $! verifyBasicBlock (fromOL instrs)
+ instrs' <- fold <$> traverse addSpUnwindings instrs
+ -- code generation may introduce new basic block boundaries, which
+ -- are indicated by the NEWBLOCK instruction. We must split up the
+ -- instruction stream into basic blocks again. We also extract
+ -- LDATAs here.
+ let
+ (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+ return (BasicBlock id top : other_blocks, statics)
+
+-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
+-- in the @sp@ register. See Note [What is this unwinding business?] in
+-- GHC.Cmm.DebugBlock for details.
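+--
+-- For example (illustrative), with @-g1@ or higher a @DELTA (-16)@ becomes
+-- @DELTA (-16)@ followed by an @UNWIND@ entry mapping @MachSp@ to
+-- @UwReg MachSp 16@.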
+addSpUnwindings :: Instr -> NatM (OrdList Instr)
+addSpUnwindings instr@(DELTA d) = do
+ dflags <- getDynFlags
+ if debugLevel dflags >= 1
+ then do lbl <- mkAsmTempLabel <$> getUniqueM
+ let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
+ return $ toOL [ instr, UNWIND lbl unwind ]
+ else return (unitOL instr)
+addSpUnwindings instr = return $ unitOL instr
+
+{- Note [Keeping track of the current block]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When generating instructions for Cmm we sometimes require
+the current block for things like retry loops.
+
+We also sometimes change the current block, if a MachOP
+results in branching control flow.
+
+Issues arise if we have two statements in the same block,
+which both depend on the current block id *and* change the
+basic block after them. This happens for atomic primops
+in the X86 backend where we want to update the CFG data structure
+when introducing new basic blocks.
+
+For example in #17334 we got this Cmm code:
+
+ c3Bf: // global
+ (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
+ (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
+ _s3sT::I64 = _s3sV::I64;
+ goto c3B1;
+
+This resulted in two new basic blocks being inserted:
+
+ c3Bf:
+ movl $18,%vI_n3Bo
+ movq 88(%vI_s3sQ),%rax
+ jmp _n3Bp
+ n3Bp:
+ ...
+ cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
+ jne _n3Bp
+ ...
+ jmp _n3Bs
+ n3Bs:
+ ...
+ cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
+ jne _n3Bs
+ ...
+ jmp _c3B1
+ ...
+
+When stmtToInstrs was called on this Cmm, both atomic operations were
+translated under the assumption that they would be placed into their Cmm
+basic block `c3Bf`.
+However for the retry loop we introduce new labels, so this is not the case
+for the second statement.
+This resulted in a desync between the explicit control flow graph
+we construct as a separate data type and the actual control flow graph in the code.
+
+Instead we now return the new basic block if a statement causes a change
+in the current block, and use that block for all following statements.
+
+For this reason genCCall is also split into two parts:
+one for calls which *won't* change the basic block in
+which successive instructions will be placed, and
+another for calls which *are* known to change the
+basic block.
+
+-}
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statements will initially be placed in.
+ -> [CmmNode O O] -- ^ Cmm statements
+ -> NatM (InstrBlock, BlockId) -- ^ Resulting instructions and the basic block
+ -- the final statement is placed in
+stmtsToInstrs bid stmts =
+ go bid stmts nilOL
+ where
+ go bid [] instrs = return (instrs,bid)
+ go bid (s:stmts) instrs = do
+ (instrs',bid') <- stmtToInstrs bid s
+ -- If the statement introduced a new block, we use that one
+ let !newBid = fromMaybe bid bid'
+ go newBid stmts (instrs `appOL` instrs')
+
+-- | `bid` refers to the current block and is used to update the CFG
+-- if new blocks are inserted in the control flow.
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+ -> CmmNode e x
+ -> NatM (InstrBlock, Maybe BlockId)
+ -- ^ Instructions, and bid of new block if successive
+ -- statements are placed in a different basic block.
+stmtToInstrs bid stmt = do
+ dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ case stmt of
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall dflags is32Bit target result_regs args bid
+
+ _ -> (,Nothing) <$> case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+
+ CmmUnwind regs -> do
+ let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
+ to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
+ case foldMap to_unwind_entry regs of
+ tbl | M.null tbl -> return nilOL
+ | otherwise -> do
+ lbl <- mkAsmTempLabel <$> getUniqueM
+ return $ unitOL $ UNWIND lbl tbl
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmBranch id -> return $ genBranch id
+
+ -- We try to arrange blocks such that the likely branch is the fallthrough
+ -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
+
+
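+-- | The registers to annotate as live across a jump: those global
+-- registers that map to a real machine register on this platform.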
+jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
+jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
+ where platform = targetPlatform dflags
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
+
+
+-- | Condition codes passed up the tree.
+--
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+
+-- | a.k.a. "Register64"
+-- Reg is the lower 32-bit temporary which contains the result.
+-- Use getHiVRegFromLo to find the other VRegUnique.
+--
+-- The rule of this simplified insn selection game is therefore that
+-- the returned Reg may be modified.
+--
+data ChildCode64
+ = ChildCode64
+ InstrBlock
+ Reg
+
+
+-- | Registers passed up the tree. If the Cmm code forces the register
+-- to live in a pre-decided machine register, it comes out as @Fixed@;
+-- otherwise, it comes out as @Any@, and the parent can decide which
+-- register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
+
+
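+-- | Change the format attached to a 'Register' without generating any
+-- code; used for conversions that are no-ops at the machine level
+-- (see @conversionNop@ below).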
+swizzleRegisterRep :: Register -> Format -> Register
+swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
+swizzleRegisterRep (Any _ codefn) format = Any format codefn
+
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = -- assuming SSE2, Int, Word, Float and Double can all be register allocated
+ let fmt = cmmTypeFormat pk in
+ RegVirtual (mkVirtualReg u fmt)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal $ reg
+ Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+ -- By this stage, the only MagicIds remaining should be the
+ -- ones which map to a real machine register on this
+ -- platform. Hence ...
+
+
+-- | Memory addressing modes passed up the tree.
+data Amode
+ = Amode AddrMode InstrBlock
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+
+-- | Check whether an integer will fit in 32 bits.
+-- A CmmInt is intended to be truncated to the appropriate
+-- number of bits, so here we truncate it to Int64. This is
+-- important because e.g. -1 as a CmmInt might be either
+-- -1 or 18446744073709551615.
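+-- For example, @is32BitInteger 0xffffffffffffffff@ is True: the value
+-- truncates to (-1) :: Int64, which fits in 32 bits.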
+--
+is32BitInteger :: Integer -> Bool
+is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
+ where i64 = fromIntegral i :: Int64
+
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
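+--
+-- For example (illustrative), at width W64 @mangleIndexTree dflags r 8@
+-- yields @CmmMachOp (MO_Add W64) [CmmReg r, CmmLit (CmmInt 8 W64)]@.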
+mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
+mangleIndexTree dflags reg off
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+
+assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_I64Code addrTree valueTree = do
+ Amode addr addr_code <- getAmode addrTree
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Little-endian store
+ mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
+ mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
+ return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
+
+
+assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
+ mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
+ return (
+ vcode `snocOL` mov_lo `snocOL` mov_hi
+ )
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(i386): invalid lvalue"
+
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+iselExpr64 (CmmLit (CmmInt i _)) = do
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ code = toOL [
+ MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
+ ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
+ Amode addr addr_code <- getAmode addrTree
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
+ mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
+ return (
+ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+ )
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
+ = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
+
+-- we handle addition, but rather badly
+iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ r1hi = getHiVRegFromLo r1lo
+ code = code1 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ SUB II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ SBB II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+ CLTD II32 `snocOL`
+ MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+ MOV II32 (OpReg edx) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 expr
+ = pprPanic "iselExpr64(i386)" (ppr expr)
+
+
+--------------------------------------------------------------------------------
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ getRegister' dflags is32Bit e
+
+getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register
+
+getRegister' dflags is32Bit (CmmReg reg)
+ = case reg of
+ CmmGlobal PicBaseReg
+ | is32Bit ->
+ -- on x86_64, we have %rip for PicBaseReg, but it's not
+ -- a full-featured register, it can only be used for
+ -- rip-relative addressing.
+ do reg' <- getPicBaseNat (archWordFormat is32Bit)
+ return (Fixed (archWordFormat is32Bit) reg' nilOL)
+ _ ->
+ do
+ let
+ fmt = cmmTypeFormat (cmmRegType dflags reg)
+ format = fmt
+ --
+ let platform = targetPlatform dflags
+ return (Fixed format
+ (getRegisterReg platform reg)
+ nilOL)
+
+
+getRegister' dflags is32Bit (CmmRegOff r n)
+ = getRegister' dflags is32Bit $ mangleIndexTree dflags r n
+
+getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
+ = addAlignmentCheck align <$> getRegister' dflags is32Bit e
+
+-- for 32-bit architectures, support some 64 -> 32 bit conversions:
+-- TO_W_(x), TO_W_(x >> 32)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
+ float_const_sse2 where
+ float_const_sse2
+ | f == 0.0 = do
+ let
+ format = floatFormat w
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ -- I don't know why there are xorpd, xorps, and pxor instructions.
+ -- They all appear to do the same thing --SDM
+ return (Any format code)
+
+ | otherwise = do
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ loadFloatAmode w addr code
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II32 code)
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II32) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit = do
+ return $ Any II64 (\dst -> unitOL $
+ LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
+
+getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
+ case mop of
+ MO_F_Neg w -> sse2NegCode w x
+
+
+ MO_S_Neg w -> triv_ucode NEGI (intFormat w)
+ MO_Not w -> triv_ucode NOT (intFormat w)
+
+ -- Nop conversions
+ MO_UU_Conv W32 W8 -> toI8Reg W32 x
+ MO_SS_Conv W32 W8 -> toI8Reg W32 x
+ MO_XX_Conv W32 W8 -> toI8Reg W32 x
+ MO_UU_Conv W16 W8 -> toI8Reg W16 x
+ MO_SS_Conv W16 W8 -> toI8Reg W16 x
+ MO_XX_Conv W16 W8 -> toI8Reg W16 x
+ MO_UU_Conv W32 W16 -> toI16Reg W32 x
+ MO_SS_Conv W32 W16 -> toI16Reg W32 x
+ MO_XX_Conv W32 W16 -> toI16Reg W32 x
+
+ MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+
+ MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+
+ -- widenings
+ MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x
+ MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
+ MO_UU_Conv W8 W16 -> integerExtend W8 W16 MOVZxL x
+
+ MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x
+ MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
+ MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x
+
+ -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit
+ -- only a few registers have an 8-bit variant (as opposed to x86-64 where every register
+ -- has one). So for 32-bit code, we'll just zero-extend.
+ MO_XX_Conv W8 W32
+ | is32Bit -> integerExtend W8 W32 MOVZxL x
+ | otherwise -> integerExtend W8 W32 MOV x
+ MO_XX_Conv W8 W16
+ | is32Bit -> integerExtend W8 W16 MOVZxL x
+ | otherwise -> integerExtend W8 W16 MOV x
+ MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
+
+ MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
+ MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
+ MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
+ MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x
+ MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
+ MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
+ -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+ -- However, we don't want the register allocator to throw it
+ -- away as an unnecessary reg-to-reg move, so we keep it in
+ -- the form of a movzl and print it as a movl later.
+ -- This doesn't apply to MO_XX_Conv since in this case we don't care about
+ -- the upper bits. So we can just use MOV.
+ MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
+ MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+ MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
+
+ MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
+
+
+ MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
+
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VU_Quot {} -> needLlvm
+ MO_VU_Rem {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister" (pprMachOp mop)
+ where
+ triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
+ triv_ucode instr format = trivialUCode format (instr format) x
+
+ -- signed or unsigned extension.
+ integerExtend :: Width -> Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> NatM Register
+ integerExtend from to instr expr = do
+ (reg,e_code) <- if from == W8 then getByteReg expr
+ else getSomeReg expr
+ let
+ code dst =
+ e_code `snocOL`
+ instr (intFormat from) (OpReg reg) (OpReg dst)
+ return (Any (intFormat to) code)
+
+ toI8Reg :: Width -> CmmExpr -> NatM Register
+ toI8Reg new_rep expr
+ = do codefn <- getAnyReg expr
+ return (Any (intFormat new_rep) codefn)
+ -- HACK: use getAnyReg to get a byte-addressable register.
+ -- If the source was a Fixed register, this will add the
+ -- mov instruction to put it into the desired destination.
+ -- We're assuming that the destination won't be a fixed
+ -- non-byte-addressable register; it won't be, because all
+ -- fixed registers are word-sized.
+
+ toI16Reg = toI8Reg -- for now
+
+ conversionNop :: Format -> CmmExpr -> NatM Register
+ conversionNop new_format expr
+ = do e_code <- getRegister' dflags is32Bit expr
+ return (swizzleRegisterRep e_code new_format)
+
+
+getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
+ case mop of
+ MO_F_Eq _ -> condFltReg is32Bit EQQ x y
+ MO_F_Ne _ -> condFltReg is32Bit NE x y
+ MO_F_Gt _ -> condFltReg is32Bit GTT x y
+ MO_F_Ge _ -> condFltReg is32Bit GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt _ -> condFltReg is32Bit GTT y x
+ MO_F_Le _ -> condFltReg is32Bit GE y x
+
+ MO_Eq _ -> condIntReg EQQ x y
+ MO_Ne _ -> condIntReg NE x y
+
+ MO_S_Gt _ -> condIntReg GTT x y
+ MO_S_Ge _ -> condIntReg GE x y
+ MO_S_Lt _ -> condIntReg LTT x y
+ MO_S_Le _ -> condIntReg LE x y
+
+ MO_U_Gt _ -> condIntReg GU x y
+ MO_U_Ge _ -> condIntReg GEU x y
+ MO_U_Lt _ -> condIntReg LU x y
+ MO_U_Le _ -> condIntReg LEU x y
+
+ MO_F_Add w -> trivialFCode_sse2 w ADD x y
+
+ MO_F_Sub w -> trivialFCode_sse2 w SUB x y
+
+ MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
+
+ MO_F_Mul w -> trivialFCode_sse2 w MUL x y
+
+
+ MO_Add rep -> add_code rep x y
+ MO_Sub rep -> sub_code rep x y
+
+ MO_S_Quot rep -> div_code rep True True x y
+ MO_S_Rem rep -> div_code rep True False x y
+ MO_U_Quot rep -> div_code rep False True x y
+ MO_U_Rem rep -> div_code rep False False x y
+
+ MO_S_MulMayOflo rep -> imulMayOflo rep x y
+
+ MO_Mul W8 -> imulW8 x y
+ MO_Mul rep -> triv_op rep IMUL
+ MO_And rep -> triv_op rep AND
+ MO_Or rep -> triv_op rep OR
+ MO_Xor rep -> triv_op rep XOR
+
+ {- Shift ops on x86 have constraints on their source: it
+ has to be an immediate, CL, or 1,
+ so trivialCode is not restrictive enough (sigh.)
+ -}
+ MO_Shl rep -> shift_code rep SHL x y {-False-}
+ MO_U_Shr rep -> shift_code rep SHR x y {-False-}
+ MO_S_Shr rep -> shift_code rep SAR x y {-False-}
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
+ where
+ --------------------
+ triv_op width instr = trivialCode width op (Just op) x y
+ where op = instr (intFormat width)
+
+ -- Special case for IMUL for bytes, since the result of IMULB will be in
+ -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
+ -- values.
+ imulW8 :: CmmExpr -> CmmExpr -> NatM Register
+ imulW8 arg_a arg_b = do
+ (a_reg, a_code) <- getNonClobberedReg arg_a
+ b_code <- getAnyReg arg_b
+
+ let code = a_code `appOL` b_code eax `appOL`
+ toOL [ IMUL2 format (OpReg a_reg) ]
+ format = intFormat W8
+
+ return (Fixed format eax code)
+
+
+ imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ imulMayOflo rep a b = do
+ (a_reg, a_code) <- getNonClobberedReg a
+ b_code <- getAnyReg b
+ let
+ shift_amt = case rep of
+ W32 -> 31
+ W64 -> 63
+ _ -> panic "shift_amt"
+
+ format = intFormat rep
+ code = a_code `appOL` b_code eax `appOL`
+ toOL [
+ IMUL2 format (OpReg a_reg), -- result in %edx:%eax
+ SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
+ -- sign extend lower part
+ SUB format (OpReg edx) (OpReg eax)
+ -- compare against upper
+ -- eax==0 if high part == sign extended low part
+ ]
+ return (Fixed format eax code)
+
+ --------------------
+ shift_code :: Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+ {- Case1: shift length as immediate -}
+ shift_code width instr x (CmmLit lit) = do
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ code dst
+ = x_code dst `snocOL`
+ instr format (OpImm (litToImm lit)) (OpReg dst)
+ return (Any format code)
+
+ {- Case2: shift length is complex (non-immediate)
+ * y must go in %ecx.
+ * we cannot do y first *and* put its result in %ecx, because
+ %ecx might be clobbered by x.
+ * if we do y second, then x cannot be
+ in a clobbered reg. Also, we cannot clobber x's reg
+ with the instruction itself.
+ * so we can either:
+ - do y first, put its result in a fresh tmp, then copy it to %ecx later
+ - do y second and put its result into %ecx. x gets placed in a fresh
+ tmp. This is likely to be better, because the reg alloc can
+ eliminate this reg->reg move here (it won't eliminate the other one,
+ because the move is into the fixed %ecx).
+ -}
+ shift_code width instr x y{-amount-} = do
+ x_code <- getAnyReg x
+ let format = intFormat width
+ tmp <- getNewRegNat format
+ y_code <- getAnyReg y
+ let
+ code = x_code tmp `appOL`
+ y_code ecx `snocOL`
+ instr format (OpReg ecx) (OpReg tmp)
+ return (Fixed format tmp code)
+
+ --------------------
+ add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ add_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger y = add_int rep x y
+ add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
+ where format = intFormat rep
+ -- TODO: There are other interesting patterns we want to replace
+ -- with a LEA, e.g. `(x + offset) + (y << shift)`.
+
+ --------------------
+ sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ sub_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger (-y) = add_int rep x (-y)
+ sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
+
+ -- our three-operand add instruction:
+ add_int width x y = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ format = intFormat width
+ imm = ImmInt (fromInteger y)
+ code dst
+ = x_code `snocOL`
+ LEA format
+ (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
+ (OpReg dst)
+ --
+ return (Any format code)
+
+ ----------------------
+
+ -- See Note [DIV/IDIV for bytes]
+ div_code W8 signed quotient x y = do
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ div_code
+ W16
+ signed
+ quotient
+ (CmmMachOp widen [x])
+ (CmmMachOp widen [y])
+
+ div_code width signed quotient x y = do
+ (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg edx) (OpReg edx)
+
+ instr | signed = IDIV
+ | otherwise = DIV
+
+ code = y_code `appOL`
+ x_code eax `appOL`
+ toOL [widen, instr format y_op]
+
+ result | quotient = eax
+ | otherwise = edx
+
+ return (Fixed format result code)
+
+
+getRegister' _ _ (CmmLoad mem pk)
+ | isFloatType pk
+ = do
+ Amode addr mem_code <- getAmode mem
+ loadFloatAmode (typeWidth pk) addr mem_code
+
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | is32Bit && not (isWord64 pk)
+ = do
+ code <- intLoadCode instr mem
+ return (Any format code)
+ where
+ width = typeWidth pk
+ format = intFormat width
+ instr = case width of
+ W8 -> MOVZxL II8
+ _other -> MOV format
+ -- We always zero-extend 8-bit loads, if we
+ -- can't think of anything better. This is because
+ -- we can't guarantee access to an 8-bit variant of every register
+ -- (esi and edi don't have 8-bit variants), so to make things
+ -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
+-- Simpler memory load code on x86_64
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | not is32Bit
+ = do
+ code <- intLoadCode (MOV format) mem
+ return (Any format code)
+ where format = intFormat $ typeWidth pk
+
+getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
+ = let
+ format = intFormat width
+
+ -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
+ format1 = if is32Bit then format
+ else case format of
+ II64 -> II32
+ _ -> format
+ code dst
+ = unitOL (XOR format1 (OpReg dst) (OpReg dst))
+ in
+ return (Any format code)
+
+ -- optimisation for loading small literals on x86_64: take advantage
+ -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+ -- instruction forms are shorter.
+getRegister' dflags is32Bit (CmmLit lit)
+ | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
+ = let
+ imm = litToImm lit
+ code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
+ in
+ return (Any II64 code)
+ where
+ isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
+ isBigLit _ = False
+ -- note1: not the same as (not.is32BitLit), because that checks for
+ -- signed literals that fit in 32 bits, but we want unsigned
+ -- literals here.
+ -- note2: all labels are small, because we're assuming the
+ -- small memory model (see gcc docs, -mcmodel=small).
+
+getRegister' dflags _ (CmmLit lit)
+ = do let format = cmmTypeFormat (cmmLitType dflags lit)
+ imm = litToImm lit
+ code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+ return (Any format code)
+
+getRegister' _ _ other
+ | isVecExpr other = needLlvm
+ | otherwise = pprPanic "getRegister(x86)" (ppr other)
+
+
+intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
+ -> NatM (Reg -> InstrBlock)
+intLoadCode instr mem = do
+ Amode src mem_code <- getAmode mem
+ return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
+
+-- Compute an expression into *any* register, adding the appropriate
+-- move instruction if necessary.
+getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
+getAnyReg expr = do
+ r <- getRegister expr
+ anyReg r
+
+anyReg :: Register -> NatM (Reg -> InstrBlock)
+anyReg (Any _ code) = return code
+anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
+
+-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
+-- Fixed registers might not be byte-addressable, so we make sure we've
+-- got a temporary, inserting an extra reg copy if necessary.
+getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getByteReg expr = do
+ is32Bit <- is32BitPlatform
+ if is32Bit
+ then do r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ | isVirtualReg reg -> return (reg,code)
+ | otherwise -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ -- ToDo: could optimise slightly by checking for
+ -- byte-addressable real registers, but that will
+ -- happen very rarely if at all.
+ else getSomeReg expr -- all regs are byte-addressable on x86_64
+
+-- Another variant: this time we want the result in a register that cannot
+-- be modified by code to evaluate an arbitrary expression.
+getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getNonClobberedReg expr = do
+ dflags <- getDynFlags
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ -- only certain regs can be clobbered
+ | reg `elem` instrClobberedRegs (targetPlatform dflags)
+ -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ | otherwise ->
+ return (reg, code)
+
+reg2reg :: Format -> Reg -> Reg -> Instr
+reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
+
+
+--------------------------------------------------------------------------------
+getAmode :: CmmExpr -> NatM Amode
+getAmode e = do is32Bit <- is32BitPlatform
+ getAmode' is32Bit e
+
+getAmode' :: Bool -> CmmExpr -> NatM Amode
+getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags
+ getAmode $ mangleIndexTree dflags r n
+
+getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit
+ = return $ Amode (ripRel (litToImm displacement)) nilOL
+
+
+-- This is all just ridiculous, since it carefully undoes
+-- what mangleIndexTree has just done.
+getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = ImmInt (-(fromInteger i))
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = litToImm lit
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
+-- recognised by the next rule.
+getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
+ b@(CmmLit _)])
+ = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])
+
+-- Matches: (x + offset) + (y << shift)
+getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset,
+ CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode (CmmReg x) y shift (fromIntegral offset)
+
+getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode x y shift 0
+
+getAmode' _ (CmmMachOp (MO_Add _)
+ [x, CmmMachOp (MO_Add _)
+ [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
+ CmmLit (CmmInt offset _)]])
+ | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
+ && is32BitInteger offset
+ = x86_complex_amode x y shift offset
+
+getAmode' _ (CmmMachOp (MO_Add _) [x,y])
+ = x86_complex_amode x y 0 0
+
+getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
+
+getAmode' _ expr = do
+ (reg,code) <- getSomeReg expr
+ return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
+
+-- | Like 'getAmode', but on 32-bit use simple register addressing
+-- (i.e. no index register). This stops us from running out of
+-- registers on x86 when using instructions such as cmpxchg, which can
+-- use up to three virtual registers and one fixed register.
+getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
+getSimpleAmode dflags is32Bit addr
+ | is32Bit = do
+ addr_code <- getAnyReg addr
+ addr_r <- getNewRegNat (intFormat (wordWidth dflags))
+ let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
+ return $! Amode amode (addr_code addr_r)
+ | otherwise = getAmode addr
+
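+-- | Build an addressing mode of the form @base + index*2^shift + offset@.
+-- The base expression is evaluated into a non-clobberable register so it
+-- stays valid while the index is computed.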
+x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
+x86_complex_amode base index shift offset
+ = do (x_reg, x_code) <- getNonClobberedReg base
+ -- x must be in a temp, because it has to stay live over y_code
+ -- we could compare x_reg and y_reg and do something better here...
+ (y_reg, y_code) <- getSomeReg index
+ let
+ code = x_code `appOL` y_code
+ base = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
+ n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg base) (ImmInt (fromIntegral offset)))
+ code)
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- getOperand: sometimes any operand will do.
+
+-- getNonClobberedOperand: the value of the operand will remain valid across
+-- the computation of an arbitrary expression, unless the expression
+-- is computed directly into a register which the operand refers to
+-- (see trivialCode where this function is used for an example).
+
+getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand (CmmLit lit) = do
+ if isSuitableFloatingPointLit lit
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getNonClobberedOperand_generic (CmmLit lit)
+
+getNonClobberedOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ -- this logic could be simplified
+ -- TODO FIXME
+ if (if is32Bit then not (isWord64 pk) else True)
+ -- if 32bit and pk is a float/double/simd value
+ -- or if 64bit
+ -- this could use some eyeballs or I'll need to stare at it more later
+ then do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ Amode src mem_code <- getAmode mem
+ (src',save_code) <-
+ if (amodeCouldBeClobbered platform src)
+ then do
+ tmp <- getNewRegNat (archWordFormat is32Bit)
+ return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
+ unitOL (LEA (archWordFormat is32Bit)
+ (OpAddr src)
+ (OpReg tmp)))
+ else
+ return (src, nilOL)
+ return (OpAddr src', mem_code `appOL` save_code)
+ else do
+ -- if it's a word or gcptr on 32bit?
+ getNonClobberedOperand_generic (CmmLoad mem pk)
+
+getNonClobberedOperand e = getNonClobberedOperand_generic e
+
+getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand_generic e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
+amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)
+
+regClobbered :: Platform -> Reg -> Bool
+regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
+regClobbered _ _ = False
+
+-- getOperand: the operand is not required to remain valid across the
+-- computation of an arbitrary expression.
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+
+getOperand (CmmLit lit) = do
+ use_sse2 <- sse2Enabled
+ if (use_sse2 && isSuitableFloatingPointLit lit)
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getOperand_generic (CmmLit lit)
+
+getOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else
+ getOperand_generic (CmmLoad mem pk)
+
+getOperand e = getOperand_generic e
+
+getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getOperand_generic e = do
+ (reg, code) <- getSomeReg e
+ return (OpReg reg, code)
+
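+-- | Whether the expression can be used directly as an instruction operand:
+-- a memory load, an integer literal representable in 32 bits, or a non-zero
+-- floating point literal (which we can reference from memory).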
+isOperand :: Bool -> CmmExpr -> Bool
+isOperand _ (CmmLoad _ _) = True
+isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
+ || isSuitableFloatingPointLit lit
+isOperand _ _ = False
+
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
+addAlignmentCheck :: Int -> Register -> Register
+addAlignmentCheck align reg =
+ case reg of
+ Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
+ Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg)
+ where
+ check :: Format -> Reg -> InstrBlock
+ check fmt reg =
+ ASSERT(not $ isFloatFormat fmt)
+ toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
+ , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
+ ]
+
+memConstant :: Alignment -> CmmLit -> NatM Amode
+memConstant align lit = do
+ lbl <- getNewLabelNat
+ let rosection = Section ReadOnlyData lbl
+ dflags <- getDynFlags
+ (addr, addr_code) <- if target32Bit (targetPlatform dflags)
+ then do dynRef <- cmmMakeDynamicReference
+ dflags
+ DataReference
+ lbl
+ Amode addr addr_code <- getAmode dynRef
+ return (addr, addr_code)
+ else return (ripRel (ImmCLbl lbl), nilOL)
+ let code =
+ LDATA rosection (align, RawCmmStatics lbl [CmmStaticLit lit])
+ `consOL` addr_code
+ return (Amode addr code)
+
+
+loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
+loadFloatAmode w addr addr_code = do
+ let format = floatFormat w
+ code dst = addr_code `snocOL`
+ MOV format (OpAddr addr) (OpReg dst)
+
+ return (Any format code)
+
+
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory. However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit :: CmmLit -> Bool
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
+getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
+getRegOrMem e@(CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+getRegOrMem e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+is32BitLit :: Bool -> CmmLit -> Bool
+is32BitLit is32Bit (CmmInt i W64)
+ | not is32Bit
+ = -- assume that labels are in the range 0-2^31-1: this assumes the
+ -- small memory model (see gcc docs, -mcmodel=small).
+ is32BitInteger i
+is32BitLit _ _ = True
+
+
+
+
+-- Set up a condition code for a conditional branch.
+
+getCondCode :: CmmExpr -> NatM CondCode
+
+-- yes, they really do seem to want exactly the same!
+
+getCondCode (CmmMachOp mop [x, y])
+ =
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt W32 -> condFltCode GTT y x
+ MO_F_Le W32 -> condFltCode GE y x
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode GTT y x
+ MO_F_Le W64 -> condFltCode GE y x
+
+ _ -> condIntCode (machOpToCond mop) x y
+
+getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)
+
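+-- | Map an integer comparison 'MachOp' to the corresponding condition code.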
+machOpToCond :: MachOp -> Cond
+machOpToCond mo = case mo of
+ MO_Eq _ -> EQQ
+ MO_Ne _ -> NE
+ MO_S_Gt _ -> GTT
+ MO_S_Ge _ -> GE
+ MO_S_Lt _ -> LTT
+ MO_S_Le _ -> LE
+ MO_U_Gt _ -> GU
+ MO_U_Ge _ -> GEU
+ MO_U_Lt _ -> LU
+ MO_U_Le _ -> LEU
+ _other -> pprPanic "machOpToCond" (pprMachOp mo)
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond x y = do is32Bit <- is32BitPlatform
+ condIntCode' is32Bit cond x y
+
+condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+-- memory vs immediate
+condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
+ | is32BitLit is32Bit lit = do
+ Amode x_addr x_code <- getAmode x
+ let
+ imm = litToImm lit
+ code = x_code `snocOL`
+ CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero, using a mask
+-- TODO: Add some sanity checking!!!!
+condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
+ | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
+ = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero
+condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs operand
+condIntCode' is32Bit cond x y
+ | isOperand is32Bit y = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL` y_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg)
+ return (CondCode False cond code)
+-- operand vs. anything: invert the comparison so that we can use a
+-- single comparison instruction.
+ | isOperand is32Bit x
+ , Just revcond <- maybeFlipCond cond = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getOperand x
+ let
+ code = y_code `appOL` x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg)
+ return (CondCode False revcond code)
+
+-- anything vs anything
+condIntCode' _ cond x y = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getRegOrMem x
+ let
+ code = y_code `appOL`
+ x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op
+ return (CondCode False cond code)
+
+
+
+--------------------------------------------------------------------------------
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+condFltCode cond x y
+ = condFltCode_sse2
+ where
+
+
+ -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+ -- an operand, but the right must be a reg. We can probably do better
+ -- than this general case...
+ condFltCode_sse2 = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL`
+ y_code `snocOL`
+ CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg)
+ -- NB(1): we need to use the unsigned comparison operators on the
+ -- result of this comparison.
+ return (CondCode True (condToUnsigned cond) code)
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+
+-- integer assignment to memory
+
+-- specific case of adding/subtracting an integer to a particular address.
+-- ToDo: catch other cases where we can use an operation directly on a memory
+-- address.
+assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
+ CmmLit (CmmInt i _)])
+ | addr == addr2, pk /= II64 || is32BitInteger i,
+ Just instr <- check op
+ = do Amode amode code_addr <- getAmode addr
+ let code = code_addr `snocOL`
+ instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
+ return code
+ where
+ check (MO_Add _) = Just ADD
+ check (MO_Sub _) = Just SUB
+ check _ = Nothing
+ -- ToDo: more?
+
+-- general case
+assignMem_IntCode pk addr src = do
+ is32Bit <- is32BitPlatform
+ Amode addr code_addr <- getAmode addr
+ (code_src, op_src) <- get_op_RI is32Bit src
+ let
+ code = code_src `appOL`
+ code_addr `snocOL`
+ MOV pk op_src (OpAddr addr)
+ -- NOTE: op_src is stable, so it will still be valid
+ -- after code_addr. This may involve the introduction
+ -- of an extra MOV to a temporary register, but we hope
+ -- the register allocator will get rid of it.
+ --
+ return code
+ where
+ get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operator
+ get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (nilOL, OpImm (litToImm lit))
+ get_op_RI _ op
+ = do (reg,code) <- getNonClobberedReg op
+ return (code, OpReg reg)
+
+
+-- Assign; dst is a reg, rhs is mem
+assignReg_IntCode pk reg (CmmLoad src _) = do
+ load_code <- intLoadCode (MOV pk) src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (load_code (getRegisterReg platform reg))
+
+-- dst is a reg, but src could be anything
+assignReg_IntCode _ reg src = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code <- getAnyReg src
+ return (code (getRegisterReg platform reg))
+
+
+-- Floating point assignment to memory
+assignMem_FltCode pk addr src = do
+ (src_reg, src_code) <- getNonClobberedReg src
+ Amode addr addr_code <- getAmode addr
+ let
+ code = src_code `appOL`
+ addr_code `snocOL`
+ MOV pk (OpReg src_reg) (OpAddr addr)
+
+ return code
+
+-- Floating point assignment to a register/temporary
+assignReg_FltCode _ reg src = do
+ src_code <- getAnyReg src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (src_code (getRegisterReg platform reg))
+
+
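+-- | Generate an unconditional jump to a computed or literal target; the
+-- @[Reg]@ argument carries the registers live going into the jump.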
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
+
+genJump (CmmLoad mem _) regs = do
+ Amode target code <- getAmode mem
+ return (code `snocOL` JMP (OpAddr target) regs)
+
+genJump (CmmLit lit) regs = do
+ return (unitOL (JMP (OpImm (litToImm lit)) regs))
+
+genJump expr regs = do
+ (reg,code) <- getSomeReg expr
+ return (code `snocOL` JMP (OpReg reg) regs)
+
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+
+genBranch :: BlockId -> InstrBlock
+genBranch = toOL . mkJumpInstr
+
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps/branches
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+
+I386: First, we have to ensure that the condition
+codes are set according to the supplied comparison operation.
+-}
+
+
+genCondBranch
+ :: BlockId -- the source of the jump
+ -> BlockId -- the true branch target
+ -> BlockId -- the false branch target
+ -> CmmExpr -- the condition on which to branch
+ -> NatM InstrBlock -- Instructions
+
+genCondBranch bid id false expr = do
+ is32Bit <- is32BitPlatform
+ genCondBranch' is32Bit bid id false expr
+
+-- | We return the instructions generated.
+genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
+ -> NatM InstrBlock
+
+-- 64-bit integer comparisons on 32-bit
+genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
+ | is32Bit, Just W64 <- maybeIntComparison mop = do
+ ChildCode64 code1 r1_lo <- iselExpr64 e1
+ ChildCode64 code2 r2_lo <- iselExpr64 e2
+ let r1_hi = getHiVRegFromLo r1_lo
+ r2_hi = getHiVRegFromLo r2_lo
+ cond = machOpToCond mop
+ Just cond' = maybeFlipCond cond
+ --TODO: Update CFG for x86
+ let code = code1 `appOL` code2 `appOL` toOL [
+ CMP II32 (OpReg r2_hi) (OpReg r1_hi),
+ JXX cond true,
+ JXX cond' false,
+ CMP II32 (OpReg r2_lo) (OpReg r1_lo),
+ JXX cond true] `appOL` genBranch false
+ return code
+
+genCondBranch' _ bid id false bool = do
+ CondCode is_float cond cond_code <- getCondCode bool
+ use_sse2 <- sse2Enabled
+ if not is_float || not use_sse2
+ then
+ return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
+ else do
+ -- See Note [SSE Parity Checks]
+ let jmpFalse = genBranch false
+ code
+ = case cond of
+ NE -> or_unordered
+ GU -> plain_test
+ GEU -> plain_test
+ -- Use ASSERT so we don't break releases if
+ -- LTT/LE creep in somehow.
+ LTT ->
+ ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered
+ LE ->
+ ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered
+ _ -> and_ordered
+
+ plain_test = unitOL (
+ JXX cond id
+ ) `appOL` jmpFalse
+ or_unordered = toOL [
+ JXX cond id,
+ JXX PARITY id
+ ] `appOL` jmpFalse
+ and_ordered = toOL [
+ JXX PARITY false,
+ JXX cond id,
+ JXX ALWAYS false
+ ]
+ updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
+ return (cond_code `appOL` code)
+
+{- Note [Introducing cfg edges inside basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ During instruction selection a statement `s`
+ in a block B with control of the sort: B -> C
+ will sometimes result in control
+ flow of the sort:
+
+ ┌ < ┐
+ v ^
+ B -> B1 ┴ -> C
+
+ as is the case for some atomic operations.
+
+ Now to keep the CFG in sync when introducing B1 we clearly
+ want to insert it between B and C. However there is
+ a catch when we have to deal with self loops.
+
+ We might start with code and a CFG of these forms:
+
+ loop:
+ stmt1 ┌ < ┐
+ .... v ^
+ stmtX loop ┘
+ stmtY
+ ....
+ goto loop:
+
+ Now we introduce B1:
+ ┌ ─ ─ ─ ─ ─┐
+ loop: │ ┌ < ┐ │
+ instrs v │ │ ^
+ .... loop ┴ B1 ┴ ┘
+ instrsFromX
+ stmtY
+ goto loop:
+
+  This is simple: all outgoing edges from loop now simply
+  start from B1 instead, and the code generator knows which
+  new edges it introduced for the self loop of B1.
+
+ Disaster strikes if the statement Y follows the same pattern.
+ If we apply the same rule that all outgoing edges change then
+ we end up with:
+
+ loop ─> B1 ─> B2 ┬─┐
+ │ │ └─<┤ │
+ │ └───<───┘ │
+ └───────<────────┘
+
+ This is problematic. The edge B1->B1 is modified as expected.
+ However the modification is wrong!
+
+ The assembly in this case looked like this:
+
+ _loop:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+ cmpxchgq ...
+ jne _B2
+ <instrs>
+ jmp loop
+
+ There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
+
+ The problem here is that really B1 should be two basic blocks.
+ Otherwise we have control flow in the *middle* of a basic block.
+ A contradiction!
+
+ So to account for this we add yet another basic block marker:
+
+ _B:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ jmp _B1'
+ _B1':
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+
+ Now when inserting B2 we will only look at the outgoing edges of B1' and
+ everything will work out nicely.
+
+ You might also wonder why we don't insert jumps at the end of _B1'. There is
+ no way another block ends up jumping to the labels _B1 or _B2 since they are
+ essentially invisible to other blocks. View them as control flow labels local
+ to the basic block if you'd like.
+
+ Not doing this ultimately caused (part 2 of) #17334.
+-}
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+-- Now the biggest nightmare---calls. Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations. Apart from that, the code is easy.
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+--
+-- See Note [Keeping track of the current block] for information why we need
+-- to take/return a block id.
+
+genCCall
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM (InstrBlock, Maybe BlockId)
+
+-- First we deal with cases which might introduce new blocks in the stream.
+
+genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
+ [dst] [addr, n] bid = do
+ Amode amode addr_code <-
+ if amop `elem` [AMO_Add, AMO_Sub]
+ then getAmode addr
+ else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
+ arg <- getNewRegNat format
+ arg_code <- getAnyReg n
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ (code, lbl) <- op_code dst_r arg amode
+ return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
+ where
+ -- Code for the operation
+ op_code :: Reg -- Destination reg
+ -> Reg -- Register containing argument
+ -> AddrMode -- Address of location to mutate
+ -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
+ op_code dst_r arg amode = case amop of
+ -- In the common case where dst_r is a virtual register the
+ -- final move should go away, because it's the last use of arg
+ -- and the first use of dst_r.
+ AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg)
+ , LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ -- In these cases we need a new block id, and have to return it so
+ -- that later instruction selection can reference it.
+ AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
+ AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
+ , NOT format dst
+ ])
+ AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
+ AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
+ where
+ -- Simulate operation that lacks a dedicated instruction using
+ -- cmpxchg.
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
+ -> NatM (OrdList Instr, BlockId)
+ cmpxchg_code instrs = do
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp <- getNewRegNat format
+
+            -- Record inserted blocks
+ -- We turn A -> B into A -> A' -> A'' -> B
+ -- with a self loop on A'.
+ addImmediateSuccessorNat bid lbl1
+ addImmediateSuccessorNat lbl1 lbl2
+ updateCfgNat (addWeightEdge lbl1 lbl1 0)
+
+ return $ (toOL
+ [ MOV format (OpAddr amode) (OpReg eax)
+ , JXX ALWAYS lbl1
+ , NEWBLOCK lbl1
+ -- Keep old value so we can return it:
+ , MOV format (OpReg eax) (OpReg dst_r)
+ , MOV format (OpReg eax) (OpReg tmp)
+ ]
+ `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
+ [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
+ , JXX NE lbl1
+ -- See Note [Introducing cfg edges inside basic blocks]
+ -- why this basic block is required.
+ , JXX ALWAYS lbl2
+ , NEWBLOCK lbl2
+ ],
+ lbl2)
+ format = intFormat width
+
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
+ | is32Bit, width == W64 = do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ let format = if width == W8 then II16 else intFormat width
+ tmp_r <- getNewRegNat format
+
+ -- New CFG Edges:
+ -- bid -> lbl2
+ -- bid -> lbl1 -> lbl2
+    --  We also change edges originating at bid to start at lbl2 instead.
+ updateCfgNat (addWeightEdge bid lbl1 110 .
+ addWeightEdge lbl1 lbl2 110 .
+ addImmediateSuccessor bid lbl2)
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ let !instrs = vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
+ return (instrs, Just lbl2)
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ let instrs = appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r)
+ , TZCNT II32 (OpReg src_r) dst_r
+ ]
+ W16 -> toOL
+ [ TZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
+ ]
+ _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
+ return (instrs, Nothing)
+ else do
+ -- The following insn sequence makes sure 'ctz 0' has a defined value.
+        -- Starting with Haswell, one could use the TZCNT insn instead.
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ let !instrs = code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ return (instrs, Nothing)
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+
+genCCall dflags bits mop dst args bid = do
+ instr <- genCCall' dflags bits mop dst args bid
+ return (instr, Nothing)
+
+-- genCCall' handles cases not introducing new code blocks.
+genCCall'
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM InstrBlock
+
+-- Unroll memcpy calls if the number of bytes to copy isn't too
+-- large. Otherwise, call C's memcpy.
+genCCall' dflags _ (PrimTarget (MO_Memcpy align)) _
+ [dst, src, CmmLit (CmmInt n _)] _
+ | fromInteger insns <= maxInlineMemcpyInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_dst dst_r `appOL` code_src src_r `appOL`
+ go dst_r src_r tmp_r (fromInteger n)
+ where
+ -- The number of instructions we will generate (approx). We need 2
+ -- instructions per move.
+ insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
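+    -- As an illustrative check of the formula above: copying n = 13 bytes
+    -- with an effective alignment of 4 bytes gives sizeBytes = 4, so
+    -- insns = 2 * ((13 + 3) `div` 4) = 8: three word-sized load/store pairs
+    -- plus one byte-sized pair for the tail, emitted by 'go' below.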
+
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+ go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
+ go dst src tmp i
+ | i >= sizeBytes =
+ unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - sizeBytes)
+ -- Deal with remaining bytes.
+ | i >= 4 = -- Will never happen on 32-bit
+ unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 4)
+ | i >= 2 =
+ unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 2)
+ | i >= 1 =
+ unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 1)
+ | otherwise = nilOL
+ where
+ src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
+ (ImmInteger (n - i))
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
+ (ImmInteger (n - i))
+
+genCCall' dflags _ (PrimTarget (MO_Memset align)) _
+ [dst,
+ CmmLit (CmmInt c _),
+ CmmLit (CmmInt n _)]
+ _
+ | fromInteger insns <= maxInlineMemsetInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ if format == II64 && n >= 8 then do
+ code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+ imm8byte_r <- getNewRegNat II64
+ return $ code_dst dst_r `appOL`
+ code_imm8byte imm8byte_r `appOL`
+ go8 dst_r imm8byte_r (fromInteger n)
+ else
+ return $ code_dst dst_r `appOL`
+ go4 dst_r (fromInteger n)
+ where
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+ c2 = c `shiftL` 8 .|. c
+ c4 = c2 `shiftL` 16 .|. c2
+ c8 = c4 `shiftL` 32 .|. c4
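+    -- Illustrative example of the replication above: for a fill byte
+    -- c = 0xAB this gives c2 = 0xABAB, c4 = 0xABABABAB and
+    -- c8 = 0xABABABABABABABAB, the constants used by the wider MOVs below.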
+
+    -- The number of instructions we will generate (approx). We need 1
+    -- instruction per move.
+ insns = (n + sizeBytes - 1) `div` sizeBytes
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+    -- Depending on the size, returns the widest MOV instruction and its
+ -- width.
+ gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+ gen4 addr size
+ | size >= 4 =
+ (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+ | size >= 2 =
+ (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+ | size >= 1 =
+ (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+ | otherwise = (nilOL, 0)
+
+ -- Generates a 64-bit wide MOV instruction from REG to MEM.
+ gen8 :: AddrMode -> Reg -> InstrBlock
+ gen8 addr reg8byte =
+ unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+ -- Unrolls memset when the widest MOV is <= 4 bytes.
+ go4 :: Reg -> Integer -> InstrBlock
+ go4 dst left =
+ if left <= 0 then nilOL
+ else curMov `appOL` go4 dst (left - curWidth)
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+ (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+ -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+ -- argument). Falls back to go4 when all 8 byte moves are
+ -- exhausted.
+ go8 :: Reg -> Reg -> Integer -> InstrBlock
+ go8 dst reg8byte left =
+ if possibleWidth >= 8 then
+ let curMov = gen8 dst_addr reg8byte
+ in curMov `appOL` go8 dst reg8byte (left - 8)
+ else go4 dst left
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+
+genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
+        -- Barriers compile to no code on x86/x86-64;
+        -- we keep them this far through the pipeline to prevent earlier
+        -- optimisations from reordering memory operations across them.
+
+genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
+
+genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
+ case n of
+ 0 -> genPrefetch src $ PREFETCH NTA format
+ 1 -> genPrefetch src $ PREFETCH Lvl2 format
+ 2 -> genPrefetch src $ PREFETCH Lvl1 format
+ 3 -> genPrefetch src $ PREFETCH Lvl0 format
+ l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
+  -- the C / LLVM prefetch convention uses locality levels 0, 1, 2 and 3;
+  -- the corresponding x86 hint levels are NTA, Lvl2, Lvl1 and Lvl0
+ where
+ format = archWordFormat is32bit
+ -- need to know what register width for pointers!
+ genPrefetch inRegSrc prefetchCTor =
+ do
+ code_src <- getAnyReg inRegSrc
+ src_r <- getNewRegNat format
+ return $ code_src src_r `appOL`
+ (unitOL (prefetchCTor (OpAddr
+ ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
+ -- prefetch always takes an address
+
+genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
+ let platform = targetPlatform dflags
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ case width of
+ W64 | is32Bit -> do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let dst_rhi = getHiVRegFromLo dst_r
+ rhi = getHiVRegFromLo rlo
+ return $ vcode `appOL`
+ toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
+ MOV II32 (OpReg rhi) (OpReg dst_r),
+ BSWAP II32 dst_rhi,
+ BSWAP II32 dst_r ]
+ W16 -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL`
+ unitOL (BSWAP II32 dst_r) `appOL`
+ unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
+ _ -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
+ where
+ format = intFormat width
+
+genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
+ args@[src] bid = do
+ sse4_2 <- sse4_2Enabled
+ let platform = targetPlatform dflags
+ if sse4_2
+ then do code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL`
+ (if width == W8 then
+ -- The POPCNT instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
+ unitOL (POPCNT II16 (OpReg src_r) dst_r)
+ else
+ unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PDEP instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+ -- The PEXT instruction doesn't take a r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
+ | is32Bit && width == W64 = do
+ -- Fallback to `hs_clz64` on i386
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ return $ appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
+ , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
+ , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
+ ]
+ W16 -> toOL
+ [ LZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
+ ]
+ _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
+ else do
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSR format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+ lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
+
+genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
+ load_code <- intLoadCode (MOV (intFormat width)) addr
+ let platform = targetPlatform dflags
+
+ return (load_code (getRegisterReg platform (CmmLocal dst)))
+
+genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
+ code <- assignMem_IntCode (intFormat width) addr val
+ return $ code `snocOL` MFENCE
+
+genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
+ -- On x86 we don't have enough registers to use cmpxchg with a
+ -- complicated addressing mode, so on that architecture we
+ -- pre-compute the address first.
+ Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
+ newval <- getNewRegNat format
+ newval_code <- getAnyReg new
+ oldval <- getNewRegNat format
+ oldval_code <- getAnyReg old
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ code = toOL
+ [ MOV format (OpReg oldval) (OpReg eax)
+ , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
+ , MOV format (OpReg eax) (OpReg dst_r)
+ ]
+ return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
+ `appOL` code
+ where
+ format = intFormat width
+
+genCCall' _ is32Bit target dest_regs args bid = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case (target, dest_regs) of
+ -- void return type prim op
+ (PrimTarget op, []) ->
+ outOfLineCmmOp bid op Nothing args
+ -- we only cope with a single result for foreign calls
+ (PrimTarget op, [r]) -> case op of
+ MO_F32_Fabs -> case args of
+ [x] -> sse2FabsCode W32 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+ MO_F64_Fabs -> case args of
+ [x] -> sse2FabsCode W64 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+
+ MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
+ MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
+ _other_op -> outOfLineCmmOp bid op (Just r) args
+
+ where
+ actuallyInlineSSE2Op = actuallyInlineFloatOp'
+
+ actuallyInlineFloatOp' instr format [x]
+ = do res <- trivialUFCode format (instr format) x
+ any <- anyReg res
+ return (any (getRegisterReg platform (CmmLocal r)))
+
+ actuallyInlineFloatOp' _ _ args
+ = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
+ ++ show (length args) ++ ")"
+
+ sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
+ sse2FabsCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ let
+ const | FF32 <- fmt = CmmInt 0x7fffffff W32
+ | otherwise = CmmInt 0x7fffffffffffffff W64
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ AND fmt (OpReg tmp) (OpReg dst)
+ ]
+
+ return $ code (getRegisterReg platform (CmmLocal r))
+
+ (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
+ (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
+ (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
+ (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
+ let format = intFormat width
+ lCode <- anyReg =<< trivialCode width (ADD_CC format)
+ (Just (ADD_CC format)) arg_x arg_y
+ let reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ code = hCode reg_h `appOL`
+ lCode reg_l `snocOL`
+ ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for add2"
+ (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
+ (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
+ (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [MUL2 format y_reg,
+ MOV format (OpReg rdx) (OpReg reg_h),
+ MOV format (OpReg rax) (OpReg reg_l)]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for mul2"
+ (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ reg_tmp <- getNewRegNat II8
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [ IMUL2 format y_reg
+ , MOV format (OpReg rdx) (OpReg reg_h)
+ , MOV format (OpReg rax) (OpReg reg_l)
+ , SETCC CARRY (OpReg reg_tmp)
+ , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+ ]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for imul2"
+
+ _ -> if is32Bit
+ then genCCall32' dflags target dest_regs args
+ else genCCall64' dflags target dest_regs args
+
+ where divOp1 platform signed width results [arg_x, arg_y]
+ = divOp platform signed width results Nothing arg_x arg_y
+ divOp1 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp1"
+ divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
+ = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
+ divOp2 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp2"
+
+ -- See Note [DIV/IDIV for bytes]
+ divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ arg_x_low_16 = CmmMachOp widen [arg_x_low]
+ arg_y_16 = CmmMachOp widen [arg_y]
+ m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
+ in divOp
+ platform signed W16 [res_q, res_r]
+ m_arg_x_high_16 arg_x_low_16 arg_y_16
+
+ divOp platform signed width [res_q, res_r]
+ m_arg_x_high arg_x_low arg_y
+ = do let format = intFormat width
+ reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg rdx) (OpReg rdx)
+ instr | signed = IDIV
+ | otherwise = DIV
+ (y_reg, y_code) <- getRegOrMem arg_y
+ x_low_code <- getAnyReg arg_x_low
+ x_high_code <- case m_arg_x_high of
+ Just arg_x_high ->
+ getAnyReg arg_x_high
+ Nothing ->
+ return $ const $ unitOL widen
+ return $ y_code `appOL`
+ x_low_code rax `appOL`
+ x_high_code rdx `appOL`
+ toOL [instr format y_reg,
+ MOV format (OpReg rax) (OpReg reg_q),
+ MOV format (OpReg rdx) (OpReg reg_r)]
+ divOp _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of results for divOp"
+
+ addSubIntC platform instr mrevinstr cond width
+ res_r res_c [arg_x, arg_y]
+ = do let format = intFormat width
+ rCode <- anyReg =<< trivialCode width (instr format)
+ (mrevinstr format) arg_x arg_y
+ reg_tmp <- getNewRegNat II8
+ let reg_c = getRegisterReg platform (CmmLocal res_c)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ code = rCode reg_r `snocOL`
+ SETCC cond (OpReg reg_tmp) `snocOL`
+ MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+
+ return code
+ addSubIntC _ _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments/results for addSubIntC"
+
+-- Note [DIV/IDIV for bytes]
+--
+-- IDIV reminder:
+-- Size Dividend Divisor Quotient Remainder
+-- byte %ax r/m8 %al %ah
+-- word %dx:%ax r/m16 %ax %dx
+-- dword %edx:%eax r/m32 %eax %edx
+-- qword %rdx:%rax r/m64 %rax %rdx
+--
+-- We do a special case for the byte division because the current
+-- codegen doesn't deal well with accessing %ah register (also,
+-- accessing %ah in 64-bit mode is complicated because it cannot be an
+-- operand of many instructions). So we just widen operands to 16 bits
+-- and get the results from %al, %dl. This is not optimal, but a few
+-- register moves are probably not a huge deal when doing division.
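+--
+-- As an illustrative sketch (not verbatim Cmm), an unsigned byte division
+--
+--   (q, r) = x `quotRem` y                 -- at W8
+--
+-- is emitted as if it had been written
+--
+--   (q, r) = zx16 x `quotRem` zx16 y       -- at W16, where zx16 is MO_UU_Conv W8 W16
+--
+-- so DIV sees a 16-bit divisor and the byte results are taken from the low
+-- halves of %ax and %dx (i.e. %al and %dl). The signed case widens with
+-- MO_SS_Conv W8 W16 instead.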
+
+genCCall32' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall32' dflags target dest_regs args = do
+ let
+ prom_args = map (maybePromoteCArg dflags W32) args
+
+ -- Align stack to 16n for calls, assuming a starting stack
+            -- alignment of 16n - word_size on procedure entry, which we
+            -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ sizes = map (arg_size_bytes . cmmExprType dflags) (reverse args)
+ raw_arg_size = sum sizes + wORD_SIZE dflags
+ arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
+ tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
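+            -- Illustrative example of the arithmetic above (4-byte word):
+            -- for two W32 arguments, sizes = [4,4], raw_arg_size = 8 + 4 = 12,
+            -- arg_pad_size = 16 - 12 = 4 and tot_arg_size = 12, so after the
+            -- 4 bytes of padding and 8 bytes of arguments %esp is 16-byte
+            -- aligned at the CALL.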
+ delta0 <- getDeltaNat
+ setDeltaNat (delta0 - arg_pad_size)
+
+ push_codes <- mapM push_arg (reverse prom_args)
+ delta <- getDeltaNat
+ MASSERT(delta == delta0 - tot_arg_size)
+
+ -- deal with static vs dynamic call targets
+ (callinsns,cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) []), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do { (dyn_r, dyn_c) <- getSomeReg expr
+ ; ASSERT( isWord32 (cmmExprType dflags expr) )
+ return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
+ PrimTarget _
+ -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
+ ++ "probably because too many return values."
+
+ let push_code
+ | arg_pad_size /= 0
+ = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
+ DELTA (delta0 - arg_pad_size)]
+ `appOL` concatOL push_codes
+ | otherwise
+ = concatOL push_codes
+
+ -- Deallocate parameters after call for ccall;
+ -- but not for stdcall (callee does it)
+ --
+ -- We have to pop any stack padding we added
+ -- even if we are doing stdcall, though (#5052)
+ pop_size
+ | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
+ | otherwise = tot_arg_size
+
+ call = callinsns `appOL`
+ toOL (
+ (if pop_size==0 then [] else
+ [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
+ ++
+ [DELTA delta0]
+ )
+ setDeltaNat delta0
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest]
+ | isFloatType ty =
+ -- we assume SSE2
+ let tmp_amode = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ fmt = floatFormat w
+ in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA (delta0 - b),
+ X87Store fmt tmp_amode,
+ -- X87Store only supported for the CDECL ABI
+ -- NB: This code will need to be
+                        -- revisited once GHC does more work around
+ -- SIGFPE f
+ MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
+ ADD II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA delta0]
+ | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
+ MOV II32 (OpReg edx) (OpReg r_dest_hi)]
+ | otherwise = unitOL (MOV (intFormat w)
+ (OpReg eax)
+ (OpReg r_dest))
+ where
+ ty = localRegType dest
+ w = typeWidth ty
+ b = widthInBytes w
+ r_dest_hi = getHiVRegFromLo r_dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
+
+ return (push_code `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where
+ -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
+ arg_size_bytes :: CmmType -> Int
+ arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
+
+ roundTo a x | x `mod` a == 0 = x
+ | otherwise = x + a - (x `mod` a)
+
+ push_arg :: CmmActual {-current argument-}
+ -> NatM InstrBlock -- code
+
+ push_arg arg -- we don't need the hints on x86
+ | isWord64 arg_ty = do
+ ChildCode64 code r_lo <- iselExpr64 arg
+ delta <- getDeltaNat
+ setDeltaNat (delta - 8)
+ let r_hi = getHiVRegFromLo r_lo
+ return ( code `appOL`
+ toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
+ PUSH II32 (OpReg r_lo), DELTA (delta - 8),
+ DELTA (delta-8)]
+ )
+
+ | isFloatType arg_ty = do
+ (reg, code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `appOL`
+ toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
+ DELTA (delta-size),
+ let addr = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ format = floatFormat (typeWidth arg_ty)
+ in
+
+ -- assume SSE2
+ MOV format (OpReg reg) (OpAddr addr)
+
+ ]
+ )
+
+ | otherwise = do
+ -- Arguments can be smaller than 32-bit, but we still use @PUSH
+ -- II32@ - the usual calling conventions expect integers to be
+ -- 4-byte aligned.
+ ASSERT((typeWidth arg_ty) <= W32) return ()
+ (operand, code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `snocOL`
+ PUSH II32 operand `snocOL`
+ DELTA (delta-size))
+
+ where
+ arg_ty = cmmExprType dflags arg
+ size = arg_size_bytes arg_ty -- Byte size
+
+genCCall64' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall64' dflags target dest_regs args = do
+ -- load up the register arguments
+ let prom_args = map (maybePromoteCArg dflags W32) args
+
+ (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
+ <-
+ if platformOS platform == OSMinGW32
+ then load_args_win prom_args [] [] (allArgRegs platform) nilOL
+ else do
+ (stack_args, aregs, fregs, load_args_code, assign_args_code)
+ <- load_args prom_args (allIntArgRegs platform)
+ (allFPArgRegs platform)
+ nilOL nilOL
+ let used_regs rs as = reverse (drop (length rs) (reverse as))
+ fregs_used = used_regs fregs (allFPArgRegs platform)
+ aregs_used = used_regs aregs (allIntArgRegs platform)
+ return (stack_args, aregs_used, fregs_used, load_args_code
+ , assign_args_code)
+
+ let
+ arg_regs_used = int_regs_used ++ fp_regs_used
+ arg_regs = [eax] ++ arg_regs_used
+                -- for annotating the call instruction with the registers it reads
+ sse_regs = length fp_regs_used
+ arg_stack_slots = if platformOS platform == OSMinGW32
+ then length stack_args + length (allArgRegs platform)
+ else length stack_args
+ tot_arg_size = arg_size * arg_stack_slots
+
+
+ -- Align stack to 16n for calls, assuming a starting stack
+    -- alignment of 16n - word_size on procedure entry, which we
+ -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ (real_size, adjust_rsp) <-
+ if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
+ then return (tot_arg_size, nilOL)
+ else do -- we need to adjust...
+ delta <- getDeltaNat
+ setDeltaNat (delta - wORD_SIZE dflags)
+ return (tot_arg_size + wORD_SIZE dflags, toOL [
+ SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - wORD_SIZE dflags) ])
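+    -- Illustrative example: with one stack argument tot_arg_size = 8, and
+    -- 8 + 8 = 16 is already a multiple of 16, so no adjustment is needed;
+    -- with two stack arguments 16 + 8 = 24 is not, so we subtract one extra
+    -- word from %rsp and real_size becomes 24.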
+
+ -- push the stack args, right to left
+ push_code <- push_args (reverse stack_args) nilOL
+ -- On Win64, we also have to leave stack space for the arguments
+ -- that we are passing in registers
+ lss_code <- if platformOS platform == OSMinGW32
+ then leaveStackSpace (length (allArgRegs platform))
+ else return nilOL
+ delta <- getDeltaNat
+
+ -- deal with static vs dynamic call targets
+ (callinsns,_cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) arg_regs), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do (dyn_r, dyn_c) <- getSomeReg expr
+ return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
+ PrimTarget _
+ -> panic $ "genCCall: Can't handle PrimTarget call type here, error "
+ ++ "probably because too many return values."
+
+ let
+ -- The x86_64 ABI requires us to set %al to the number of SSE2
+ -- registers that contain arguments, if the called routine
+ -- is a varargs function. We don't know whether it's a
+ -- varargs function or not, so we have to assume it is.
+ --
+ -- It's not safe to omit this assignment, even if the number
+ -- of SSE2 regs in use is zero. If %al is larger than 8
+ -- on entry to a varargs function, seg faults ensue.
+ assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
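+        -- Illustrative example (hypothetical call): for printf("%f %f", x, y)
+        -- with the two doubles passed in %xmm0/%xmm1 we have sse_regs = 2, so
+        -- assign_eax emits  movl $2, %eax  just before the CALL.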
+
+ let call = callinsns `appOL`
+ toOL (
+ -- Deallocate parameters after call for ccall;
+ -- stdcall has callee do it, but is not supported on
+ -- x86_64 target (see #3336)
+ (if real_size==0 then [] else
+ [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
+ ++
+ [DELTA (delta + real_size)]
+ )
+ setDeltaNat (delta + real_size)
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest] =
+ case typeWidth rep of
+ W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+ where
+ rep = localRegType dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code _many = panic "genCCall.assign_code many"
+
+ return (adjust_rsp `appOL`
+ push_code `appOL`
+ load_args_code `appOL`
+ assign_args_code `appOL`
+ lss_code `appOL`
+ assign_eax sse_regs `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where platform = targetPlatform dflags
+ arg_size = 8 -- always, at the mo
+
+
+ load_args :: [CmmExpr]
+ -> [Reg] -- int regs avail for args
+ -> [Reg] -- FP regs avail for args
+ -> InstrBlock -- code computing args
+ -> InstrBlock -- code assigning args to ABI regs
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ -- no more regs to use
+ load_args args [] [] code acode =
+ return (args, [], [], code, acode)
+
+ -- no more args to push
+ load_args [] aregs fregs code acode =
+ return ([], aregs, fregs, code, acode)
+
+ load_args (arg : rest) aregs fregs code acode
+ | isFloatType arg_rep = case fregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest aregs rs code' acode'
+ | otherwise = case aregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest rs fregs code' acode'
+ where
+
+ -- put arg into the list of stack pushed args
+ push_this_arg = do
+ (args',ars,frs,code',acode')
+ <- load_args rest aregs fregs code acode
+ return (arg:args', ars, frs, code', acode')
+
+ -- pass the arg into the given register
+ reg_this_arg r
+ -- "operand" args can be directly assigned into r
+ | isOperand False arg = do
+ arg_code <- getAnyReg arg
+ return (code, (acode `appOL` arg_code r))
+ -- The last non-operand arg can be directly assigned after its
+ -- computation without going into a temporary register
+ | all (isOperand False) rest = do
+ arg_code <- getAnyReg arg
+ return (code `appOL` arg_code r,acode)
+
+ -- other args need to be computed beforehand to avoid clobbering
+ -- previously assigned registers used to pass parameters (see
+ -- #11792, #12614). They are assigned into temporary registers
+ -- and get assigned to proper call ABI registers after they all
+ -- have been computed.
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ tmp <- getNewRegNat arg_fmt
+ let
+ code' = code `appOL` arg_code tmp
+ acode' = acode `snocOL` reg2reg arg_fmt tmp r
+ return (code',acode')
+
+ arg_rep = cmmExprType dflags arg
+ arg_fmt = cmmTypeFormat arg_rep
+
+ load_args_win :: [CmmExpr]
+ -> [Reg] -- used int regs
+ -> [Reg] -- used FP regs
+ -> [(Reg, Reg)] -- (int, FP) regs avail for args
+ -> InstrBlock
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ load_args_win args usedInt usedFP [] code
+ = return (args, usedInt, usedFP, code, nilOL)
+ -- no more regs to use
+ load_args_win [] usedInt usedFP _ code
+ = return ([], usedInt, usedFP, code, nilOL)
+ -- no more args to push
+ load_args_win (arg : rest) usedInt usedFP
+ ((ireg, freg) : regs) code
+ | isFloatType arg_rep = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) (freg : usedFP) regs
+ (code `appOL`
+ arg_code freg `snocOL`
+ -- If we are calling a varargs function
+ -- then we need to define ireg as well
+ -- as freg
+ MOV II64 (OpReg freg) (OpReg ireg))
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) usedFP regs
+ (code `appOL` arg_code ireg)
+ where
+ arg_rep = cmmExprType dflags arg
+
+ push_args [] code = return code
+ push_args (arg:rest) code
+ | isFloatType arg_rep = do
+ (arg_reg, arg_code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
+ DELTA (delta-arg_size),
+ MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
+ push_args rest code'
+
+ | otherwise = do
+ -- Arguments can be smaller than 64-bit, but we still use @PUSH
+ -- II64@ - the usual calling conventions expect integers to be
+ -- 8-byte aligned.
+ ASSERT(width <= W64) return ()
+ (arg_op, arg_code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ PUSH II64 arg_op,
+ DELTA (delta-arg_size)]
+ push_args rest code'
+ where
+ arg_rep = cmmExprType dflags arg
+ width = typeWidth arg_rep
+
+ leaveStackSpace n = do
+ delta <- getDeltaNat
+ setDeltaNat (delta - n * arg_size)
+ return $ toOL [
+ SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - n * arg_size)]
+
+maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr
+maybePromoteCArg dflags wto arg
+ | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
+ | otherwise = arg
+ where
+ wfrom = cmmExprWidth dflags arg
+
+outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
+ -> NatM InstrBlock
+outOfLineCmmOp bid mop res args
+ = do
+ dflags <- getDynFlags
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr
+ (ForeignConvention CCallConv [] [] CmmMayReturn)
+
+ -- We know foreign calls results in no new basic blocks, so we can ignore
+ -- the returned block id.
+ (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
+ return instrs
+ where
+ -- Assume we can call these functions directly, and that they're not in a dynamic library.
+      -- TODO: Why is this ok? Under linux this code will be in libm.so.
+      --       Is it because they're really implemented as primitive instructions by the assembler?? -- BL 2009/12/31
+ lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
+
+ fn = case mop of
+ MO_F32_Sqrt -> fsLit "sqrtf"
+ MO_F32_Fabs -> fsLit "fabsf"
+ MO_F32_Sin -> fsLit "sinf"
+ MO_F32_Cos -> fsLit "cosf"
+ MO_F32_Tan -> fsLit "tanf"
+ MO_F32_Exp -> fsLit "expf"
+ MO_F32_ExpM1 -> fsLit "expm1f"
+ MO_F32_Log -> fsLit "logf"
+ MO_F32_Log1P -> fsLit "log1pf"
+
+ MO_F32_Asin -> fsLit "asinf"
+ MO_F32_Acos -> fsLit "acosf"
+ MO_F32_Atan -> fsLit "atanf"
+
+ MO_F32_Sinh -> fsLit "sinhf"
+ MO_F32_Cosh -> fsLit "coshf"
+ MO_F32_Tanh -> fsLit "tanhf"
+ MO_F32_Pwr -> fsLit "powf"
+
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
+ MO_F64_Sqrt -> fsLit "sqrt"
+ MO_F64_Fabs -> fsLit "fabs"
+ MO_F64_Sin -> fsLit "sin"
+ MO_F64_Cos -> fsLit "cos"
+ MO_F64_Tan -> fsLit "tan"
+ MO_F64_Exp -> fsLit "exp"
+ MO_F64_ExpM1 -> fsLit "expm1"
+ MO_F64_Log -> fsLit "log"
+ MO_F64_Log1P -> fsLit "log1p"
+
+ MO_F64_Asin -> fsLit "asin"
+ MO_F64_Acos -> fsLit "acos"
+ MO_F64_Atan -> fsLit "atan"
+
+ MO_F64_Sinh -> fsLit "sinh"
+ MO_F64_Cosh -> fsLit "cosh"
+ MO_F64_Tanh -> fsLit "tanh"
+ MO_F64_Pwr -> fsLit "pow"
+
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
+ MO_Memcpy _ -> fsLit "memcpy"
+ MO_Memset _ -> fsLit "memset"
+ MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
+
+ MO_PopCnt _ -> fsLit "popcnt"
+ MO_BSwap _ -> fsLit "bswap"
+ {- Here the C implementation is used as there is no x86
+ instruction to reverse a word's bit order.
+ -}
+ MO_BRev w -> fsLit $ bRevLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz _ -> unsupported
+
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+
+ MO_AtomicRMW _ _ -> fsLit "atomicrmw"
+ MO_AtomicRead _ -> fsLit "atomicread"
+ MO_AtomicWrite _ -> fsLit "atomicwrite"
+ MO_Cmpxchg _ -> fsLit "cmpxchg"
+
+ MO_UF_Conv _ -> unsupported
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ (MO_Prefetch_Data _ ) -> unsupported
+ unsupported = panic ("outOfLineCmmOp: " ++ show mop
+ ++ " not supported here")
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+
+genSwitch dflags expr targets
+ | positionIndependent dflags
+ = do
+ (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
+ -- getNonClobberedReg because it needs to survive across t_code
+ lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ let is32bit = target32Bit (targetPlatform dflags)
+ os = platformOS (targetPlatform dflags)
+ -- Might want to use .rodata.<function we're in> instead, but as
+ -- long as it's something unique it'll work out since the
+ -- references to the jump table are in the appropriate section.
+ rosection = case os of
+ -- on Mac OS X/x86_64, put the jump table in the text section to
+ -- work around a limitation of the linker.
+ -- ld64 is unable to handle the relocations for
+ -- .quad L1 - L0
+ -- if L0 is not preceded by a non-anonymous label in its section.
+ OSDarwin | not is32bit -> Section Text lbl
+ _ -> Section ReadOnlyData lbl
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
+ (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))
+
+ offsetReg <- getNewRegNat (intFormat (wordWidth dflags))
+ return $ if is32bit || os == OSDarwin
+ then e_code `appOL` t_code `appOL` toOL [
+ ADD (intFormat (wordWidth dflags)) op (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ else -- HACK: On x86_64 binutils<2.17 is only able to generate
+ -- PC32 relocations, hence we only get 32-bit offsets in
+ -- the jump table. As these offsets are always negative
+ -- we need to properly sign extend them to 64-bit. This
+ -- hack should be removed in conjunction with the hack in
+ -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
+ e_code `appOL` t_code `appOL` toOL [
+ MOVSxL II32 op (OpReg offsetReg),
+ ADD (intFormat (wordWidth dflags))
+ (OpReg offsetReg)
+ (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ | otherwise
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ lbl <- getNewLabelNat
+ let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
+ code = e_code `appOL` toOL [
+ JMP_TBL op ids (Section ReadOnlyData lbl) lbl
+ ]
+ return code
+ where
+ (offset, blockIds) = switchTargetsToTable targets
+ ids = map (fmap DestBlockId) blockIds
+
+generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr)
+generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
+ = let getBlockId (DestBlockId id) = id
+ getBlockId _ = panic "Non-Label target in Jump Table"
+ blockIds = map (fmap getBlockId) ids
+ in Just (createJumpTable dflags blockIds section lbl)
+generateJumpTableForInstr _ _ = Nothing
+
+createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
+ -> GenCmmDecl (Alignment, RawCmmStatics) h g
+createJumpTable dflags ids section lbl
+ = let jumpTable
+ | positionIndependent dflags =
+ let ww = wordWidth dflags
+ jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 ww)
+ jumpTableEntryRel (Just blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
+ where blockLabel = blockLbl blockid
+ in map jumpTableEntryRel ids
+ | otherwise = map (jumpTableEntry dflags) ids
+ in CmmData section (mkAlignment 1, RawCmmStatics lbl jumpTable)
+
+extractUnwindPoints :: [Instr] -> [UnwindPoint]
+extractUnwindPoints instrs =
+ [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
+
+-- -----------------------------------------------------------------------------
+-- 'condIntReg' and 'condFltReg': condition codes into registers
+
+-- Turn those condition codes into integers now (when they appear on
+-- the right hand side of an assignment).
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+
+condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+
+condIntReg cond x y = do
+ CondCode _ cond cond_code <- condIntCode cond x y
+ tmp <- getNewRegNat II8
+ let
+ code dst = cond_code `appOL` toOL [
+ SETCC cond (OpReg tmp),
+ MOVZxL II8 (OpReg tmp) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-----------------------------------------------------------
+--- Note [SSE Parity Checks] ---
+-----------------------------------------------------------
+
+-- We have to worry about unordered operands (eg. comparisons
+-- against NaN). If the operands are unordered, the comparison
+-- sets the parity flag, carry flag and zero flag.
+-- All comparisons are supposed to return false for unordered
+-- operands except for !=, which returns true.
+--
+-- Optimisation: we don't have to test the parity flag if we
+-- know the test has already excluded the unordered case: eg >
+-- and >= test for a zero carry flag, which can only occur for
+-- ordered operands.
+--
+-- By reversing comparisons we can avoid testing the parity
+-- for < and <= as well. If any of the arguments is a NaN we
+-- return false either way. If both arguments are valid then
+-- x <= y <-> y >= x holds. So it's safe to swap these.
+--
+-- We invert the condition inside getRegister' and getCondCode,
+-- which should cover all invertible cases.
+-- All other functions translating FP comparisons to assembly
+-- use these two to generate the comparison code.
+--
+-- As an example consider a simple check:
+--
+-- func :: Float -> Float -> Int
+-- func x y = if x < y then 1 else 0
+--
+-- Which in Cmm gives the floating point comparison.
+--
+-- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
+--
+-- We used to compile this to an assembly code block like this:
+-- _c2gh:
+-- ucomiss %xmm2,%xmm1
+-- jp _c2gf
+-- jb _c2gg
+-- jmp _c2gf
+--
+-- Here we have to introduce an explicit
+-- check for unordered results (using jp, jump on parity).
+--
+-- We can avoid this by exchanging the arguments and inverting the direction
+-- of the comparison. This results in the sequence:
+--
+-- ucomiss %xmm1,%xmm2
+-- ja _c2g2
+-- jmp _c2g1
+--
+-- Removing the jump reduces the pressure on the branch prediction system
+-- and plays better with the uOP cache.
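+--
+-- As a further illustrative sketch, for an ordered comparison such as x == y
+-- the and_ordered case below emits roughly
+--
+--     sete    %tmp1
+--     setnp   %tmp2
+--     andb    %tmp1,%tmp2
+--     movzbl  %tmp2,%dst
+--
+-- after the ucomiss: the setnp/andb pair forces the result to 0 whenever the
+-- operands were unordered (a NaN sets the parity flag).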
+
+condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg is32Bit cond x y = condFltReg_sse2
+ where
+
+
+ condFltReg_sse2 = do
+ CondCode _ cond cond_code <- condFltCode cond x y
+ tmp1 <- getNewRegNat (archWordFormat is32Bit)
+ tmp2 <- getNewRegNat (archWordFormat is32Bit)
+ let -- See Note [SSE Parity Checks]
+ code dst =
+ cond_code `appOL`
+ (case cond of
+ NE -> or_unordered dst
+ GU -> plain_test dst
+ GEU -> plain_test dst
+ -- Use ASSERT so we don't break releases if these creep in.
+ LTT -> ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered dst
+ LE -> ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered dst
+ _ -> and_ordered dst)
+
+ plain_test dst = toOL [
+ SETCC cond (OpReg tmp1),
+ MOVZxL II8 (OpReg tmp1) (OpReg dst)
+ ]
+ or_unordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC PARITY (OpReg tmp2),
+ OR II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ and_ordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC NOTPARITY (OpReg tmp2),
+ AND II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt:'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching an StInt as the argument, because genericOpt will already
+-- have handled the constant-folding.
+
+
+{-
+The Rules of the Game are:
+
+* You cannot assume anything about the destination register dst;
+ it may be anything, including a fixed reg.
+
+* You may compute an operand into a fixed reg, but you may not
+ subsequently change the contents of that fixed reg. If you
+ want to do so, first copy the value either to a temporary
+ or into dst. You are free to modify dst even if it happens
+ to be a fixed reg -- that's not your problem.
+
+* You cannot assume that a fixed reg will stay live over an
+ arbitrary computation. The same applies to the dst reg.
+
+* Temporary regs obtained from getNewRegNat are distinct from
+ each other and from all other regs, and stay live over
+ arbitrary computations.
+
+--------------------
+
+SDM's version of The Rules:
+
+* If getRegister returns Any, that means it can generate correct
+ code which places the result in any register, period. Even if that
+ register happens to be read during the computation.
+
+ Corollary #1: this means that if you are generating code for an
+ operation with two arbitrary operands, you cannot assign the result
+ of the first operand into the destination register before computing
+ the second operand. The second operand might require the old value
+ of the destination register.
+
+ Corollary #2: A function might be able to generate more efficient
+ code if it knows the destination register is a new temporary (and
+ therefore not read by any of the sub-computations).
+
+* If getRegister returns Any, then the code it generates may modify only:
+ (a) fresh temporaries
+ (b) the destination register
+ (c) known registers (eg. %ecx is used by shifts)
+ In particular, it may *not* modify global registers, unless the global
+ register happens to be the destination register.
+-}
+
+trivialCode :: Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode width instr m a b
+ = do is32Bit <- is32BitPlatform
+ trivialCode' is32Bit width instr m a b
+
+trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
+ | is32BitLit is32Bit lit_a = do
+ b_code <- getAnyReg b
+ let
+ code dst
+ = b_code dst `snocOL`
+ revinstr (OpImm (litToImm lit_a)) (OpReg dst)
+ return (Any (intFormat width) code)
+
+trivialCode' _ width instr _ a b
+ = genTrivialCode (intFormat width) instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode :: Format -> (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+genTrivialCode rep instr a b = do
+ (b_op, b_code) <- getNonClobberedOperand b
+ a_code <- getAnyReg a
+ tmp <- getNewRegNat rep
+ let
+ -- We want the value of b to stay alive across the computation of a.
+ -- But, we want to calculate a straight into the destination register,
+ -- because the instruction only has two operands (dst := dst `op` src).
+ -- The troublesome case is when the result of b is in the same register
+ -- as the destination reg. In this case, we have to save b in a
+ -- new temporary across the computation of a.
+ code dst
+ | dst `regClashesWithOp` b_op =
+ b_code `appOL`
+ unitOL (MOV rep b_op (OpReg tmp)) `appOL`
+ a_code dst `snocOL`
+ instr (OpReg tmp) (OpReg dst)
+ | otherwise =
+ b_code `appOL`
+ a_code dst `snocOL`
+ instr b_op (OpReg dst)
+ return (Any rep code)
+
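+-- A sketch of the two cases above (operand names illustrative): when the
+-- register chosen for dst clashes with b_op we emit
+--      <b_code>; MOV rep b_op (OpReg tmp); <a_code dst>; instr (OpReg tmp) (OpReg dst)
+-- whereas in the non-clashing case we emit simply
+--      <b_code>; <a_code dst>; instr b_op (OpReg dst)
+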
+regClashesWithOp :: Reg -> Operand -> Bool
+reg `regClashesWithOp` OpReg reg2 = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+_ `regClashesWithOp` _ = False
+
+-----------
+
+trivialUCode :: Format -> (Operand -> Instr)
+ -> CmmExpr -> NatM Register
+trivialUCode rep instr x = do
+ x_code <- getAnyReg x
+ let
+ code dst =
+ x_code dst `snocOL`
+ instr (OpReg dst)
+ return (Any rep code)
+
+-----------
+
+
+trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialFCode_sse2 pk instr x y
+ = genTrivialCode format (instr format) x y
+ where format = floatFormat pk
+
+
+trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
+trivialUFCode format instr x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code dst =
+ x_code `snocOL`
+ instr x_reg dst
+ return (Any format code)
+
+
+--------------------------------------------------------------------------------
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP from to x = coerce_sse2
+ where
+
+ coerce_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
+ n -> panic $ "coerceInt2FP.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat from) x_op dst
+ return (Any (floatFormat to) code)
+ -- works even if the destination rep is <II32
+
+--------------------------------------------------------------------------------
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int from to x = coerceFP2Int_sse2
+ where
+ coerceFP2Int_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
+                 n -> panic $ "coerceFP2Int.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat to) x_op dst
+ return (Any (intFormat to) code)
+ -- works even if the destination rep is <II32
+
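+-- For example (values illustrative): the SSE2 CVTT* conversions truncate
+-- towards zero, so coercing the double 3.9 to an integer yields 3 and
+-- coercing -3.9 yields -3; they do not round to nearest.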
+
+--------------------------------------------------------------------------------
+coerceFP2FP :: Width -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
+ n -> panic $ "coerceFP2FP: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc x_reg dst
+ return (Any ( floatFormat to) code)
+
+--------------------------------------------------------------------------------
+
+sse2NegCode :: Width -> CmmExpr -> NatM Register
+sse2NegCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ -- This is how gcc does it, so it can't be that bad:
+ let
+ const = case fmt of
+ FF32 -> CmmInt 0x80000000 W32
+ FF64 -> CmmInt 0x8000000000000000 W64
+ x@II8 -> wrongFmt x
+ x@II16 -> wrongFmt x
+ x@II32 -> wrongFmt x
+ x@II64 -> wrongFmt x
+
+ where
+ wrongFmt x = panic $ "sse2NegCode: " ++ show x
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ XOR fmt (OpReg tmp) (OpReg dst)
+ ]
+ --
+ return (Any fmt code)
+
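+-- Sketch of the sequence emitted above (registers illustrative): for a W64
+-- input we load the constant 0x8000000000000000 into a fresh temporary and
+-- XOR it into dst, flipping only the IEEE-754 sign bit, so 1.5 becomes -1.5
+-- and 0.0 becomes -0.0.
+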
+isVecExpr :: CmmExpr -> Bool
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
+
+needLlvm :: NatM a
+needLlvm =
+ sorry $ unlines ["The native code generator does not support vector"
+ ,"instructions. Please use -fllvm."]
+
+-- | This works on the invariant that all jumps in the given blocks are required.
+-- Starting from there we try to make a few more jumps redundant by reordering
+-- them.
+-- We depend on the information in the CFG to do so, so without a given CFG
+-- we do nothing.
+invertCondBranches :: Maybe CFG -- ^ CFG if present
+ -> LabelMap a -- ^ Blocks with info tables
+ -> [NatBasicBlock Instr] -- ^ List of basic blocks
+ -> [NatBasicBlock Instr]
+invertCondBranches Nothing _ bs = bs
+invertCondBranches (Just cfg) keep bs =
+ invert bs
+ where
+ invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
+ invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
+ | --pprTrace "Block" (ppr lbl1) True,
+ (jmp1,jmp2) <- last2 ins
+ , JXX cond1 target1 <- jmp1
+ , target1 == lbl2
+ --, pprTrace "CutChance" (ppr b1) True
+ , JXX ALWAYS target2 <- jmp2
+      -- We have enough information to check if we can perform the inversion
+      -- TODO: We could also check for the last asm instruction which sets
+      -- status flags instead. I suspect that is worse in terms of compiler
+      -- performance, but it might be applicable to more cases.
+ , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
+ , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
+ -- Both jumps come from the same cmm statement
+ , transitionSource edgeInfo1 == transitionSource edgeInfo2
+ , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1
+
+      -- Int comparisons are invertible
+ , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
+ , Just _ <- maybeIntComparison op
+ , Just invCond <- maybeInvertCond cond1
+
+ --Swap the last two jumps, invert the conditional jumps condition.
+ = let jumps =
+ case () of
+              -- We are free to eliminate the jmp, so we do so.
+ _ | not (mapMember target1 keep)
+ -> [JXX invCond target2]
+ -- If the conditional target is unlikely we put the other
+ -- target at the front.
+ | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
+ -> [JXX invCond target2, JXX ALWAYS target1]
+ -- Keep things as-is otherwise
+ | otherwise
+ -> [jmp1, jmp2]
+ in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
+ (BasicBlock lbl1
+ (dropTail 2 ins ++ jumps))
+ : invert (b2:bs)
+ invert (b:bs) = b : invert bs
+ invert [] = []
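+
+-- An illustrative run (block labels hypothetical): suppose block L1 ends with
+--      JXX LTT L2 ; JXX ALWAYS L3
+-- and L2 is the block laid out directly after L1. If L2 carries no info table
+-- (so we may fall through into it) we emit just
+--      JXX GE L3
+-- and fall through into L2; otherwise, if the edge to L3 is the heavier one,
+-- we emit
+--      JXX GE L3 ; JXX ALWAYS L2
+-- and we leave the two jumps unchanged in the remaining case.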
diff --git a/compiler/GHC/CmmToAsm/X86/Cond.hs b/compiler/GHC/CmmToAsm/X86/Cond.hs
new file mode 100644
index 0000000000..bb8f61438b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Cond.hs
@@ -0,0 +1,109 @@
+module GHC.CmmToAsm.X86.Cond (
+ Cond(..),
+ condUnsigned,
+ condToSigned,
+ condToUnsigned,
+ maybeFlipCond,
+ maybeInvertCond
+)
+
+where
+
+import GhcPrelude
+
+data Cond
+ = ALWAYS -- What's really used? ToDo
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ | NEG
+ | POS
+ | CARRY
+ | OFLO
+ | PARITY
+ | NOTPARITY
+ deriving Eq
+
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
+
+-- | @maybeFlipCond c@ returns @Just c'@ if it is possible to flip the
+-- arguments to the conditional @c@, and the new condition should be @c'@.
+maybeFlipCond :: Cond -> Maybe Cond
+maybeFlipCond cond = case cond of
+ EQQ -> Just EQQ
+ NE -> Just NE
+ LU -> Just GU
+ GU -> Just LU
+ LEU -> Just GEU
+ GEU -> Just LEU
+ LTT -> Just GTT
+ GTT -> Just LTT
+ LE -> Just GE
+ GE -> Just LE
+ _other -> Nothing
+
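+-- For example, a comparison tested with LTT on operands (x, y), i.e. x < y,
+-- can be tested with GTT once the operands are swapped to (y, x), so
+-- @maybeFlipCond LTT == Just GTT@; flag-only conditions such as OFLO, CARRY
+-- and PARITY have no operand-swapped counterpart and yield Nothing.
+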
+-- | If we apply @maybeInvertCond@ to the condition of a jump we turn
+-- jumps taken into jumps not taken and vice versa.
+--
+-- Careful! If the used comparison and the conditional jump
+-- don't match, the above behaviour will NOT hold.
+-- When used for FP comparisons this does not consider unordered
+-- numbers.
+-- Also inverting twice might return a synonym for the original condition.
+maybeInvertCond :: Cond -> Maybe Cond
+maybeInvertCond cond = case cond of
+ ALWAYS -> Nothing
+ EQQ -> Just NE
+ NE -> Just EQQ
+
+ NEG -> Just POS
+ POS -> Just NEG
+
+ GEU -> Just LU
+ LU -> Just GEU
+
+ GE -> Just LTT
+ LTT -> Just GE
+
+ GTT -> Just LE
+ LE -> Just GTT
+
+ GU -> Just LEU
+ LEU -> Just GU
+
+  -- GEU "==" NOTCARRY; they are synonyms
+  -- at the assembly level.
+ CARRY -> Just GEU
+
+ OFLO -> Nothing
+
+ PARITY -> Just NOTPARITY
+ NOTPARITY -> Just PARITY
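+
+-- For example, inverting CARRY yields GEU (i.e. NOTCARRY), but inverting GEU
+-- yields LU rather than CARRY, so inverting twice can land on a synonym of
+-- the original condition instead of the original constructor, as warned above.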
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
new file mode 100644
index 0000000000..4171806695
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -0,0 +1,1056 @@
+{-# LANGUAGE CPP, TypeFamilies #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.X86.Instr
+ ( Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..)
+ , getJumpDestBlockId, canShortcut, shortcutStatics
+ , shortcutJump, allocMoreStack
+ , maxSpillSlots, archWordFormat
+ )
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Platform.Regs
+import GHC.Cmm
+import FastString
+import Outputable
+import GHC.Platform
+
+import BasicTypes (Alignment)
+import GHC.Cmm.CLabel
+import GHC.Driver.Session
+import UniqSet
+import Unique
+import UniqSupply
+import GHC.Cmm.DebugBlock (UnwindTable)
+
+import Control.Monad
+import Data.Maybe (fromMaybe)
+
+-- Format of a machine word on x86/x86_64: II32 on 32-bit targets, II64 otherwise.
+--
+archWordFormat :: Bool -> Format
+archWordFormat is32Bit
+ | is32Bit = II32
+ | otherwise = II64
+
+-- | Instruction instance for x86 instruction set.
+instance Instruction Instr where
+ regUsageOfInstr = x86_regUsageOfInstr
+ patchRegsOfInstr = x86_patchRegsOfInstr
+ isJumpishInstr = x86_isJumpishInstr
+ jumpDestsOfInstr = x86_jumpDestsOfInstr
+ patchJumpInstr = x86_patchJumpInstr
+ mkSpillInstr = x86_mkSpillInstr
+ mkLoadInstr = x86_mkLoadInstr
+ takeDeltaInstr = x86_takeDeltaInstr
+ isMetaInstr = x86_isMetaInstr
+ mkRegRegMoveInstr = x86_mkRegRegMoveInstr
+ takeRegRegMoveInstr = x86_takeRegRegMoveInstr
+ mkJumpInstr = x86_mkJumpInstr
+ mkStackAllocInstr = x86_mkStackAllocInstr
+ mkStackDeallocInstr = x86_mkStackDeallocInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Intel x86 instructions
+
+{-
+Intel, in their infinite wisdom, selected a stack model for floating
+point registers on x86. That might have made sense back in 1979 --
+nowadays we can see it for the nonsense it really is. A stack model
+fits poorly with the existing nativeGen infrastructure, which assumes
+flat integer and FP register sets. Prior to this commit, nativeGen
+could not generate correct x86 FP code -- to do so would have meant
+somehow working the register-stack paradigm into the register
+allocator and spiller, which sounds very difficult.
+
+We have decided to cheat, and go for a simple fix which requires no
+infrastructure modifications, at the expense of generating ropey but
+correct FP code. All notions of the x86 FP stack and its insns have
+been removed. Instead, we pretend (to the instruction selector and
+register allocator) that x86 has six floating point registers, %fake0
+.. %fake5, which can be used in the usual flat manner. We further
+claim that x86 has floating point instructions very similar to SPARC
+and Alpha, that is, a simple 3-operand register-register arrangement.
+Code generation and register allocation proceed on this basis.
+
+When we come to print out the final assembly, our convenient fiction
+is converted to dismal reality. Each fake instruction is
+independently converted to a series of real x86 instructions.
+%fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
+arithmetic operations, the two operands are pushed onto the top of the
+FP stack, the operation done, and the result copied back into the
+relevant register. There are only six %fake registers because 2 are
+needed for the translation, and x86 has 8 in total.
+
+The translation is inefficient but is simple and it works. A cleverer
+translation would handle a sequence of insns, simulating the FP stack
+contents, would not impose a fixed mapping from %fake to %st regs, and
+hopefully could avoid most of the redundant reg-reg moves of the
+current translation.
+
+We might as well make use of whatever unique FP facilities Intel have
+chosen to bless us with (let's not be churlish, after all).
+Hence GLDZ and GLD1. Bwahahahahahahaha!
+-}
+
+{-
+Note [x86 Floating point precision]
+
+Intel's internal floating point registers are by default 80 bit
+extended precision. This means that all operations done on values in
+registers are done at 80 bits, and unless the intermediate values are
+truncated to the appropriate size (32 or 64 bits) by storing in
+memory, calculations in registers will give different results from
+calculations which pass intermediate values in memory (eg. via
+function calls).
+
+One solution is to set the FPU into 64 bit precision mode. Some OSs
+do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
+that this will only affect 64-bit precision arithmetic; 32-bit
+calculations will still be done at 64-bit precision in registers. So
+it doesn't solve the whole problem.
+
+There's also the issue of what the C library is expecting in terms of
+precision. It seems to be the case that glibc on Linux expects the
+FPU to be set to 80 bit precision, so setting it to 64 bit could have
+unexpected effects. Changing the default could have undesirable
+effects on other 3rd-party library code too, so the right thing would
+be to save/restore the FPU control word across Haskell code if we were
+to do this.
+
+gcc's -ffloat-store gives consistent results by always storing the
+results of floating-point calculations in memory, which works for both
+32 and 64-bit precision. However, it only affects the values of
+user-declared floating point variables in C, not intermediate results.
+GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
+flag).
+
+Another problem is how to spill floating point registers in the
+register allocator. Should we spill the whole 80 bits, or just 64?
+On an OS which is set to 64 bit precision, spilling 64 is fine. On
+Linux, spilling 64 bits will round the results of some operations.
+This is what gcc does. Spilling at 80 bits requires taking up a full
+128 bit slot (so we get alignment). We spill at 80-bits and ignore
+the alignment problems.
+
+In the future [edit: now available in GHC 7.0.1, with the -msse2
+flag], we'll use the SSE registers for floating point. This requires
+a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
+float ops), which means P4 or Xeon and above. Using SSE will solve
+all these problems, because the SSE registers use fixed 32 bit or 64
+bit precision.
+
+--SDM 1/2003
+-}
+
+data Instr
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- location pseudo-op (file, line, col, name)
+ | LOCATION Int Int Int String
+
+ -- some static data spat out during code
+ -- generation. Will be extracted before
+ -- pretty-printing.
+ | LDATA Section (Alignment, RawCmmStatics)
+
+ -- start a new basic block. Useful during
+ -- codegen, removed later. Preceding
+ -- instruction should be a jump, as per the
+ -- invariants for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- unwinding information
+ -- See Note [Unwinding information in the NCG].
+ | UNWIND CLabel UnwindTable
+
+        -- specify the current stack offset for the benefit of subsequent passes;
+        -- the offset is also used when emitting unwinding information.
+ | DELTA Int
+
+ -- Moves.
+ | MOV Format Operand Operand
+ | CMOV Cond Format Operand Reg
+ | MOVZxL Format Operand Operand -- format is the size of operand 1
+ | MOVSxL Format Operand Operand -- format is the size of operand 1
+ -- x86_64 note: plain mov into a 32-bit register always zero-extends
+ -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
+ -- don't affect the high bits of the register.
+
+ -- Load effective address (also a very useful three-operand add instruction :-)
+ | LEA Format Operand Operand
+
+ -- Int Arithmetic.
+ | ADD Format Operand Operand
+ | ADC Format Operand Operand
+ | SUB Format Operand Operand
+ | SBB Format Operand Operand
+
+ | MUL Format Operand Operand
+ | MUL2 Format Operand -- %edx:%eax = operand * %rax
+ | IMUL Format Operand Operand -- signed int mul
+ | IMUL2 Format Operand -- %edx:%eax = operand * %eax
+
+ | DIV Format Operand -- eax := eax:edx/op, edx := eax:edx%op
+ | IDIV Format Operand -- ditto, but signed
+
+ -- Int Arithmetic, where the effects on the condition register
+ -- are important. Used in specialized sequences such as MO_Add2.
+ -- Do not rewrite these instructions to "equivalent" ones that
+ -- have different effect on the condition register! (See #9013.)
+ | ADD_CC Format Operand Operand
+ | SUB_CC Format Operand Operand
+
+ -- Simple bit-twiddling.
+ | AND Format Operand Operand
+ | OR Format Operand Operand
+ | XOR Format Operand Operand
+ | NOT Format Operand
+ | NEGI Format Operand -- NEG instruction (name clash with Cond)
+ | BSWAP Format Reg
+
+ -- Shifts (amount may be immediate or %cl only)
+ | SHL Format Operand{-amount-} Operand
+ | SAR Format Operand{-amount-} Operand
+ | SHR Format Operand{-amount-} Operand
+
+ | BT Format Imm Operand
+ | NOP
+
+
+ -- We need to support the FSTP (x87 store and pop) instruction
+ -- so that we can correctly read off the return value of an
+        -- x86 CDECL C function call when it's a floating point value.
+        -- We don't include a register argument and just use st(0);
+        -- this instruction is used ONLY for return values of C ffi calls
+        -- in the x86_32 ABI.
+ | X87Store Format AddrMode -- st(0), dst
+
+
+ -- SSE2 floating point: we use a restricted set of the available SSE2
+ -- instructions for floating-point.
+ -- use MOV for moving (either movss or movsd (movlpd better?))
+ | CVTSS2SD Reg Reg -- F32 to F64
+ | CVTSD2SS Reg Reg -- F64 to F32
+ | CVTTSS2SIQ Format Operand Reg -- F32 to I32/I64 (with truncation)
+ | CVTTSD2SIQ Format Operand Reg -- F64 to I32/I64 (with truncation)
+ | CVTSI2SS Format Operand Reg -- I32/I64 to F32
+ | CVTSI2SD Format Operand Reg -- I32/I64 to F64
+
+ -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
+ -- are Operand Reg.
+
+ -- SSE2 floating-point division:
+ | FDIV Format Operand Operand -- divisor, dividend(dst)
+
+ -- use CMP for comparisons. ucomiss and ucomisd instructions
+ -- compare single/double prec floating point respectively.
+
+ | SQRT Format Operand Reg -- src, dst
+
+
+ -- Comparison
+ | TEST Format Operand Operand
+ | CMP Format Operand Operand
+ | SETCC Cond Operand
+
+ -- Stack Operations.
+ | PUSH Format Operand
+ | POP Format Operand
+ -- both unused (SDM):
+ -- | PUSHA
+ -- | POPA
+
+ -- Jumping around.
+ | JMP Operand [Reg] -- including live Regs at the call
+ | JXX Cond BlockId -- includes unconditional branches
+ | JXX_GBL Cond Imm -- non-local version of JXX
+ -- Table jump
+ | JMP_TBL Operand -- Address to jump to
+ [Maybe JumpDest] -- Targets of the jump table
+ Section -- Data section jump table should be put in
+ CLabel -- Label of jump table
+ -- | X86 call instruction
+ | CALL (Either Imm Reg) -- ^ Jump target
+ [Reg] -- ^ Arguments (required for register allocation)
+
+ -- Other things.
+ | CLTD Format -- sign extend %eax into %edx:%eax
+
+ | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+ -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
+ | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+
+ -- bit counting instructions
+ | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1
+ | LZCNT Format Operand Reg -- [BMI2] count number of leading zeros
+ | TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros
+ | BSF Format Operand Reg -- bit scan forward
+ | BSR Format Operand Reg -- bit scan reverse
+
+ -- bit manipulation instructions
+ | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+ | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
+ -- prefetch
+ | PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
+ -- variant can be NTA, Lvl0, Lvl1, or Lvl2
+
+ | LOCK Instr -- lock prefix
+ | XADD Format Operand Operand -- src (r), dst (r/m)
+ | CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit
+ | MFENCE
+
+data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
+
+
+data Operand
+ = OpReg Reg -- register
+ | OpImm Imm -- immediate value
+ | OpAddr AddrMode -- memory reference
+
+
+
+-- | Returns which registers are read and written as a (read, written)
+-- pair.
+x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
+x86_regUsageOfInstr platform instr
+ = case instr of
+ MOV _ src dst -> usageRW src dst
+ CMOV _ _ src dst -> mkRU (use_R src [dst]) [dst]
+ MOVZxL _ src dst -> usageRW src dst
+ MOVSxL _ src dst -> usageRW src dst
+ LEA _ src dst -> usageRW src dst
+ ADD _ src dst -> usageRM src dst
+ ADC _ src dst -> usageRM src dst
+ SUB _ src dst -> usageRM src dst
+ SBB _ src dst -> usageRM src dst
+ IMUL _ src dst -> usageRM src dst
+
+    -- Result of IMULB will be just in %ax
+ IMUL2 II8 src -> mkRU (eax:use_R src []) [eax]
+    -- Result of IMUL for wider values will be split between %dx/%edx/%rdx and
+ -- %ax/%eax/%rax.
+ IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+
+ MUL _ src dst -> usageRM src dst
+ MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+ DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ IDIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ ADD_CC _ src dst -> usageRM src dst
+ SUB_CC _ src dst -> usageRM src dst
+ AND _ src dst -> usageRM src dst
+ OR _ src dst -> usageRM src dst
+
+ XOR _ (OpReg src) (OpReg dst)
+ | src == dst -> mkRU [] [dst]
+
+ XOR _ src dst -> usageRM src dst
+ NOT _ op -> usageM op
+ BSWAP _ reg -> mkRU [reg] [reg]
+ NEGI _ op -> usageM op
+ SHL _ imm dst -> usageRM imm dst
+ SAR _ imm dst -> usageRM imm dst
+ SHR _ imm dst -> usageRM imm dst
+ BT _ _ src -> mkRUR (use_R src [])
+
+ PUSH _ op -> mkRUR (use_R op [])
+ POP _ op -> mkRU [] (def_W op)
+ TEST _ src dst -> mkRUR (use_R src $! use_R dst [])
+ CMP _ src dst -> mkRUR (use_R src $! use_R dst [])
+ SETCC _ op -> mkRU [] (def_W op)
+ JXX _ _ -> mkRU [] []
+ JXX_GBL _ _ -> mkRU [] []
+ JMP op regs -> mkRUR (use_R op regs)
+ JMP_TBL op _ _ _ -> mkRUR (use_R op [])
+ CALL (Left _) params -> mkRU params (callClobberedRegs platform)
+ CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
+ CLTD _ -> mkRU [eax] [edx]
+ NOP -> mkRU [] []
+
+ X87Store _ dst -> mkRUR ( use_EA dst [])
+
+ CVTSS2SD src dst -> mkRU [src] [dst]
+ CVTSD2SS src dst -> mkRU [src] [dst]
+ CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SS _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SD _ src dst -> mkRU (use_R src []) [dst]
+ FDIV _ src dst -> usageRM src dst
+ SQRT _ src dst -> mkRU (use_R src []) [dst]
+
+ FETCHGOT reg -> mkRU [] [reg]
+ FETCHPC reg -> mkRU [] [reg]
+
+ COMMENT _ -> noUsage
+ LOCATION{} -> noUsage
+ UNWIND{} -> noUsage
+ DELTA _ -> noUsage
+
+ POPCNT _ src dst -> mkRU (use_R src []) [dst]
+ LZCNT _ src dst -> mkRU (use_R src []) [dst]
+ TZCNT _ src dst -> mkRU (use_R src []) [dst]
+ BSF _ src dst -> mkRU (use_R src []) [dst]
+ BSR _ src dst -> mkRU (use_R src []) [dst]
+
+ PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
+    -- note: there might be a better way to do this
+ PREFETCH _ _ src -> mkRU (use_R src []) []
+ LOCK i -> x86_regUsageOfInstr platform i
+ XADD _ src dst -> usageMM src dst
+ CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
+ MFENCE -> noUsage
+
+ _other -> panic "regUsage: unrecognised instr"
+ where
+ -- # Definitions
+ --
+ -- Written: If the operand is a register, it's written. If it's an
+ -- address, registers mentioned in the address are read.
+ --
+ -- Modified: If the operand is a register, it's both read and
+ -- written. If it's an address, registers mentioned in the address
+ -- are read.
+
+ -- 2 operand form; first operand Read; second Written
+ usageRW :: Operand -> Operand -> RegUsage
+ usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
+ usageRW op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRW _ _ = panic "X86.RegInfo.usageRW: no match"
+
+ -- 2 operand form; first operand Read; second Modified
+ usageRM :: Operand -> Operand -> RegUsage
+ usageRM op (OpReg reg) = mkRU (use_R op [reg]) [reg]
+ usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
+
+ -- 2 operand form; first operand Modified; second Modified
+ usageMM :: Operand -> Operand -> RegUsage
+ usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
+ usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
+ usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
+
+ -- 3 operand form; first operand Read; second Modified; third Modified
+ usageRMM :: Operand -> Operand -> Operand -> RegUsage
+ usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
+ usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
+ usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
+
+ -- 1 operand form; operand Modified
+ usageM :: Operand -> RegUsage
+ usageM (OpReg reg) = mkRU [reg] [reg]
+ usageM (OpAddr ea) = mkRUR (use_EA ea [])
+ usageM _ = panic "X86.RegInfo.usageM: no match"
+
+ -- Registers defd when an operand is written.
+ def_W (OpReg reg) = [reg]
+ def_W (OpAddr _ ) = []
+ def_W _ = panic "X86.RegInfo.def_W: no match"
+
+ -- Registers used when an operand is read.
+ use_R (OpReg reg) tl = reg : tl
+ use_R (OpImm _) tl = tl
+ use_R (OpAddr ea) tl = use_EA ea tl
+
+ -- Registers used to compute an effective address.
+ use_EA (ImmAddr _ _) tl = tl
+ use_EA (AddrBaseIndex base index _) tl =
+ use_base base $! use_index index tl
+ where use_base (EABaseReg r) tl = r : tl
+ use_base _ tl = tl
+ use_index EAIndexNone tl = tl
+ use_index (EAIndex i _) tl = i : tl
+
+ mkRUR src = src' `seq` RU src' []
+ where src' = filter (interesting platform) src
+
+ mkRU src dst = src' `seq` dst' `seq` RU src' dst'
+ where src' = filter (interesting platform) src
+ dst' = filter (interesting platform) dst
+
+-- | Is this register interesting for the register allocator?
+interesting :: Platform -> Reg -> Bool
+interesting _ (RegVirtual _) = True
+interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
+interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no reg pairs on this arch"
+
+
+
+-- | Applies the supplied function to all registers in instructions.
+-- Typically used to change virtual registers to real registers.
+x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+x86_patchRegsOfInstr instr env
+ = case instr of
+ MOV fmt src dst -> patch2 (MOV fmt) src dst
+ CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
+ MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
+ MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst
+ LEA fmt src dst -> patch2 (LEA fmt) src dst
+ ADD fmt src dst -> patch2 (ADD fmt) src dst
+ ADC fmt src dst -> patch2 (ADC fmt) src dst
+ SUB fmt src dst -> patch2 (SUB fmt) src dst
+ SBB fmt src dst -> patch2 (SBB fmt) src dst
+ IMUL fmt src dst -> patch2 (IMUL fmt) src dst
+ IMUL2 fmt src -> patch1 (IMUL2 fmt) src
+ MUL fmt src dst -> patch2 (MUL fmt) src dst
+ MUL2 fmt src -> patch1 (MUL2 fmt) src
+ IDIV fmt op -> patch1 (IDIV fmt) op
+ DIV fmt op -> patch1 (DIV fmt) op
+ ADD_CC fmt src dst -> patch2 (ADD_CC fmt) src dst
+ SUB_CC fmt src dst -> patch2 (SUB_CC fmt) src dst
+ AND fmt src dst -> patch2 (AND fmt) src dst
+ OR fmt src dst -> patch2 (OR fmt) src dst
+ XOR fmt src dst -> patch2 (XOR fmt) src dst
+ NOT fmt op -> patch1 (NOT fmt) op
+ BSWAP fmt reg -> BSWAP fmt (env reg)
+ NEGI fmt op -> patch1 (NEGI fmt) op
+ SHL fmt imm dst -> patch1 (SHL fmt imm) dst
+ SAR fmt imm dst -> patch1 (SAR fmt imm) dst
+ SHR fmt imm dst -> patch1 (SHR fmt imm) dst
+ BT fmt imm src -> patch1 (BT fmt imm) src
+ TEST fmt src dst -> patch2 (TEST fmt) src dst
+ CMP fmt src dst -> patch2 (CMP fmt) src dst
+ PUSH fmt op -> patch1 (PUSH fmt) op
+ POP fmt op -> patch1 (POP fmt) op
+ SETCC cond op -> patch1 (SETCC cond) op
+ JMP op regs -> JMP (patchOp op) regs
+ JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
+
+ -- literally only support storing the top x87 stack value st(0)
+ X87Store fmt dst -> X87Store fmt (lookupAddr dst)
+
+ CVTSS2SD src dst -> CVTSS2SD (env src) (env dst)
+ CVTSD2SS src dst -> CVTSD2SS (env src) (env dst)
+ CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
+ CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
+ CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
+ CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
+ FDIV fmt src dst -> FDIV fmt (patchOp src) (patchOp dst)
+ SQRT fmt src dst -> SQRT fmt (patchOp src) (env dst)
+
+ CALL (Left _) _ -> instr
+ CALL (Right reg) p -> CALL (Right (env reg)) p
+
+ FETCHGOT reg -> FETCHGOT (env reg)
+ FETCHPC reg -> FETCHPC (env reg)
+
+ NOP -> instr
+ COMMENT _ -> instr
+ LOCATION {} -> instr
+ UNWIND {} -> instr
+ DELTA _ -> instr
+
+ JXX _ _ -> instr
+ JXX_GBL _ _ -> instr
+ CLTD _ -> instr
+
+ POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+ LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst)
+ TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst)
+ PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+ PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
+ BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
+ BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
+
+ PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)
+
+ LOCK i -> LOCK (x86_patchRegsOfInstr i env)
+ XADD fmt src dst -> patch2 (XADD fmt) src dst
+ CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst
+ MFENCE -> instr
+
+ _other -> panic "patchRegs: unrecognised instr"
+
+ where
+ patch1 :: (Operand -> a) -> Operand -> a
+ patch1 insn op = insn $! patchOp op
+ patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
+ patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
+
+ patchOp (OpReg reg) = OpReg $! env reg
+ patchOp (OpImm imm) = OpImm imm
+ patchOp (OpAddr ea) = OpAddr $! lookupAddr ea
+
+ lookupAddr (ImmAddr imm off) = ImmAddr imm off
+ lookupAddr (AddrBaseIndex base index disp)
+ = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
+ where
+ lookupBase EABaseNone = EABaseNone
+ lookupBase EABaseRip = EABaseRip
+ lookupBase (EABaseReg r) = EABaseReg $! env r
+
+ lookupIndex EAIndexNone = EAIndexNone
+ lookupIndex (EAIndex r i) = (EAIndex $! env r) i
+
+
+--------------------------------------------------------------------------------
+x86_isJumpishInstr
+ :: Instr -> Bool
+
+x86_isJumpishInstr instr
+ = case instr of
+ JMP{} -> True
+ JXX{} -> True
+ JXX_GBL{} -> True
+ JMP_TBL{} -> True
+ CALL{} -> True
+ _ -> False
+
+
+x86_jumpDestsOfInstr
+ :: Instr
+ -> [BlockId]
+
+x86_jumpDestsOfInstr insn
+ = case insn of
+ JXX _ id -> [id]
+ JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
+ _ -> []
+
+
+x86_patchJumpInstr
+ :: Instr -> (BlockId -> BlockId) -> Instr
+
+x86_patchJumpInstr insn patchF
+ = case insn of
+ JXX cc id -> JXX cc (patchF id)
+ JMP_TBL op ids section lbl
+ -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
+ _ -> insn
+ where
+ patchJumpDest f (DestBlockId id) = DestBlockId (f id)
+ patchJumpDest _ dest = dest
+
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- | Make a spill instruction.
+x86_mkSpillInstr
+ :: DynFlags
+ -> Reg -- register to spill
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkSpillInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpReg reg) (OpAddr (spRel dflags off))
+ RcDouble -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
+ _ -> panic "X86.mkSpillInstr: no match"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+-- | Make a spill reload instruction.
+x86_mkLoadInstr
+ :: DynFlags
+ -> Reg -- register to load
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkLoadInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpAddr (spRel dflags off)) (OpReg reg)
+ RcDouble -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
+ _ -> panic "X86.x86_mkLoadInstr"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+spillSlotSize :: Platform -> Int
+spillSlotSize platform = if is32Bit then 12 else 8
+  where is32Bit = target32Bit platform
+
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((rESERVED_C_STACK_BYTES dflags - 64) `div` spillSlotSize (targetPlatform dflags)) - 1
+-- = 0 -- useful for testing allocMoreStack
+
+-- number of bytes that the stack pointer should be aligned to
+stackAlign :: Int
+stackAlign = 16
+
+-- convert a spill slot number to a *byte* offset, with no sign:
+-- decide on a per arch basis whether you are spilling above or below
+-- the C stack pointer.
+spillSlotToOffset :: Platform -> Int -> Int
+spillSlotToOffset platform slot
+ = 64 + spillSlotSize platform * slot
+
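+-- For example, on x86_64 (8-byte slots) slot 0 lives at byte offset 64 and
+-- slot 3 at 64 + 8*3 = 88; on 32-bit x86 the 12-byte slots put slot 3 at
+-- 64 + 12*3 = 100. Correspondingly, maxSpillSlots divides the reserved C
+-- stack bytes, less this 64-byte base, by the slot size (and keeps one slot
+-- in reserve).
+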
+--------------------------------------------------------------------------------
+
+-- | See if this instruction is telling us the current C stack delta
+x86_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+x86_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+x86_isMetaInstr
+ :: Instr
+ -> Bool
+
+x86_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LOCATION{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ UNWIND{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+
+--- TODO: why is there
+-- | Make a reg-reg move instruction.
+-- On SPARC v8 there are no instructions to move directly between
+-- floating point and integer regs. If we need to do that then we
+-- have to go via memory.
+--
+x86_mkRegRegMoveInstr
+ :: Platform
+ -> Reg
+ -> Reg
+ -> Instr
+
+x86_mkRegRegMoveInstr platform src dst
+ = case targetClassOfReg platform src of
+ RcInteger -> case platformArch platform of
+ ArchX86 -> MOV II32 (OpReg src) (OpReg dst)
+ ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
+ _ -> panic "x86_mkRegRegMoveInstr: Bad arch"
+ RcDouble -> MOV FF64 (OpReg src) (OpReg dst)
+     -- this code is the lie we tell ourselves because both float and double
+     -- use the same register class on x86_64 and on 32-bit x86 with SSE2;
+     -- more plainly, both use the XMM registers.
+ _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
+
+-- | Check whether an instruction represents a reg-reg move.
+-- The register allocator attempts to eliminate reg->reg moves whenever it can,
+-- by assigning the src and dest temporaries to the same real register.
+--
+x86_takeRegRegMoveInstr
+ :: Instr
+ -> Maybe (Reg,Reg)
+
+x86_takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
+ = Just (r1,r2)
+
+x86_takeRegRegMoveInstr _ = Nothing
+
+
+-- | Make an unconditional branch instruction.
+x86_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+x86_mkJumpInstr id
+ = [JXX ALWAYS id]
+
+-- Note [Windows stack layout]
+-- | On most OSes the kernel will place a guard page after the current stack
+-- page. If you allocate more than a page's worth you may jump over this
+-- guard page. Not only is this a security issue, but on certain OSes such
+-- as Windows a new page won't be allocated if you don't hit the guard. This
+-- will cause a segfault or access fault.
+--
+-- This function determines whether the current allocation amount requires a probe.
+-- On Windows (for now) we emit a call to _chkstk for this. For other OSes
+-- this is not yet implemented.
+-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
+-- The Windows stack looks like this:
+--
+-- +-------------------+
+-- | SP |
+-- +-------------------+
+-- | |
+-- | GUARD PAGE |
+-- | |
+-- +-------------------+
+-- | |
+-- | |
+-- | UNMAPPED |
+-- | |
+-- | |
+-- +-------------------+
+--
+-- In essence each allocation larger than a page size needs to be chunked and
+-- a probe emitted after each page allocation. You have to hit the guard
+-- page so the kernel can map in the next page, otherwise you'll segfault.
+--
+needs_probe_call :: Platform -> Int -> Bool
+needs_probe_call platform amount
+ = case platformOS platform of
+ OSMinGW32 -> case platformArch platform of
+ ArchX86 -> amount > (4 * 1024)
+ ArchX86_64 -> amount > (8 * 1024)
+ _ -> False
+ _ -> False
+
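+-- For example, on 64-bit Windows an allocation of 4096 bytes is below the
+-- 8 KiB threshold and needs no probe, while one of 16384 bytes does and is
+-- routed through the ___chkstk_ms call sequence emitted below.
+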
+x86_mkStackAllocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackAllocInstr platform amount
+ = case platformOS platform of
+ OSMinGW32 ->
+ -- These will clobber AX but this should be ok because
+ --
+ -- 1. It is the first thing we do when entering the closure and AX is
+      --      a caller-saved register on Windows, both on x86_64 and x86.
+ --
+ -- 2. The closures are only entered via a call or longjmp in which case
+ -- there are no expectations for volatile registers.
+ --
+ -- 3. When the target is a local branch point it is re-targeted
+ -- after the dealloc, preserving #2. See note [extra spill slots].
+ --
+ -- We emit a call because the stack probes are quite involved and
+ -- would bloat code size a lot. GHC doesn't really have an -Os.
+ -- __chkstk is guaranteed to leave all nonvolatile registers and AX
+ -- untouched. It's part of the standard prologue code for any Windows
+ -- function dropping the stack more than a page.
+ -- See Note [Windows stack layout]
+ case platformArch platform of
+ ArchX86 | needs_probe_call platform amount ->
+ [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [eax]
+ , SUB II32 (OpReg eax) (OpReg esp)
+ ]
+ | otherwise ->
+ [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
+ , TEST II32 (OpReg esp) (OpReg esp)
+ ]
+ ArchX86_64 | needs_probe_call platform amount ->
+ [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [rax]
+ , SUB II64 (OpReg rax) (OpReg rsp)
+ ]
+ | otherwise ->
+ [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
+ , TEST II64 (OpReg rsp) (OpReg rsp)
+ ]
+ _ -> panic "x86_mkStackAllocInstr"
+ _ ->
+ case platformArch platform of
+ ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
+ ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
+ _ -> panic "x86_mkStackAllocInstr"
+
+x86_mkStackDeallocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackDeallocInstr platform amount
+ = case platformArch platform of
+ ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
+ ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
+ _ -> panic "x86_mkStackDeallocInstr"
+
+
+--
+-- Note [extra spill slots]
+--
+-- If the register allocator used more spill slots than we have
+-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
+-- C stack space on entry and exit from this proc. Therefore we
+-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
+-- before every non-local jump.
+--
+-- This became necessary when the new codegen started bundling entire
+-- functions together into one proc, because the register allocator
+-- assigns a different stack slot to each virtual reg within a proc.
+-- To avoid using so many slots we could also:
+--
+--  - split up the proc into connected components before code generation
+--
+-- - rename the virtual regs, so that we re-use vreg names and hence
+-- stack slots for non-overlapping vregs.
+--
+-- Note that when a block is both a non-local entry point (with an
+-- info table) and a local branch target, we have to split it into
+-- two, like so:
+--
+-- <info table>
+-- L:
+-- <code>
+--
+-- becomes
+--
+-- <info table>
+-- L:
+-- subl $rsp, N
+-- jmp Lnew
+-- Lnew:
+-- <code>
+--
+-- and all branches pointing to L are retargeted to point to Lnew.
+-- Otherwise, we would repeat the $rsp adjustment for each branch to
+-- L.
+--
+-- Returns a list of (L,Lnew) pairs.
+--
+allocMoreStack
+ :: Platform
+ -> Int
+ -> NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr
+ -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+ let entries = entryBlocks proc
+
+ uniqs <- replicateM (length entries) getUniqueM
+
+ let
+ delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+ where x = slots * spillSlotSize platform -- sp delta
+
+ alloc = mkStackAllocInstr platform delta
+ dealloc = mkStackDeallocInstr platform delta
+
+ retargetList = (zip entries (map mkBlockId uniqs))
+
+ new_blockmap :: LabelMap BlockId
+ new_blockmap = mapFromList retargetList
+
+ insert_stack_insns (BasicBlock id insns)
+ | Just new_blockid <- mapLookup id new_blockmap
+ = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
+ , BasicBlock new_blockid block' ]
+ | otherwise
+ = [ BasicBlock id block' ]
+ where
+ block' = foldr insert_dealloc [] insns
+
+ insert_dealloc insn r = case insn of
+ JMP _ _ -> dealloc ++ (insn : r)
+ JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
+ _other -> x86_patchJumpInstr insn retarget : r
+ where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+ new_code = concatMap insert_stack_insns code
+ -- in
+ return (CmmProc info lbl live (ListGraph new_code), retargetList)
+
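+-- A worked example of the delta computation above (slot count hypothetical):
+-- with 5 extra slots on x86_64 we need x = 5 * 8 = 40 bytes, and
+-- ((40 + 15) `quot` 16) * 16 rounds that up to a 16-byte-aligned delta of 48;
+-- that amount is allocated at every entry point and deallocated before every
+-- non-local jump, as described in Note [extra spill slots].
+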
+data JumpDest = DestBlockId BlockId | DestImm Imm
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+ ppr (DestImm _imm) = text "jd<imm>:noShow"
+
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+getJumpDestBlockId _ = Nothing
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
+canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
+canShortcut _ = Nothing
+
+
+-- This helper shortcuts a sequence of branches.
+-- The blockset helps avoid following cycles.
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
+ where
+ shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
+ shortcutJump' fn seen insn@(JXX cc id) =
+ if setMember id seen then insn
+ else case fn id of
+ Nothing -> insn
+ Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
+ Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
+ where seen' = setInsert id seen
+ shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
+ let updateBlock (Just (DestBlockId bid)) =
+ case fn bid of
+ Nothing -> Just (DestBlockId bid )
+ Just dest -> Just dest
+ updateBlock dest = dest
+ blocks' = map updateBlock blocks
+ in JMP_TBL addr blocks' section tblId
+ shortcutJump' _ _ other = other
+
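+-- An illustrative chain (labels hypothetical): if the mapping reports that
+-- block B merely jumps to C and that C merely jumps to D, then
+-- @shortcutJump fn (JXX cc B)@ rewrites to @JXX cc D@ in two steps; the
+-- 'seen' set stops the recursion should the blocks ever form a cycle.
+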
+-- Here because it knows about JumpDest
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, RawCmmStatics) -> (Alignment, RawCmmStatics)
+shortcutStatics fn (align, RawCmmStatics lbl statics)
+ = (align, RawCmmStatics lbl $ map (shortcutStatic fn) statics)
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+ -- slightly dodgy, we're ignoring the second label, but this
+ -- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+shortBlockId
+ :: (BlockId -> Maybe JumpDest)
+ -> UniqSet Unique
+ -> BlockId
+ -> CLabel
+
+shortBlockId fn seen blockid =
+ case (elementOfUniqSet uq seen, fn blockid) of
+ (True, _) -> blockLbl blockid
+ (_, Nothing) -> blockLbl blockid
+ (_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
+ (_, Just (DestImm (ImmCLbl lbl))) -> lbl
+ (_, _other) -> panic "shortBlockId"
+ where uq = getUnique blockid
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
new file mode 100644
index 0000000000..a5b9041974
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -0,0 +1,1014 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.X86.Ppr (
+ pprNatCmmDecl,
+ pprData,
+ pprInstr,
+ pprFormat,
+ pprImm,
+ pprDataItem,
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Ppr
+
+
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import BasicTypes (Alignment, mkAlignment, alignmentBytes)
+import GHC.Driver.Session
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import Unique ( pprUniqueAlways )
+import GHC.Platform
+import FastString
+import Outputable
+
+import Data.Word
+import Data.Bits
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+--
+--
+-- Note [Subsections Via Symbols]
+--
+-- If we are using the .subsections_via_symbols directive
+-- (available on recent versions of Darwin),
+-- we have to make sure that there is some kind of reference
+-- from the entry code to a label on the _top_ of the info table,
+-- so that the linker will not think it is unreferenced and dead-strip
+-- it. That's why the label is called a DeadStripPreventer (_dsp).
+--
+-- The LLVM code gen already creates `iTableSuf` symbols, where
+-- the X86 would generate the DeadStripPreventer (_dsp) symbol.
+-- Therefore all that is left for the LLVM code gen is to ensure
+-- that all the `iTableSuf` symbols are marked as used.
+-- As of this writing the documentation regarding the
+-- .subsections_via_symbols and -dead_strip can be found at
+-- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101>
+
+pprProcAlignment :: SDoc
+pprProcAlignment = sdocWithDynFlags $ \dflags ->
+ (maybe empty (pprAlign . mkAlignment) (cmmProcAlignment dflags))
+
+pprNatCmmDecl :: NatCmmDecl (Alignment, RawCmmStatics) Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ sdocWithDynFlags $ \dflags ->
+ pprProcAlignment $$
+ case topInfoTable proc of
+ Nothing ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ pprProcAlignment $$
+ pprLabel lbl $$ -- blocks guaranteed not null, so label needed
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ (if debugLevel dflags > 0
+ then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+ pprSizeDecl lbl
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ pprSectionAlign (Section Text info_lbl) $$
+ pprProcAlignment $$
+ (if platformHasSubsectionsViaSymbols platform
+ then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then -- See Note [Subsections Via Symbols]
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty) $$
+ pprSizeDecl info_lbl
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: CLabel -> SDoc
+pprSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform)
+ then text "\t.size" <+> ppr lbl <> ptext (sLit ", .-") <> ppr lbl
+ else empty
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $
+ pprLabel asmLbl $$
+ vcat (map pprInstr instrs) $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+ else empty
+ )
+ where
+ asmLbl = blockLbl blockid
+ maybe_infotable c = case mapLookup blockid info_env of
+ Nothing -> c
+ Just (RawCmmStatics infoLbl info) ->
+ pprAlignForSection Text $$
+ infoTableLoc $$
+ vcat (map pprData info) $$
+ pprLabel infoLbl $$
+ c $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel infoLbl) <> char ':'
+ else empty
+ )
+ -- Make sure the info table has the right .loc for the block
+ -- coming right after it. See [Note: Info Offset]
+ infoTableLoc = case instrs of
+ (l@LOCATION{} : _) -> pprInstr l
+ _other -> empty
+
+
+pprDatas :: (Alignment, RawCmmStatics) -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (_, RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+
+pprDatas (align, (RawCmmStatics lbl dats))
+ = vcat (pprAlign align : pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+
+pprData (CmmUninitialised bytes)
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSDarwin then text ".space " <> int bytes
+ else text ".skip " <> int bytes
+
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".globl " <> ppr lbl
+
+pprLabelType' :: DynFlags -> CLabel -> SDoc
+pprLabelType' dflags lbl =
+ if isCFunctionLabel lbl || functionOkInfoTable then
+ text "@function"
+ else
+ text "@object"
+ where
+ {-
+ NOTE: This is a bit hacky.
+
+    With `tablesNextToCode` enabled, info tables look like this:
+ ```
+ <info table data>
+ label_info:
+ <info table code>
+ ```
+    So actually the info table label points exactly to the code and we can mark
+    the label as @function. (This is required to make perf and potentially other
+    tools work on Haskell binaries.)
+    This usually works well but it can cause issues with the linker.
+    The linker uses different algorithms for the relocation depending on
+    the symbol type. For some reason, the linker will generate a JUMP_SLOT relocation
+    when a constructor info table is referenced from a data section.
+    This only happens with static constructor calls, so
+    we mark _con_info symbols as `@object` to avoid the issue with relocations.
+
+ @SimonMarlow hack explanation:
+ "The reasoning goes like this:
+
+ * The danger when we mark a symbol as `@function` is that the linker will
+ redirect it to point to the PLT and use a `JUMP_SLOT` relocation when
+ the symbol refers to something outside the current shared object.
+ A PLT / JUMP_SLOT reference only works for symbols that we jump to, not
+      for symbols representing data, nor for info table symbol references which
+ we expect to point directly to the info table.
+ * GHC generates code that might refer to any info table symbol from the text
+ segment, but that's OK, because those will be explicit GOT references
+ generated by the code generator.
+ * When we refer to info tables from the data segment, it's either
+ * a FUN_STATIC/THUNK_STATIC local to this module
+ * a `con_info` that could be from anywhere
+
+ So, the only info table symbols that we might refer to from the data segment
+ of another shared object are `con_info` symbols, so those are the ones we
+ need to exclude from getting the @function treatment.
+ "
+
+    A good place to check for more details is
+ https://gitlab.haskell.org/ghc/ghc/wikis/commentary/position-independent-code
+
+ Another possible hack is to create an extra local function symbol for
+    every code-like thing to give the needed information to the tools
+ but mess up with the relocation. https://phabricator.haskell.org/D4730
+ -}
+ functionOkInfoTable = tablesNextToCode dflags &&
+ isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+
+pprTypeDecl :: CLabel -> SDoc
+pprTypeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+ then
+ sdocWithDynFlags $ \df ->
+ text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+pprAlign :: Alignment -> SDoc
+pprAlign alignment
+ = sdocWithPlatform $ \platform ->
+ text ".align " <> int (alignmentOn platform)
+ where
+ bytes = alignmentBytes alignment
+ alignmentOn platform = if platformOS platform == OSDarwin
+ then log2 bytes
+ else bytes
+
+ log2 :: Int -> Int -- cache the common ones
+ log2 1 = 0
+ log2 2 = 1
+ log2 4 = 2
+ log2 8 = 3
+ log2 n = 1 + log2 (n `quot` 2)
+
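+-- For example, a 16-byte alignment is printed as ".align 16" on ELF targets
+-- but as ".align 4" on Darwin, whose assembler reads the operand as a power
+-- of two (2^4 = 16 bytes).
+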
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+pprReg :: Format -> Reg -> SDoc
+pprReg f r
+ = case r of
+ RegReal (RealRegSingle i) ->
+ sdocWithPlatform $ \platform ->
+ if target32Bit platform then ppr32_reg_no f i
+ else ppr64_reg_no f i
+ RegReal (RealRegPair _ _) -> panic "X86.Ppr: no reg pairs on this arch"
+ RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+
+ where
+ ppr32_reg_no :: Format -> Int -> SDoc
+ ppr32_reg_no II8 = ppr32_reg_byte
+ ppr32_reg_no II16 = ppr32_reg_word
+ ppr32_reg_no _ = ppr32_reg_long
+
+ ppr32_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ _ -> sLit $ "very naughty I386 byte register: " ++ show i
+ })
+
+ ppr32_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ _ -> sLit "very naughty I386 word register"
+ })
+
+ ppr32_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ _ -> ppr_reg_float i
+ })
+
+ ppr64_reg_no :: Format -> Int -> SDoc
+ ppr64_reg_no II8 = ppr64_reg_byte
+ ppr64_reg_no II16 = ppr64_reg_word
+ ppr64_reg_no II32 = ppr64_reg_long
+ ppr64_reg_no _ = ppr64_reg_quad
+
+ ppr64_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ 4 -> sLit "%sil"; 5 -> sLit "%dil"; -- new 8-bit regs!
+ 6 -> sLit "%bpl"; 7 -> sLit "%spl";
+ 8 -> sLit "%r8b"; 9 -> sLit "%r9b";
+ 10 -> sLit "%r10b"; 11 -> sLit "%r11b";
+ 12 -> sLit "%r12b"; 13 -> sLit "%r13b";
+ 14 -> sLit "%r14b"; 15 -> sLit "%r15b";
+ _ -> sLit $ "very naughty x86_64 byte register: " ++ show i
+ })
+
+ ppr64_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ 8 -> sLit "%r8w"; 9 -> sLit "%r9w";
+ 10 -> sLit "%r10w"; 11 -> sLit "%r11w";
+ 12 -> sLit "%r12w"; 13 -> sLit "%r13w";
+ 14 -> sLit "%r14w"; 15 -> sLit "%r15w";
+ _ -> sLit "very naughty x86_64 word register"
+ })
+
+ ppr64_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ 8 -> sLit "%r8d"; 9 -> sLit "%r9d";
+ 10 -> sLit "%r10d"; 11 -> sLit "%r11d";
+ 12 -> sLit "%r12d"; 13 -> sLit "%r13d";
+ 14 -> sLit "%r14d"; 15 -> sLit "%r15d";
+ _ -> sLit "very naughty x86_64 register"
+ })
+
+ ppr64_reg_quad i = ptext
+ (case i of {
+ 0 -> sLit "%rax"; 1 -> sLit "%rbx";
+ 2 -> sLit "%rcx"; 3 -> sLit "%rdx";
+ 4 -> sLit "%rsi"; 5 -> sLit "%rdi";
+ 6 -> sLit "%rbp"; 7 -> sLit "%rsp";
+ 8 -> sLit "%r8"; 9 -> sLit "%r9";
+ 10 -> sLit "%r10"; 11 -> sLit "%r11";
+ 12 -> sLit "%r12"; 13 -> sLit "%r13";
+ 14 -> sLit "%r14"; 15 -> sLit "%r15";
+ _ -> ppr_reg_float i
+ })
+
+ppr_reg_float :: Int -> PtrString
+ppr_reg_float i = case i of
+ 16 -> sLit "%xmm0" ; 17 -> sLit "%xmm1"
+ 18 -> sLit "%xmm2" ; 19 -> sLit "%xmm3"
+ 20 -> sLit "%xmm4" ; 21 -> sLit "%xmm5"
+ 22 -> sLit "%xmm6" ; 23 -> sLit "%xmm7"
+ 24 -> sLit "%xmm8" ; 25 -> sLit "%xmm9"
+ 26 -> sLit "%xmm10"; 27 -> sLit "%xmm11"
+ 28 -> sLit "%xmm12"; 29 -> sLit "%xmm13"
+ 30 -> sLit "%xmm14"; 31 -> sLit "%xmm15"
+ _ -> sLit "very naughty x86 register"
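+-- For illustration: the same register number prints differently depending on
+-- the requested format; e.g. on x86_64, register number 0 renders as
+--   pprReg II8  (regSingle 0)  ==>  %al
+--   pprReg II32 (regSingle 0)  ==>  %eax
+--   pprReg II64 (regSingle 0)  ==>  %rax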
+
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "w"
+ II32 -> sLit "l"
+ II64 -> sLit "q"
+ FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2)
+ FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2)
+ )
+
+pprFormat_x87 :: Format -> SDoc
+pprFormat_x87 x
+ = ptext $ case x of
+ FF32 -> sLit "s"
+ FF64 -> sLit "l"
+ _ -> panic "X86.Ppr.pprFormat_x87"
+
+
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext (case c of {
+ GEU -> sLit "ae"; LU -> sLit "b";
+ EQQ -> sLit "e"; GTT -> sLit "g";
+ GE -> sLit "ge"; GU -> sLit "a";
+ LTT -> sLit "l"; LE -> sLit "le";
+ LEU -> sLit "be"; NE -> sLit "ne";
+ NEG -> sLit "s"; POS -> sLit "ns";
+ CARRY -> sLit "c"; OFLO -> sLit "o";
+ PARITY -> sLit "p"; NOTPARITY -> sLit "np";
+ ALWAYS -> sLit "mp"})
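+-- Note that ALWAYS renders as "mp": callers prefix the condition with 'j'
+-- (see pprCondInstr, used by JXX/JXX_GBL below), so an unconditional jump
+-- comes out as "jmp".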
+
+
+pprImm :: Imm -> SDoc
+pprImm (ImmInt i) = int i
+pprImm (ImmInteger i) = integer i
+pprImm (ImmCLbl l) = ppr l
+pprImm (ImmIndex l i) = ppr l <> char '+' <> int i
+pprImm (ImmLit s) = s
+
+pprImm (ImmFloat _) = text "naughty float immediate"
+pprImm (ImmDouble _) = text "naughty double immediate"
+
+pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b
+pprImm (ImmConstantDiff a b) = pprImm a <> char '-'
+ <> lparen <> pprImm b <> rparen
+
+
+
+pprAddr :: AddrMode -> SDoc
+pprAddr (ImmAddr imm off)
+ = let pp_imm = pprImm imm
+ in
+ if (off == 0) then
+ pp_imm
+ else if (off < 0) then
+ pp_imm <> int off
+ else
+ pp_imm <> char '+' <> int off
+
+pprAddr (AddrBaseIndex base index displacement)
+ = sdocWithPlatform $ \platform ->
+ let
+ pp_disp = ppr_disp displacement
+ pp_off p = pp_disp <> char '(' <> p <> char ')'
+ pp_reg r = pprReg (archWordFormat (target32Bit platform)) r
+ in
+ case (base, index) of
+ (EABaseNone, EAIndexNone) -> pp_disp
+ (EABaseReg b, EAIndexNone) -> pp_off (pp_reg b)
+ (EABaseRip, EAIndexNone) -> pp_off (text "%rip")
+ (EABaseNone, EAIndex r i) -> pp_off (comma <> pp_reg r <> comma <> int i)
+ (EABaseReg b, EAIndex r i) -> pp_off (pp_reg b <> comma <> pp_reg r
+ <> comma <> int i)
+ _ -> panic "X86.Ppr.pprAddr: no match"
+
+ where
+ ppr_disp (ImmInt 0) = empty
+ ppr_disp imm = pprImm imm
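+-- For illustration, a base+index+displacement operand renders in AT&T syntax
+-- roughly as (on a 64-bit target, with rbx/rcx from GHC.CmmToAsm.X86.Regs):
+--   pprAddr (AddrBaseIndex (EABaseReg rbx) (EAIndex rcx 8) (ImmInt 16))
+--     ==>  16(%rbx,%rcx,8)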
+
+-- | Print section header and appropriate alignment for that section.
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign (Section (OtherSection _) _) =
+ panic "X86.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ sdocWithPlatform $ \platform ->
+ text ".align " <>
+ case platformOS platform of
+ -- Darwin: alignments are given as shifts.
+ OSDarwin
+ | target32Bit platform ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 2
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 3
+ -- Other: alignments are given as bytes.
+ _
+ | target32Bit platform ->
+ case seg of
+ Text -> text "4,0x90"
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 4
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 8
+
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit = sdocWithDynFlags $ \dflags -> pprDataItem' dflags lit
+
+pprDataItem' :: DynFlags -> CmmLit -> SDoc
+pprDataItem' dflags lit
+ = vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit)
+ where
+ platform = targetPlatform dflags
+ imm = litToImm lit
+
+ -- These seem to be common:
+ ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm]
+ ppr_item II16 _ = [text "\t.word\t" <> pprImm imm]
+ ppr_item II32 _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item FF32 (CmmFloat r _)
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _)
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II64 _
+ = case platformOS platform of
+ OSDarwin
+ | target32Bit platform ->
+ case lit of
+ CmmInt x _ ->
+ [text "\t.long\t"
+ <> int (fromIntegral (fromIntegral x :: Word32)),
+ text "\t.long\t"
+ <> int (fromIntegral
+ (fromIntegral (x `shiftR` 32) :: Word32))]
+ _ -> panic "X86.Ppr.ppr_item: no match for II64"
+ | otherwise ->
+ [text "\t.quad\t" <> pprImm imm]
+ _
+ | target32Bit platform ->
+ [text "\t.quad\t" <> pprImm imm]
+ | otherwise ->
+ -- x86_64: binutils can't handle the R_X86_64_PC64
+ -- relocation type, which means we can't do
+ -- pc-relative 64-bit addresses. Fortunately we're
+ -- assuming the small memory model, in which all such
+ -- offsets will fit into 32 bits, so we have to stick
+ -- to 32-bit offset fields and modify the RTS
+ -- appropriately
+ --
+ -- See Note [x86-64-relative] in includes/rts/storage/InfoTables.h
+ --
+ case lit of
+ -- A relative relocation:
+ CmmLabelDiffOff _ _ _ _ ->
+ [text "\t.long\t" <> pprImm imm,
+ text "\t.long\t0"]
+ _ ->
+ [text "\t.quad\t" <> pprImm imm]
+
+ ppr_item _ _
+ = panic "X86.Ppr.ppr_item: no match"
+
+
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "# " <> c
+
+pprInstr :: Instr -> SDoc
+
+pprInstr (COMMENT s)
+ = asmComment (ftext s)
+
+pprInstr (LOCATION file line col _name)
+ = text "\t.loc " <> ppr file <+> ppr line <+> ppr col
+
+pprInstr (DELTA d)
+ = asmComment $ text ("\tdelta = " ++ show d)
+
+pprInstr (NEWBLOCK _)
+ = panic "PprMach.pprInstr: NEWBLOCK"
+
+pprInstr (UNWIND lbl d)
+ = asmComment (text "\tunwind = " <> ppr d)
+ $$ ppr lbl <> colon
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+{-
+pprInstr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char ' ',
+ pprUserReg reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+pprInstr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char ' ',
+ text "SLOT" <> parens (int slot),
+ comma,
+ pprUserReg reg]
+-}
+
+-- Replace 'mov $0x0,%reg' by 'xor %reg,%reg', which is smaller and cheaper.
+-- The code generator catches most of these already, but not all.
+pprInstr (MOV format (OpImm (ImmInt 0)) dst@(OpReg _))
+ = pprInstr (XOR format' dst dst)
+ where format' = case format of
+ II64 -> II32 -- 32-bit version is equivalent, and smaller
+ _ -> format
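+-- For illustration, on x86_64:
+--   MOV II64 (OpImm (ImmInt 0)) (OpReg rax)  ==>  xorl %eax,%eax
+-- i.e. the zeroing move is narrowed to the shorter 32-bit xor encoding.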
+pprInstr (MOV format src dst)
+ = pprFormatOpOp (sLit "mov") format src dst
+
+pprInstr (CMOV cc format src dst)
+ = pprCondOpReg (sLit "cmov") format cc src dst
+
+pprInstr (MOVZxL II32 src dst) = pprFormatOpOp (sLit "mov") II32 src dst
+ -- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
+ -- movl. But we represent it as a MOVZxL instruction, because
+ -- the reg alloc would tend to throw away a plain reg-to-reg
+ -- move, and we still want it to do that.
+
+pprInstr (MOVZxL formats src dst)
+ = pprFormatOpOpCoerce (sLit "movz") formats II32 src dst
+ -- zero-extension only needs to extend to 32 bits: on x86_64,
+ -- the remaining zero-extension to 64 bits is automatic, and the 32-bit
+ -- instruction is shorter.
+
+pprInstr (MOVSxL formats src dst)
+ = sdocWithPlatform $ \platform ->
+ pprFormatOpOpCoerce (sLit "movs") formats (archWordFormat (target32Bit platform)) src dst
+
+-- here we do some patching, since the physical registers are only set late
+-- in the code generation.
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg2) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg2 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg1) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) EAIndexNone displ)) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprInstr (ADD format (OpImm displ) dst)
+
+pprInstr (LEA format src dst) = pprFormatOpOp (sLit "lea") format src dst
+
+pprInstr (ADD format (OpImm (ImmInt (-1))) dst)
+ = pprFormatOp (sLit "dec") format dst
+pprInstr (ADD format (OpImm (ImmInt 1)) dst)
+ = pprFormatOp (sLit "inc") format dst
+pprInstr (ADD format src dst) = pprFormatOpOp (sLit "add") format src dst
+pprInstr (ADC format src dst) = pprFormatOpOp (sLit "adc") format src dst
+pprInstr (SUB format src dst) = pprFormatOpOp (sLit "sub") format src dst
+pprInstr (SBB format src dst) = pprFormatOpOp (sLit "sbb") format src dst
+pprInstr (IMUL format op1 op2) = pprFormatOpOp (sLit "imul") format op1 op2
+
+pprInstr (ADD_CC format src dst)
+ = pprFormatOpOp (sLit "add") format src dst
+pprInstr (SUB_CC format src dst)
+ = pprFormatOpOp (sLit "sub") format src dst
+
+{- A hack. The Intel documentation says that "The two and three
+ operand forms [of IMUL] may also be used with unsigned operands
+ because the lower half of the product is the same regardless if
+ (sic) the operands are signed or unsigned. The CF and OF flags,
+ however, cannot be used to determine if the upper half of the
+ result is non-zero." So there.
+-}
+
+-- Use a 32-bit instruction when possible as it saves a byte.
+-- Notably, extracting the tag bits of a pointer has this form.
+-- TODO: we could save a byte in a subsequent CMP instruction too,
+-- but need something like a peephole pass for this
+pprInstr (AND II64 src@(OpImm (ImmInteger mask)) dst)
+ | 0 <= mask && mask < 0xffffffff
+ = pprInstr (AND II32 src dst)
+pprInstr (AND FF32 src dst) = pprOpOp (sLit "andps") FF32 src dst
+pprInstr (AND FF64 src dst) = pprOpOp (sLit "andpd") FF64 src dst
+pprInstr (AND format src dst) = pprFormatOpOp (sLit "and") format src dst
+pprInstr (OR format src dst) = pprFormatOpOp (sLit "or") format src dst
+
+pprInstr (XOR FF32 src dst) = pprOpOp (sLit "xorps") FF32 src dst
+pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst
+pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst
+
+pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
+pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst)
+pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst)
+pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
+pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
+
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
+pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
+pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
+pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
+pprInstr (PREFETCH Lvl2 format src) = pprFormatOp_ (sLit "prefetcht2") format src
+
+pprInstr (NOT format op) = pprFormatOp (sLit "not") format op
+pprInstr (BSWAP format op) = pprFormatOp (sLit "bswap") format (OpReg op)
+pprInstr (NEGI format op) = pprFormatOp (sLit "neg") format op
+
+pprInstr (SHL format src dst) = pprShift (sLit "shl") format src dst
+pprInstr (SAR format src dst) = pprShift (sLit "sar") format src dst
+pprInstr (SHR format src dst) = pprShift (sLit "shr") format src dst
+
+pprInstr (BT format imm src) = pprFormatImmOp (sLit "bt") format imm src
+
+pprInstr (CMP format src dst)
+ | isFloatFormat format = pprFormatOpOp (sLit "ucomi") format src dst -- SSE2
+ | otherwise = pprFormatOpOp (sLit "cmp") format src dst
+
+pprInstr (TEST format src dst) = sdocWithPlatform $ \platform ->
+ let format' = case (src,dst) of
+ -- Match instructions like 'test $0x3,%esi' or 'test $0x7,%rbx'.
+ -- We can replace them by equivalent, but smaller instructions
+ -- by reducing the size of the immediate operand as far as possible.
+ -- (We could handle masks larger than a single byte too,
+ -- but it would complicate the code considerably
+ -- and tag checks are by far the most common case.)
+ -- The mask must have the high bit clear for this smaller encoding
+ -- to be completely equivalent to the original; in particular so
+ -- that the signed comparison condition bits are the same as they
+ -- would be if doing a full word comparison. See #13425.
+ (OpImm (ImmInteger mask), OpReg dstReg)
+ | 0 <= mask && mask < 128 -> minSizeOfReg platform dstReg
+ _ -> format
+ in pprFormatOpOp (sLit "test") format' src dst
+ where
+ minSizeOfReg platform (RegReal (RealRegSingle i))
+ | target32Bit platform && i <= 3 = II8 -- al, bl, cl, dl
+ | target32Bit platform && i <= 7 = II16 -- si, di, bp, sp
+ | not (target32Bit platform) && i <= 15 = II8 -- al .. r15b
+ minSizeOfReg _ _ = format -- other
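+-- For illustration, a pointer-tag check on x86_64 such as
+--   TEST II64 (OpImm (ImmInteger 7)) (OpReg rbx)  ==>  testb $7,%bl
+-- uses the one-byte immediate form instead of a full-width test.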
+
+pprInstr (PUSH format op) = pprFormatOp (sLit "push") format op
+pprInstr (POP format op) = pprFormatOp (sLit "pop") format op
+
+-- both unused (SDM):
+-- pprInstr PUSHA = text "\tpushal"
+-- pprInstr POPA = text "\tpopal"
+
+pprInstr NOP = text "\tnop"
+pprInstr (CLTD II8) = text "\tcbtw"
+pprInstr (CLTD II16) = text "\tcwtd"
+pprInstr (CLTD II32) = text "\tcltd"
+pprInstr (CLTD II64) = text "\tcqto"
+pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x
+
+pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
+
+pprInstr (JXX cond blockid)
+ = pprCondInstr (sLit "j") cond (ppr lab)
+ where lab = blockLbl blockid
+
+pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm)
+
+pprInstr (JMP (OpImm imm) _) = text "\tjmp " <> pprImm imm
+pprInstr (JMP op _) = sdocWithPlatform $ \platform ->
+ text "\tjmp *"
+ <> pprOperand (archWordFormat (target32Bit platform)) op
+pprInstr (JMP_TBL op _ _ _) = pprInstr (JMP op [])
+pprInstr (CALL (Left imm) _) = text "\tcall " <> pprImm imm
+pprInstr (CALL (Right reg) _) = sdocWithPlatform $ \platform ->
+ text "\tcall *"
+ <> pprReg (archWordFormat (target32Bit platform)) reg
+
+pprInstr (IDIV fmt op) = pprFormatOp (sLit "idiv") fmt op
+pprInstr (DIV fmt op) = pprFormatOp (sLit "div") fmt op
+pprInstr (IMUL2 fmt op) = pprFormatOp (sLit "imul") fmt op
+
+-- x86_64 only
+pprInstr (MUL format op1 op2) = pprFormatOpOp (sLit "mul") format op1 op2
+pprInstr (MUL2 format op) = pprFormatOp (sLit "mul") format op
+
+pprInstr (FDIV format op1 op2) = pprFormatOpOp (sLit "div") format op1 op2
+pprInstr (SQRT format op1 op2) = pprFormatOpReg (sLit "sqrt") format op1 op2
+
+pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to
+pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to
+pprInstr (CVTTSS2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttss2si") FF32 fmt from to
+pprInstr (CVTTSD2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttsd2si") FF64 fmt from to
+pprInstr (CVTSI2SS fmt from to) = pprFormatOpReg (sLit "cvtsi2ss") fmt from to
+pprInstr (CVTSI2SD fmt from to) = pprFormatOpReg (sLit "cvtsi2sd") fmt from to
+
+ -- FETCHGOT for PIC on ELF platforms
+pprInstr (FETCHGOT reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ],
+ hcat [ text "\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), ",
+ pprReg II32 reg ]
+ ]
+
+ -- FETCHPC for PIC on Darwin/x86
+ -- get the instruction pointer into a register
+ -- (Terminology note: the IP is called Program Counter on PPC,
+ -- and it's a good thing to use the same name on both platforms)
+pprInstr (FETCHPC reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ]
+ ]
+
+
+-- X87 store:
+-- GST fmt src addr ==> FLD dst ; FSTPsz addr
+pprInstr g@(X87Store fmt addr)
+ = pprX87 g (hcat [gtab,
+ text "fstp", pprFormat_x87 fmt, gsp, pprAddr addr])
+
+
+-- Atomics
+
+pprInstr (LOCK i) = text "\tlock" $$ pprInstr i
+
+pprInstr MFENCE = text "\tmfence"
+
+pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
+
+pprInstr (CMPXCHG format src dst)
+ = pprFormatOpOp (sLit "cmpxchg") format src dst
+
+
+
+--------------------------
+-- some leftover helpers
+
+
+
+gtab :: SDoc
+gtab = char '\t'
+
+gsp :: SDoc
+gsp = char ' '
+
+
+
+pprX87 :: Instr -> SDoc -> SDoc
+pprX87 fake actual
+ = (char '#' <> pprX87Instr fake) $$ actual
+
+pprX87Instr :: Instr -> SDoc
+pprX87Instr (X87Store fmt dst) = pprFormatAddr (sLit "gst") fmt dst
+pprX87Instr _ = panic "X86.Ppr.pprX87Instr: no match"
+
+pprDollImm :: Imm -> SDoc
+pprDollImm i = text "$" <> pprImm i
+
+
+pprOperand :: Format -> Operand -> SDoc
+pprOperand f (OpReg r) = pprReg f r
+pprOperand _ (OpImm i) = pprDollImm i
+pprOperand _ (OpAddr ea) = pprAddr ea
+
+
+pprMnemonic_ :: PtrString -> SDoc
+pprMnemonic_ name =
+ char '\t' <> ptext name <> space
+
+
+pprMnemonic :: PtrString -> Format -> SDoc
+pprMnemonic name format =
+ char '\t' <> ptext name <> pprFormat format <> space
+
+
+pprFormatImmOp :: PtrString -> Format -> Imm -> Operand -> SDoc
+pprFormatImmOp name format imm op1
+ = hcat [
+ pprMnemonic name format,
+ char '$',
+ pprImm imm,
+ comma,
+ pprOperand format op1
+ ]
+
+
+pprFormatOp_ :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp_ name format op1
+ = hcat [
+ pprMnemonic_ name ,
+ pprOperand format op1
+ ]
+
+pprFormatOp :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp name format op1
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1
+ ]
+
+
+pprFormatOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOp name format op1 op2
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+pprOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprOpOp name format op1 op2
+ = hcat [
+ pprMnemonic_ name,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+
+pprRegReg :: PtrString -> Reg -> Reg -> SDoc
+pprRegReg name reg1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic_ name,
+ pprReg (archWordFormat (target32Bit platform)) reg1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+
+pprFormatOpReg :: PtrString -> Format -> Operand -> Reg -> SDoc
+pprFormatOpReg name format op1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+pprCondOpReg :: PtrString -> Format -> Cond -> Operand -> Reg -> SDoc
+pprCondOpReg name format cond op1 reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ pprCond cond,
+ space,
+ pprOperand format op1,
+ comma,
+ pprReg format reg2
+ ]
+
+pprFormatFormatOpReg :: PtrString -> Format -> Format -> Operand -> Reg -> SDoc
+pprFormatFormatOpReg name format1 format2 op1 reg2
+ = hcat [
+ pprMnemonic name format2,
+ pprOperand format1 op1,
+ comma,
+ pprReg format2 reg2
+ ]
+
+pprFormatOpOpReg :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
+
+
+
+pprFormatAddr :: PtrString -> Format -> AddrMode -> SDoc
+pprFormatAddr name format op
+ = hcat [
+ pprMnemonic name format,
+ comma,
+ pprAddr op
+ ]
+
+pprShift :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprShift name format src dest
+ = hcat [
+ pprMnemonic name format,
+ pprOperand II8 src, -- src is 8-bit sized
+ comma,
+ pprOperand format dest
+ ]
+
+
+pprFormatOpOpCoerce :: PtrString -> Format -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOpCoerce name format1 format2 op1 op2
+ = hcat [ char '\t', ptext name, pprFormat format1, pprFormat format2, space,
+ pprOperand format1 op1,
+ comma,
+ pprOperand format2 op2
+ ]
+
+
+pprCondInstr :: PtrString -> Cond -> SDoc -> SDoc
+pprCondInstr name cond arg
+ = hcat [ char '\t', ptext name, pprCond cond, space, arg]
diff --git a/compiler/GHC/CmmToAsm/X86/RegInfo.hs b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
new file mode 100644
index 0000000000..597efe1c3e
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
@@ -0,0 +1,73 @@
+{-# LANGUAGE CPP #-}
+module GHC.CmmToAsm.X86.RegInfo (
+ mkVirtualReg,
+ regDotColor
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import Outputable
+import GHC.Platform
+import Unique
+
+import UniqFM
+import GHC.CmmToAsm.X86.Regs
+
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ = case format of
+ FF32 -> VirtualRegD u
+ -- for scalar F32, we use the same xmm as F64!
+ -- this is a hack that needs some improvement.
+ -- For now we map both to being allocated as "Double" Registers
+ -- on X86/X86_64
+ FF64 -> VirtualRegD u
+ _other -> VirtualRegI u
+
+regDotColor :: Platform -> RealReg -> SDoc
+regDotColor platform reg
+ = case (lookupUFM (regColors platform) reg) of
+ Just str -> text str
+ _ -> panic "Register not assigned a color"
+
+regColors :: Platform -> UniqFM [Char]
+regColors platform = listToUFM (normalRegColors platform)
+
+normalRegColors :: Platform -> [(Reg,String)]
+normalRegColors platform =
+ zip (map regSingle [0..lastint platform]) colors
+ ++ zip (map regSingle [firstxmm..lastxmm platform]) greys
+ where
+ -- 16 colors - enough for amd64 gp regs
+ colors = ["#800000","#ff0000","#808000","#ffff00","#008000"
+ ,"#00ff00","#008080","#00ffff","#000080","#0000ff"
+ ,"#800080","#ff00ff","#87005f","#875f00","#87af00"
+ ,"#ff00af"]
+
+ -- 16 shades of grey, enough for the currently supported
+ -- SSE extensions.
+ greys = ["#0e0e0e","#1c1c1c","#2a2a2a","#383838","#464646"
+ ,"#545454","#626262","#707070","#7e7e7e","#8c8c8c"
+ ,"#9a9a9a","#a8a8a8","#b6b6b6","#c4c4c4","#d2d2d2"
+ ,"#e0e0e0"]
+
+
+
+-- 32 shades of grey - use for avx 512 if we ever need it
+-- greys = ["#070707","#0e0e0e","#151515","#1c1c1c"
+-- ,"#232323","#2a2a2a","#313131","#383838","#3f3f3f"
+-- ,"#464646","#4d4d4d","#545454","#5b5b5b","#626262"
+-- ,"#696969","#707070","#777777","#7e7e7e","#858585"
+-- ,"#8c8c8c","#939393","#9a9a9a","#a1a1a1","#a8a8a8"
+-- ,"#afafaf","#b6b6b6","#bdbdbd","#c4c4c4","#cbcbcb"
+-- ,"#d2d2d2","#d9d9d9","#e0e0e0"]
+
+
diff --git a/compiler/GHC/CmmToAsm/X86/Regs.hs b/compiler/GHC/CmmToAsm/X86/Regs.hs
new file mode 100644
index 0000000000..87e31a1428
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Regs.hs
@@ -0,0 +1,442 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.X86.Regs (
+ -- squeeze functions for the graph allocator
+ virtualRegSqueeze,
+ realRegSqueeze,
+
+ -- immediates
+ Imm(..),
+ strImmLit,
+ litToImm,
+
+ -- addressing modes
+ AddrMode(..),
+ addrOffset,
+
+ -- registers
+ spRel,
+ argRegs,
+ allArgRegs,
+ allIntArgRegs,
+ callClobberedRegs,
+ instrClobberedRegs,
+ allMachRegNos,
+ classOfRealReg,
+ showReg,
+
+ -- machine specific
+ EABase(..), EAIndex(..), addrModeRegs,
+
+ eax, ebx, ecx, edx, esi, edi, ebp, esp,
+
+
+ rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ lastint,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+ xmm,
+ firstxmm, lastxmm,
+
+ ripRel,
+ allFPArgRegs,
+
+ allocatableRegs
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Regs
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel )
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+
+import qualified Data.Array as A
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegD{} -> 1
+ VirtualRegF{} -> 0
+ _other -> 0
+
+
+ _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo >= firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+
+ _other -> 0
+
+-- -----------------------------------------------------------------------------
+-- Immediates
+
+data Imm
+ = ImmInt Int
+ | ImmInteger Integer -- Sigh.
+ | ImmCLbl CLabel -- AbstractC Label (with baggage)
+ | ImmLit SDoc -- Simple string
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w) = ImmInteger (narrowS w i)
+ -- narrow to the width: a CmmInt might be out of
+ -- range, but we assume that ImmInteger only contains
+ -- in-range values. A signed value should be fine here.
+litToImm (CmmFloat f W32) = ImmFloat f
+litToImm (CmmFloat f W64) = ImmDouble f
+litToImm (CmmLabel l) = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+ = ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+litToImm _ = panic "X86.Regs.litToImm: no match"
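+-- For illustration, a label difference with an offset becomes a constant
+-- expression over immediates:
+--   litToImm (CmmLabelDiffOff l1 l2 8 W64)
+--     == ImmConstantSum (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2)) (ImmInt 8)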
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+ = AddrBaseIndex EABase EAIndex Displacement
+ | ImmAddr Imm Int
+
+data EABase = EABaseNone | EABaseReg Reg | EABaseRip
+data EAIndex = EAIndexNone | EAIndex Reg Int
+type Displacement = Imm
+
+
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ ImmAddr i off0 -> Just (ImmAddr i (off0 + off))
+
+ AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off)))
+ AddrBaseIndex r i (ImmInteger n)
+ -> Just (AddrBaseIndex r i (ImmInt (fromInteger (n + toInteger off))))
+
+ AddrBaseIndex r i (ImmCLbl lbl)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl off))
+
+ AddrBaseIndex r i (ImmIndex lbl ix)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl (ix+off)))
+
+ _ -> Nothing -- in theory, shouldn't happen
+
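+-- For illustration, bumping a constant displacement:
+--   addrOffset (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 8)) 16
+--     == Just (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 24))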
+
+addrModeRegs :: AddrMode -> [Reg]
+addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs
+ where
+ b_regs = case b of { EABaseReg r -> [r]; _ -> [] }
+ i_regs = case i of { EAIndex r _ -> [r]; _ -> [] }
+addrModeRegs _ = []
+
+
+-- registers -------------------------------------------------------------------
+
+-- @spRel@ gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments. @fpRel@, where
+-- applicable, is the same but for the frame pointer.
+
+
+spRel :: DynFlags
+ -> Int -- ^ desired stack offset in bytes, positive or negative
+ -> AddrMode
+spRel dflags n
+ | target32Bit (targetPlatform dflags)
+ = AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt n)
+ | otherwise
+ = AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt n)
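+-- For illustration, on a 64-bit target:
+--   spRel dflags 8  ==  AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 8)
+-- which the pretty-printer renders as 8(%rsp); 32-bit targets use %esp.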
+
+-- The register numbers must fit into 32 bits on x86, so that we can
+-- use a Word32 to represent the set of free registers in the register
+-- allocator.
+
+
+
+firstxmm :: RegNo
+firstxmm = 16
+
+-- on 32-bit platforms, only the first 8 XMM/YMM/ZMM registers are available
+lastxmm :: Platform -> RegNo
+lastxmm platform
+ | target32Bit platform = firstxmm + 7 -- xmm0 - xmm7
+ | otherwise = firstxmm + 15 -- xmm0 - xmm15
+
+lastint :: Platform -> RegNo
+lastint platform
+ | target32Bit platform = 7 -- not %r8..%r15
+ | otherwise = 15
+
+intregnos :: Platform -> [RegNo]
+intregnos platform = [0 .. lastint platform]
+
+
+
+xmmregnos :: Platform -> [RegNo]
+xmmregnos platform = [firstxmm .. lastxmm platform]
+
+floatregnos :: Platform -> [RegNo]
+floatregnos platform = xmmregnos platform
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), it is empty.
+-- Sparc passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs _ = panic "MachRegs.argRegs(x86): should not be used!"
+
+-- | The complete set of machine registers.
+allMachRegNos :: Platform -> [RegNo]
+allMachRegNos platform = intregnos platform ++ floatregnos platform
+
+-- | Take the class of a register.
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: Platform -> RealReg -> RegClass
+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+classOfRealReg platform reg
+ = case reg of
+ RealRegSingle i
+ | i <= lastint platform -> RcInteger
+ | i <= lastxmm platform -> RcDouble
+ | otherwise -> panic "X86.Reg.classOfRealReg registerSingle too high"
+ _ -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+
+-- | Get the name of the register with this number.
+-- NOTE: FIXME, we don't track which "way" the XMM registers are used
+showReg :: Platform -> RegNo -> String
+showReg platform n
+ | n >= firstxmm && n <= lastxmm platform = "%xmm" ++ show (n-firstxmm)
+ | n >= 8 && n < firstxmm = "%r" ++ show n
+ | otherwise = regNames platform A.! n
+
+regNames :: Platform -> A.Array Int String
+regNames platform
+ = if target32Bit platform
+ then A.listArray (0,7) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+ else A.listArray (0,7) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+
+
+
+-- machine specific ------------------------------------------------------------
+
+
+{-
+Intel x86 architecture:
+- All registers except 7 (esp) are available for use.
+- Only ebx, esi, edi and esp are available across a C call (they are callee-saves).
+- Registers 0-7 have 16-bit counterparts (ax, bx etc.)
+- Registers 0-3 have 8 bit counterparts (ah, bh etc.)
+
+The fp registers are all Double registers; we don't have any RcFloat class
+regs. @regClass@ barfs if you give it a VirtualRegF, and mkVReg above should
+never generate them.
+
+TODO: cleanup modelling float vs double registers and how they are the same class.
+-}
+
+
+eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
+
+eax = regSingle 0
+ebx = regSingle 1
+ecx = regSingle 2
+edx = regSingle 3
+esi = regSingle 4
+edi = regSingle 5
+ebp = regSingle 6
+esp = regSingle 7
+
+
+
+
+{-
+AMD x86_64 architecture:
+- All 16 integer registers are addressable as 8, 16, 32 and 64-bit values:
+
+ 8 16 32 64
+ ---------------------
+ al ax eax rax
+ bl bx ebx rbx
+ cl cx ecx rcx
+ dl dx edx rdx
+ sil si esi rsi
+ dil di edi rdi
+ bpl bp ebp rbp
+ spl sp esp rsp
+ r8b r8w r8d r8
+ r9b r9w r9d r9
+ r10b r10w r10d r10
+ r11b r11w r11d r11
+ r12b r12w r12d r12
+ r13b r13w r13d r13
+ r14b r14w r14d r14
+ r15b r15w r15d r15
+-}
+
+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg
+
+rax = regSingle 0
+rbx = regSingle 1
+rcx = regSingle 2
+rdx = regSingle 3
+rsi = regSingle 4
+rdi = regSingle 5
+rbp = regSingle 6
+rsp = regSingle 7
+r8 = regSingle 8
+r9 = regSingle 9
+r10 = regSingle 10
+r11 = regSingle 11
+r12 = regSingle 12
+r13 = regSingle 13
+r14 = regSingle 14
+r15 = regSingle 15
+xmm0 = regSingle 16
+xmm1 = regSingle 17
+xmm2 = regSingle 18
+xmm3 = regSingle 19
+xmm4 = regSingle 20
+xmm5 = regSingle 21
+xmm6 = regSingle 22
+xmm7 = regSingle 23
+xmm8 = regSingle 24
+xmm9 = regSingle 25
+xmm10 = regSingle 26
+xmm11 = regSingle 27
+xmm12 = regSingle 28
+xmm13 = regSingle 29
+xmm14 = regSingle 30
+xmm15 = regSingle 31
+
+ripRel :: Displacement -> AddrMode
+ripRel imm = AddrBaseIndex EABaseRip EAIndexNone imm
+
+
+ -- so we can re-use some x86 code:
+{-
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp
+-}
+
+xmm :: RegNo -> Reg
+xmm n = regSingle (firstxmm+n)
+
+
+
+
+-- | These are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+-- caller-saves registers
+callClobberedRegs platform
+ | target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
+ | platformOS platform == OSMinGW32
+ = [rax,rcx,rdx,r8,r9,r10,r11]
+ -- Only xmm0-5 are caller-saves registers on 64bit windows.
+ -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+ -- For details check the Win64 ABI.
+ ++ map xmm [0 .. 5]
+ | otherwise
+ -- all xmm regs are caller-saves
+ -- caller-saves registers
+ = [rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11]
+ ++ map regSingle (floatregnos platform)
+
+allArgRegs :: Platform -> [(Reg, Reg)]
+allArgRegs platform
+ | platformOS platform == OSMinGW32 = zip [rcx,rdx,r8,r9]
+ (map regSingle [firstxmm ..])
+ | otherwise = panic "X86.Regs.allArgRegs: not defined for this arch"
+
+allIntArgRegs :: Platform -> [Reg]
+allIntArgRegs platform
+ | (platformOS platform == OSMinGW32) || target32Bit platform
+ = panic "X86.Regs.allIntArgRegs: not defined for this platform"
+ | otherwise = [rdi,rsi,rdx,rcx,r8,r9]
+
+
+-- | on 64bit platforms we pass the first 8 float/double arguments
+-- in the xmm registers.
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ | platformOS platform == OSMinGW32
+ = panic "X86.Regs.allFPArgRegs: not defined for this platform"
+ | otherwise = map regSingle [firstxmm .. firstxmm + 7 ]
+
+
+-- Machine registers which might be clobbered by instructions that
+-- generate results into fixed registers, or need arguments in a fixed
+-- register.
+instrClobberedRegs :: Platform -> [Reg]
+instrClobberedRegs platform
+ | target32Bit platform = [ eax, ecx, edx ]
+ | otherwise = [ rax, rcx, rdx ]
+
+--
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs for which we are prepared to allow the
+-- register allocator to attempt to map VRegs to.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+ = let isFree i = freeReg platform i
+ in map RealRegSingle $ filter isFree (allMachRegNos platform)
+
diff --git a/compiler/GHC/CmmToC.hs b/compiler/GHC/CmmToC.hs
index f7245f5c30..6e6f58ba7d 100644
--- a/compiler/GHC/CmmToC.hs
+++ b/compiler/GHC/CmmToC.hs
@@ -39,7 +39,7 @@ import GHC.Cmm.Utils
import GHC.Cmm.Switch
-- Utils
-import CPrim
+import GHC.CmmToAsm.CPrim
import GHC.Driver.Session
import FastString
import Outputable
diff --git a/compiler/GHC/Driver/CodeOutput.hs b/compiler/GHC/Driver/CodeOutput.hs
index e52d3216d5..507311c039 100644
--- a/compiler/GHC/Driver/CodeOutput.hs
+++ b/compiler/GHC/Driver/CodeOutput.hs
@@ -12,7 +12,7 @@ module GHC.Driver.CodeOutput ( codeOutput, outputForeignStubs ) where
import GhcPrelude
-import AsmCodeGen ( nativeCodeGen )
+import GHC.CmmToAsm ( nativeCodeGen )
import GHC.CmmToLlvm ( llvmCodeGen )
import UniqSupply ( mkSplitUniqSupply )
diff --git a/compiler/GHC/Llvm/Types.hs b/compiler/GHC/Llvm/Types.hs
index f8ca0c826c..3e320634d0 100644
--- a/compiler/GHC/Llvm/Types.hs
+++ b/compiler/GHC/Llvm/Types.hs
@@ -20,7 +20,7 @@ import Outputable
import Unique
-- from NCG
-import PprBase
+import GHC.CmmToAsm.Ppr
import GHC.Float
diff --git a/compiler/GHC/Platform/Reg.hs b/compiler/GHC/Platform/Reg.hs
new file mode 100644
index 0000000000..b856d7c3af
--- /dev/null
+++ b/compiler/GHC/Platform/Reg.hs
@@ -0,0 +1,241 @@
+-- | An architecture independent description of a register.
+-- This needs to stay architecture independent because it is used
+-- by NCGMonad and the register allocators, which are shared
+-- by all architectures.
+--
+module GHC.Platform.Reg (
+ RegNo,
+ Reg(..),
+ regPair,
+ regSingle,
+ isRealReg, takeRealReg,
+ isVirtualReg, takeVirtualReg,
+
+ VirtualReg(..),
+ renameVirtualReg,
+ classOfVirtualReg,
+ getHiVirtualRegFromLo,
+ getHiVRegFromLo,
+
+ RealReg(..),
+ regNosOfRealReg,
+ realRegsAlias,
+
+ liftPatchFnToRegReg
+)
+
+where
+
+import GhcPrelude
+
+import Outputable
+import Unique
+import GHC.Platform.Reg.Class
+import Data.List (intersect)
+
+-- | An identifier for a primitive real machine register.
+type RegNo
+ = Int
+
+-- VirtualRegs are virtual registers. The register allocator will
+-- eventually have to map them into RealRegs, or into spill slots.
+--
+-- VirtualRegs are allocated on the fly, usually to represent a single
+-- value in the abstract assembly code (i.e. dynamic registers are
+-- usually single assignment).
+--
+-- The single assignment restriction isn't necessary to get correct code,
+-- although a better register allocation will result if single
+-- assignment is used -- because the allocator maps a VirtualReg into
+-- a single RealReg, even if the VirtualReg has multiple live ranges.
+--
+-- Virtual regs can be of either class, so that info is attached.
+--
+data VirtualReg
+ = VirtualRegI {-# UNPACK #-} !Unique
+ | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
+ | VirtualRegF {-# UNPACK #-} !Unique
+ | VirtualRegD {-# UNPACK #-} !Unique
+
+ deriving (Eq, Show)
+
+-- This is laborious, but necessary. We can't derive Ord because
+-- Unique doesn't have an Ord instance. Note nonDetCmpUnique in the
+-- implementation. See Note [No Ord for Unique]
+-- This is non-deterministic but we do not currently support deterministic
+-- code-generation. See Note [Unique Determinism and code generation]
+instance Ord VirtualReg where
+ compare (VirtualRegI a) (VirtualRegI b) = nonDetCmpUnique a b
+ compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
+ compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
+ compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
+
+ compare VirtualRegI{} _ = LT
+ compare _ VirtualRegI{} = GT
+ compare VirtualRegHi{} _ = LT
+ compare _ VirtualRegHi{} = GT
+ compare VirtualRegF{} _ = LT
+ compare _ VirtualRegF{} = GT
+
+
+
+instance Uniquable VirtualReg where
+ getUnique reg
+ = case reg of
+ VirtualRegI u -> u
+ VirtualRegHi u -> u
+ VirtualRegF u -> u
+ VirtualRegD u -> u
+
+instance Outputable VirtualReg where
+ ppr reg
+ = case reg of
+ VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
+ VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
+ -- this code is kinda wrong on x86
+ -- because float and double occupy the same register set
+ -- namely SSE2 register xmm0 .. xmm15
+ VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
+ VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
+
+
+
+renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
+renameVirtualReg u r
+ = case r of
+ VirtualRegI _ -> VirtualRegI u
+ VirtualRegHi _ -> VirtualRegHi u
+ VirtualRegF _ -> VirtualRegF u
+ VirtualRegD _ -> VirtualRegD u
+
+
+classOfVirtualReg :: VirtualReg -> RegClass
+classOfVirtualReg vr
+ = case vr of
+ VirtualRegI{} -> RcInteger
+ VirtualRegHi{} -> RcInteger
+ VirtualRegF{} -> RcFloat
+ VirtualRegD{} -> RcDouble
+
+
+
+-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
+-- when supplied with the vreg for the lower-half of the quantity.
+-- (NB. Not reversible).
+getHiVirtualRegFromLo :: VirtualReg -> VirtualReg
+getHiVirtualRegFromLo reg
+ = case reg of
+ -- makes a pseudo-unique with tag 'H'
+ VirtualRegI u -> VirtualRegHi (newTagUnique u 'H')
+ _ -> panic "Reg.getHiVirtualRegFromLo"
+
+getHiVRegFromLo :: Reg -> Reg
+getHiVRegFromLo reg
+ = case reg of
+ RegVirtual vr -> RegVirtual (getHiVirtualRegFromLo vr)
+ RegReal _ -> panic "Reg.getHiVRegFromLo"
+
+
+------------------------------------------------------------------------------------
+-- | RealRegs are machine regs which are available for allocation, in
+-- the usual way. We know what class they are, because that's part of
+-- the processor's architecture.
+--
+-- RealRegPairs are pairs of real registers that are allocated together
+-- to hold a larger value, such as with Double regs on SPARC.
+--
+data RealReg
+ = RealRegSingle {-# UNPACK #-} !RegNo
+ | RealRegPair {-# UNPACK #-} !RegNo {-# UNPACK #-} !RegNo
+ deriving (Eq, Show, Ord)
+
+instance Uniquable RealReg where
+ getUnique reg
+ = case reg of
+ RealRegSingle i -> mkRegSingleUnique i
+ RealRegPair r1 r2 -> mkRegPairUnique (r1 * 65536 + r2)
+
+instance Outputable RealReg where
+ ppr reg
+ = case reg of
+ RealRegSingle i -> text "%r" <> int i
+ RealRegPair r1 r2 -> text "%r(" <> int r1
+ <> vbar <> int r2 <> text ")"
+
+regNosOfRealReg :: RealReg -> [RegNo]
+regNosOfRealReg rr
+ = case rr of
+ RealRegSingle r1 -> [r1]
+ RealRegPair r1 r2 -> [r1, r2]
+
+
+realRegsAlias :: RealReg -> RealReg -> Bool
+realRegsAlias rr1 rr2
+ = not $ null $ intersect (regNosOfRealReg rr1) (regNosOfRealReg rr2)
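+-- For illustration:
+--   realRegsAlias (RealRegSingle 1) (RealRegPair 1 2)  == True
+--   realRegsAlias (RealRegSingle 0) (RealRegSingle 1)  == False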
+
+--------------------------------------------------------------------------------
+-- | A register, either virtual or real
+data Reg
+ = RegVirtual !VirtualReg
+ | RegReal !RealReg
+ deriving (Eq, Ord)
+
+regSingle :: RegNo -> Reg
+regSingle regNo = RegReal $ RealRegSingle regNo
+
+regPair :: RegNo -> RegNo -> Reg
+regPair regNo1 regNo2 = RegReal $ RealRegPair regNo1 regNo2
+
+
+-- We like to have Uniques for Reg so that we can make UniqFM and UniqSets
+-- in the register allocator.
+instance Uniquable Reg where
+ getUnique reg
+ = case reg of
+ RegVirtual vr -> getUnique vr
+ RegReal rr -> getUnique rr
+
+-- | Print a reg in a generic manner
+-- If you want the architecture specific names, then use the pprReg
+-- function from the appropriate Ppr module.
+instance Outputable Reg where
+ ppr reg
+ = case reg of
+ RegVirtual vr -> ppr vr
+ RegReal rr -> ppr rr
+
+
+isRealReg :: Reg -> Bool
+isRealReg reg
+ = case reg of
+ RegReal _ -> True
+ RegVirtual _ -> False
+
+takeRealReg :: Reg -> Maybe RealReg
+takeRealReg reg
+ = case reg of
+ RegReal rr -> Just rr
+ _ -> Nothing
+
+
+isVirtualReg :: Reg -> Bool
+isVirtualReg reg
+ = case reg of
+ RegReal _ -> False
+ RegVirtual _ -> True
+
+takeVirtualReg :: Reg -> Maybe VirtualReg
+takeVirtualReg reg
+ = case reg of
+ RegReal _ -> Nothing
+ RegVirtual vr -> Just vr
+
+
+-- | The patch function supplied by the allocator maps VirtualReg to RealReg
+-- regs, but sometimes we want to apply it to plain old Reg.
+--
+liftPatchFnToRegReg :: (VirtualReg -> RealReg) -> (Reg -> Reg)
+liftPatchFnToRegReg patchF reg
+ = case reg of
+ RegVirtual vr -> RegReal (patchF vr)
+ RegReal _ -> reg
diff --git a/compiler/GHC/Platform/Reg/Class.hs b/compiler/GHC/Platform/Reg/Class.hs
new file mode 100644
index 0000000000..225ad05be5
--- /dev/null
+++ b/compiler/GHC/Platform/Reg/Class.hs
@@ -0,0 +1,32 @@
+-- | An architecture independent description of a register's class.
+module GHC.Platform.Reg.Class
+ ( RegClass (..) )
+
+where
+
+import GhcPrelude
+
+import Outputable
+import Unique
+
+
+-- | The class of a register.
+-- Used in the register allocator.
+-- We treat all registers in a class as being interchangeable.
+--
+data RegClass
+ = RcInteger
+ | RcFloat
+ | RcDouble
+ deriving Eq
+
+
+instance Uniquable RegClass where
+ getUnique RcInteger = mkRegClassUnique 0
+ getUnique RcFloat = mkRegClassUnique 1
+ getUnique RcDouble = mkRegClassUnique 2
+
+instance Outputable RegClass where
+ ppr RcInteger = Outputable.text "I"
+ ppr RcFloat = Outputable.text "F"
+ ppr RcDouble = Outputable.text "D"
diff --git a/compiler/GHC/Platform/Regs.hs b/compiler/GHC/Platform/Regs.hs
index 51f7658db2..d214b0d89f 100644
--- a/compiler/GHC/Platform/Regs.hs
+++ b/compiler/GHC/Platform/Regs.hs
@@ -7,7 +7,7 @@ import GhcPrelude
import GHC.Cmm.Expr
import GHC.Platform
-import Reg
+import GHC.Platform.Reg
import qualified GHC.Platform.ARM as ARM
import qualified GHC.Platform.ARM64 as ARM64