author    Sylvain Henry <sylvain@haskus.fr>  2020-02-22 15:05:20 +0100
committer Marge Bot <ben+marge-bot@smart-cactus.org>  2020-02-24 20:55:25 -0500
commit    1b1067d14b656bbbfa7c47f156ec2700c9751549 (patch)
tree      32346e3c4c3f89117190b36364144d85dc260e05 /compiler/GHC/CmmToAsm/X86
parent    354e2787be08fb6d973de1a39e58080ff8e107f8 (diff)
Modules: CmmToAsm (#13009)
Diffstat (limited to 'compiler/GHC/CmmToAsm/X86')
-rw-r--r--  compiler/GHC/CmmToAsm/X86/CodeGen.hs  3747
-rw-r--r--  compiler/GHC/CmmToAsm/X86/Cond.hs      109
-rw-r--r--  compiler/GHC/CmmToAsm/X86/Instr.hs    1056
-rw-r--r--  compiler/GHC/CmmToAsm/X86/Ppr.hs      1014
-rw-r--r--  compiler/GHC/CmmToAsm/X86/RegInfo.hs     73
-rw-r--r--  compiler/GHC/CmmToAsm/X86/Regs.hs      442
6 files changed, 6441 insertions, 0 deletions
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
new file mode 100644
index 0000000000..f3b20c19e1
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -0,0 +1,3747 @@
+{-# LANGUAGE CPP, GADTs, NondecreasingIndentation #-}
+{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE BangPatterns #-}
+
+#if __GLASGOW_HASKELL__ <= 808
+-- GHC 8.10 deprecates this flag, but GHC 8.8 needs it
+-- The default iteration limit is a bit too low for the definitions
+-- in this module.
+{-# OPTIONS_GHC -fmax-pmcheck-iterations=10000000 #-}
+#endif
+
+{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
+
+-----------------------------------------------------------------------------
+--
+-- Generating machine code (instruction selection)
+--
+-- (c) The University of Glasgow 1996-2004
+--
+-----------------------------------------------------------------------------
+
+-- This is a big module, but, if you pay attention to
+-- (a) the sectioning, and (b) the type signatures, the
+-- structure should not be too overwhelming.
+
+module GHC.CmmToAsm.X86.CodeGen (
+ cmmTopCodeGen,
+ generateJumpTableForInstr,
+ extractUnwindPoints,
+ invertCondBranches,
+ InstrBlock
+)
+
+where
+
+#include "HsVersions.h"
+
+-- NCG stuff:
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Ppr ( )
+import GHC.CmmToAsm.X86.RegInfo
+
+import GHC.Platform.Regs
+import GHC.CmmToAsm.CPrim
+import GHC.Cmm.DebugBlock
+ ( DebugBlock(..), UnwindPoint(..), UnwindTable
+ , UnwindExpr(UwReg), toUnwindExpr
+ )
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.PIC
+import GHC.CmmToAsm.Monad
+ ( NatM, getNewRegNat, getNewLabelNat, setDeltaNat
+ , getDeltaNat, getBlockIdNat, getPicBaseNat, getNewRegPairNat
+ , getPicBaseMaybeNat, getDebugBlock, getFileId
+ , addImmediateSuccessorNat, updateCfgNat
+ )
+import GHC.CmmToAsm.CFG
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.Platform
+
+-- Our intermediate code:
+import BasicTypes
+import GHC.Cmm.BlockId
+import Module ( primUnitId )
+import GHC.Cmm.Utils
+import GHC.Cmm.Switch
+import GHC.Cmm
+import GHC.Cmm.Dataflow.Block
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Graph
+import GHC.Cmm.Dataflow.Label
+import GHC.Cmm.CLabel
+import CoreSyn ( Tickish(..) )
+import SrcLoc ( srcSpanFile, srcSpanStartLine, srcSpanStartCol )
+
+-- The rest:
+import ForeignCall ( CCallConv(..) )
+import OrdList
+import Outputable
+import FastString
+import GHC.Driver.Session
+import Util
+import UniqSupply ( getUniqueM )
+
+import Control.Monad
+import Data.Bits
+import Data.Foldable (fold)
+import Data.Int
+import Data.Maybe
+import Data.Word
+
+import qualified Data.Map as M
+
+is32BitPlatform :: NatM Bool
+is32BitPlatform = do
+ dflags <- getDynFlags
+ return $ target32Bit (targetPlatform dflags)
+
+sse2Enabled :: NatM Bool
+sse2Enabled = do
+ dflags <- getDynFlags
+ case platformArch (targetPlatform dflags) of
+ -- We assume SSE1 and SSE2 operations are available on both
+ -- x86 and x86_64. Historically we didn't default to SSE1/SSE2
+ -- on x86, which resulted in de facto nondeterminism in the
+ -- rounding of the x87 floating point instructions used instead:
+ -- variations in the spill/FPU-stack placement of operands would
+ -- change the intermediate precision, and hence the final result,
+ -- of what would otherwise be the same single- or double-precision
+ -- IEEE floating point computation.
+ ArchX86_64 -> return True
+ ArchX86 -> return True
+ _ -> panic "trying to generate x86/x86_64 on the wrong platform"
+
+
+sse4_2Enabled :: NatM Bool
+sse4_2Enabled = do
+ dflags <- getDynFlags
+ return (isSse4_2Enabled dflags)
+
+
+cmmTopCodeGen
+ :: RawCmmDecl
+ -> NatM [NatCmmDecl (Alignment, RawCmmStatics) Instr]
+
+cmmTopCodeGen (CmmProc info lab live graph) = do
+ let blocks = toBlockListEntryFirst graph
+ (nat_blocks,statics) <- mapAndUnzipM basicBlockCodeGen blocks
+ picBaseMb <- getPicBaseMaybeNat
+ dflags <- getDynFlags
+ let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
+ tops = proc : concat statics
+ os = platformOS $ targetPlatform dflags
+
+ case picBaseMb of
+ Just picBase -> initializePicBase_x86 ArchX86 os picBase tops
+ Nothing -> return tops
+
+cmmTopCodeGen (CmmData sec dat) = do
+ return [CmmData sec (mkAlignment 1, dat)] -- no translation, we just use CmmStatic
+
+{- Note [Verifying basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ We want to guarantee a few things about the results
+ of instruction selection.
+
+ Namely that each basic block consists of:
+ * A (potentially empty) sequence of straight line instructions
+ followed by
+ * A (potentially empty) sequence of jump like instructions.
+
+ We can verify this by going through the instructions and
+ making sure that any non-jumpish instruction can't appear
+ after a jumpish instruction.
+
+ There are gotchas however:
+ * CALLs are, strictly speaking, control flow, but we do not care
+ about them here. Hence we treat them as regular instructions.
+
+ It's safe for them to appear inside a basic block
+ as (ignoring side effects inside the call) they will result in
+ straight line code.
+
+ * NEWBLOCK marks the start of a new basic block so can
+ be followed by any instructions.
+-}
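+
+{- A sketch of what the checker accepts and rejects (instruction
+   operands elided; shapes only):
+
+     ok:     [ NEWBLOCK lbl, MOV .., ADD .., JXX .., JMP .. ]
+     faulty: [ NEWBLOCK lbl, MOV .., JMP .., ADD .. ]
+               -- straight-line code after a jumpish instruction
+-}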
+
+-- Verifying basic blocks is cheap, but not cheap enough to enable it unconditionally.
+verifyBasicBlock :: [Instr] -> ()
+verifyBasicBlock instrs
+ | debugIsOn = go False instrs
+ | otherwise = ()
+ where
+ go _ [] = ()
+ go atEnd (i:instr)
+ = case i of
+ -- Start a new basic block
+ NEWBLOCK {} -> go False instr
+ -- Calls are not viable block terminators
+ CALL {} | atEnd -> faultyBlockWith i
+ | not atEnd -> go atEnd instr
+ -- All instructions ok, check if we reached the end and continue.
+ _ | not atEnd -> go (isJumpishInstr i) instr
+ -- Only jumps allowed at the end of basic blocks.
+ | otherwise -> if isJumpishInstr i
+ then go True instr
+ else faultyBlockWith i
+ faultyBlockWith i
+ = pprPanic "Non control flow instructions after end of basic block."
+ (ppr i <+> text "in:" $$ vcat (map ppr instrs))
+
+basicBlockCodeGen
+ :: CmmBlock
+ -> NatM ( [NatBasicBlock Instr]
+ , [NatCmmDecl (Alignment, RawCmmStatics) Instr])
+
+basicBlockCodeGen block = do
+ let (_, nodes, tail) = blockSplit block
+ id = entryLabel block
+ stmts = blockToList nodes
+ -- Generate location directive
+ dbg <- getDebugBlock (entryLabel block)
+ loc_instrs <- case dblSourceTick =<< dbg of
+ Just (SourceNote span name)
+ -> do fileId <- getFileId (srcSpanFile span)
+ let line = srcSpanStartLine span; col = srcSpanStartCol span
+ return $ unitOL $ LOCATION fileId line col name
+ _ -> return nilOL
+ (mid_instrs,mid_bid) <- stmtsToInstrs id stmts
+ (!tail_instrs,_) <- stmtToInstrs mid_bid tail
+ let instrs = loc_instrs `appOL` mid_instrs `appOL` tail_instrs
+ return $! verifyBasicBlock (fromOL instrs)
+ instrs' <- fold <$> traverse addSpUnwindings instrs
+ -- code generation may introduce new basic block boundaries, which
+ -- are indicated by the NEWBLOCK instruction. We must split up the
+ -- instruction stream into basic blocks again. Also, we extract
+ -- LDATAs here too.
+ let
+ (top,other_blocks,statics) = foldrOL mkBlocks ([],[],[]) instrs'
+
+ mkBlocks (NEWBLOCK id) (instrs,blocks,statics)
+ = ([], BasicBlock id instrs : blocks, statics)
+ mkBlocks (LDATA sec dat) (instrs,blocks,statics)
+ = (instrs, blocks, CmmData sec dat:statics)
+ mkBlocks instr (instrs,blocks,statics)
+ = (instr:instrs, blocks, statics)
+ return (BasicBlock id top : other_blocks, statics)
+
+-- | Convert 'DELTA' instructions into 'UNWIND' instructions to capture changes
+-- in the @sp@ register. See Note [What is this unwinding business?] in Debug
+-- for details.
+addSpUnwindings :: Instr -> NatM (OrdList Instr)
+addSpUnwindings instr@(DELTA d) = do
+ dflags <- getDynFlags
+ if debugLevel dflags >= 1
+ then do lbl <- mkAsmTempLabel <$> getUniqueM
+ let unwind = M.singleton MachSp (Just $ UwReg MachSp $ negate d)
+ return $ toOL [ instr, UNWIND lbl unwind ]
+ else return (unitOL instr)
+addSpUnwindings instr = return $ unitOL instr
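+
+-- A sketch of the effect, with an invented label name: at -g1 or
+-- higher, a stack adjustment @DELTA (-8)@ comes out as
+--
+--   [ DELTA (-8), UNWIND lbl (MachSp -> UwReg MachSp 8) ]
+--
+-- i.e. the old Sp is recovered as the current Sp plus 8 bytes.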
+
+{- Note [Keeping track of the current block]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When generating instructions for Cmm we sometimes require
+the current block for things like retry loops.
+
+We also sometimes change the current block, if a MachOP
+results in branching control flow.
+
+Issues arise if we have two statements in the same block,
+which both depend on the current block id *and* change the
+basic block after them. This happens for atomic primops
+in the X86 backend where we want to update the CFG data structure
+when introducing new basic blocks.
+
+For example in #17334 we got this Cmm code:
+
+ c3Bf: // global
+ (_s3t1::I64) = call MO_AtomicRMW W64 AMO_And(_s3sQ::P64 + 88, 18);
+ (_s3t4::I64) = call MO_AtomicRMW W64 AMO_Or(_s3sQ::P64 + 88, 0);
+ _s3sT::I64 = _s3sV::I64;
+ goto c3B1;
+
+This resulted in two new basic blocks being inserted:
+
+ c3Bf:
+ movl $18,%vI_n3Bo
+ movq 88(%vI_s3sQ),%rax
+ jmp _n3Bp
+ n3Bp:
+ ...
+ cmpxchgq %vI_n3Bq,88(%vI_s3sQ)
+ jne _n3Bp
+ ...
+ jmp _n3Bs
+ n3Bs:
+ ...
+ cmpxchgq %vI_n3Bt,88(%vI_s3sQ)
+ jne _n3Bs
+ ...
+ jmp _c3B1
+ ...
+
+Based on the Cmm, we called stmtToInstrs and translated both atomic operations
+under the assumption that they would be placed into their Cmm basic block `c3Bf`.
+However, the retry loops introduce new labels, so this assumption does not
+hold for the second statement.
+This resulted in a desync between the explicit control flow graph
+we construct as a separate data type and the actual control flow graph in the code.
+
+Instead we now return the new basic block if a statement causes a change
+in the current block, and use that block for all following statements.
+
+For this reason genCCall is also split into two parts.
+One for calls which *won't* change the basic blocks in
+which successive instructions will be placed.
+A different one for calls which *are* known to change the
+basic block.
+
+-}
+
+-- See Note [Keeping track of the current block] for why
+-- we pass the BlockId.
+stmtsToInstrs :: BlockId -- ^ Basic block these statements will start to be placed in.
+ -> [CmmNode O O] -- ^ Cmm statements
+ -> NatM (InstrBlock, BlockId) -- ^ Resulting instructions and final block id
+stmtsToInstrs bid stmts =
+ go bid stmts nilOL
+ where
+ go bid [] instrs = return (instrs,bid)
+ go bid (s:stmts) instrs = do
+ (instrs',bid') <- stmtToInstrs bid s
+ -- If the statement introduced a new block, we use that one
+ let !newBid = fromMaybe bid bid'
+ go newBid stmts (instrs `appOL` instrs')
+
+-- | `bid` refers to the current block and is used to update the CFG
+-- if new blocks are inserted in the control flow.
+-- See Note [Keeping track of the current block] for more details.
+stmtToInstrs :: BlockId -- ^ Basic block this statement will start to be placed in.
+ -> CmmNode e x
+ -> NatM (InstrBlock, Maybe BlockId)
+ -- ^ Instructions, and bid of new block if successive
+ -- statements are placed in a different basic block.
+stmtToInstrs bid stmt = do
+ dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ case stmt of
+ CmmUnsafeForeignCall target result_regs args
+ -> genCCall dflags is32Bit target result_regs args bid
+
+ _ -> (,Nothing) <$> case stmt of
+ CmmComment s -> return (unitOL (COMMENT s))
+ CmmTick {} -> return nilOL
+
+ CmmUnwind regs -> do
+ let to_unwind_entry :: (GlobalReg, Maybe CmmExpr) -> UnwindTable
+ to_unwind_entry (reg, expr) = M.singleton reg (fmap toUnwindExpr expr)
+ case foldMap to_unwind_entry regs of
+ tbl | M.null tbl -> return nilOL
+ | otherwise -> do
+ lbl <- mkAsmTempLabel <$> getUniqueM
+ return $ unitOL $ UNWIND lbl tbl
+
+ CmmAssign reg src
+ | isFloatType ty -> assignReg_FltCode format reg src
+ | is32Bit && isWord64 ty -> assignReg_I64Code reg src
+ | otherwise -> assignReg_IntCode format reg src
+ where ty = cmmRegType dflags reg
+ format = cmmTypeFormat ty
+
+ CmmStore addr src
+ | isFloatType ty -> assignMem_FltCode format addr src
+ | is32Bit && isWord64 ty -> assignMem_I64Code addr src
+ | otherwise -> assignMem_IntCode format addr src
+ where ty = cmmExprType dflags src
+ format = cmmTypeFormat ty
+
+ CmmBranch id -> return $ genBranch id
+
+ -- We try to arrange blocks such that the likely branch is the fallthrough
+ -- in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
+ CmmCondBranch arg true false _ -> genCondBranch bid true false arg
+ CmmSwitch arg ids -> do dflags <- getDynFlags
+ genSwitch dflags arg ids
+ CmmCall { cml_target = arg
+ , cml_args_regs = gregs } -> do
+ dflags <- getDynFlags
+ genJump arg (jumpRegs dflags gregs)
+ _ ->
+ panic "stmtToInstrs: statement should have been cps'd away"
+
+
+jumpRegs :: DynFlags -> [GlobalReg] -> [Reg]
+jumpRegs dflags gregs = [ RegReal r | Just r <- map (globalRegMaybe platform) gregs ]
+ where platform = targetPlatform dflags
+
+--------------------------------------------------------------------------------
+-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
+-- They are really trees of insns to facilitate fast appending, where a
+-- left-to-right traversal yields the insns in the correct order.
+--
+type InstrBlock
+ = OrdList Instr
+
+
+-- | Condition codes passed up the tree.
+--
+data CondCode
+ = CondCode Bool Cond InstrBlock
+
+
+-- | a.k.a. "Register64": Reg is the lower 32-bit temporary which contains
+-- the result. Use getHiVRegFromLo to find the other VRegUnique.
+--
+-- The rules of this simplified insn selection game are therefore that
+-- the returned Reg may be modified.
+--
+data ChildCode64
+ = ChildCode64
+ InstrBlock
+ Reg
+
+
+-- | Registers passed up the tree. If the stix code forces the register
+-- to live in a pre-decided machine register, it comes out as @Fixed@;
+-- otherwise, it comes out as @Any@, and the parent can decide which
+-- register to put it in.
+--
+data Register
+ = Fixed Format Reg InstrBlock
+ | Any Format (Reg -> InstrBlock)
+
+
+swizzleRegisterRep :: Register -> Format -> Register
+swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code
+swizzleRegisterRep (Any _ codefn) format = Any format codefn
+
+
+-- | Grab the Reg for a CmmReg
+getRegisterReg :: Platform -> CmmReg -> Reg
+
+getRegisterReg _ (CmmLocal (LocalReg u pk))
+ = -- by assuming SSE2, Int, Word, Float and Double can all be register allocated
+ let fmt = cmmTypeFormat pk in
+ RegVirtual (mkVirtualReg u fmt)
+
+getRegisterReg platform (CmmGlobal mid)
+ = case globalRegMaybe platform mid of
+ Just reg -> RegReal $ reg
+ Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
+ -- By this stage, the only MagicIds remaining should be the
+ -- ones which map to a real machine register on this
+ -- platform. Hence ...
+
+
+-- | Memory addressing modes passed up the tree.
+data Amode
+ = Amode AddrMode InstrBlock
+
+{-
+Now, given a tree (the argument to a CmmLoad) that references memory,
+produce a suitable addressing mode.
+
+A Rule of the Game (tm) for Amodes: use of the addr bit must
+immediately follow use of the code part, since the code part puts
+values in registers which the addr then refers to. So you can't put
+anything in between, lest it overwrite some of those registers. If
+you need to do some other computation between the code part and use of
+the addr bit, first store the effective address from the amode in a
+temporary, then do the other computation, and then use the temporary:
+
+ code
+ LEA amode, tmp
+ ... other computation ...
+ ... (tmp) ...
+-}
+
+
+-- | Check whether an integer will fit in 32 bits.
+-- A CmmInt is intended to be truncated to the appropriate
+-- number of bits, so here we truncate it to Int64. This is
+-- important because e.g. -1 as a CmmInt might be either
+-- -1 or 18446744073709551615.
+--
+is32BitInteger :: Integer -> Bool
+is32BitInteger i = i64 <= 0x7fffffff && i64 >= -0x80000000
+ where i64 = fromIntegral i :: Int64
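+
+-- A few illustrative cases (a sketch of the intended behaviour):
+--
+--   is32BitInteger (-1)                 == True   -- truncates to -1 :: Int64
+--   is32BitInteger 18446744073709551615 == True   -- same bit pattern as -1
+--   is32BitInteger 0x80000000           == False  -- needs 33 signed bits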
+
+
+-- | Convert a BlockId to some CmmStatic data
+jumpTableEntry :: DynFlags -> Maybe BlockId -> CmmStatic
+jumpTableEntry dflags Nothing = CmmStaticLit (CmmInt 0 (wordWidth dflags))
+jumpTableEntry _ (Just blockid) = CmmStaticLit (CmmLabel blockLabel)
+ where blockLabel = blockLbl blockid
+
+
+-- -----------------------------------------------------------------------------
+-- General things for putting together code sequences
+
+-- Expand CmmRegOff. ToDo: should we do it this way around, or convert
+-- CmmExprs into CmmRegOff?
+mangleIndexTree :: DynFlags -> CmmReg -> Int -> CmmExpr
+mangleIndexTree dflags reg off
+ = CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
+ where width = typeWidth (cmmRegType dflags reg)
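+
+-- For example (a sketch): with a W64 register r, @CmmRegOff r 8@
+-- expands to @CmmMachOp (MO_Add W64) [CmmReg r, CmmLit (CmmInt 8 W64)]@.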
+
+-- | The dual to getAnyReg: compute an expression into a register, but
+-- we don't mind which one it is.
+getSomeReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getSomeReg expr = do
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed _ reg code ->
+ return (reg, code)
+
+
+assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock
+assignMem_I64Code addrTree valueTree = do
+ Amode addr addr_code <- getAmode addrTree
+ ChildCode64 vcode rlo <- iselExpr64 valueTree
+ let
+ rhi = getHiVRegFromLo rlo
+
+ -- Little-endian store
+ mov_lo = MOV II32 (OpReg rlo) (OpAddr addr)
+ mov_hi = MOV II32 (OpReg rhi) (OpAddr (fromJust (addrOffset addr 4)))
+ return (vcode `appOL` addr_code `snocOL` mov_lo `snocOL` mov_hi)
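+
+-- A sketch of the output (AT&T syntax, virtual register names
+-- invented): storing the pair (vlo,vhi) to address @a@ emits
+--
+--   movl %vlo,  (a)       -- low word at the lower address
+--   movl %vhi, 4(a)       -- high word 4 bytes up (little-endian)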
+
+
+assignReg_I64Code :: CmmReg -> CmmExpr -> NatM InstrBlock
+assignReg_I64Code (CmmLocal (LocalReg u_dst _)) valueTree = do
+ ChildCode64 vcode r_src_lo <- iselExpr64 valueTree
+ let
+ r_dst_lo = RegVirtual $ mkVirtualReg u_dst II32
+ r_dst_hi = getHiVRegFromLo r_dst_lo
+ r_src_hi = getHiVRegFromLo r_src_lo
+ mov_lo = MOV II32 (OpReg r_src_lo) (OpReg r_dst_lo)
+ mov_hi = MOV II32 (OpReg r_src_hi) (OpReg r_dst_hi)
+ return (
+ vcode `snocOL` mov_lo `snocOL` mov_hi
+ )
+
+assignReg_I64Code _ _
+ = panic "assignReg_I64Code(i386): invalid lvalue"
+
+
+iselExpr64 :: CmmExpr -> NatM ChildCode64
+iselExpr64 (CmmLit (CmmInt i _)) = do
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ code = toOL [
+ MOV II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpImm (ImmInteger q)) (OpReg rhi)
+ ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmLoad addrTree ty) | isWord64 ty = do
+ Amode addr addr_code <- getAmode addrTree
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ mov_lo = MOV II32 (OpAddr addr) (OpReg rlo)
+ mov_hi = MOV II32 (OpAddr (fromJust (addrOffset addr 4))) (OpReg rhi)
+ return (
+ ChildCode64 (addr_code `snocOL` mov_lo `snocOL` mov_hi)
+ rlo
+ )
+
+iselExpr64 (CmmReg (CmmLocal (LocalReg vu ty))) | isWord64 ty
+ = return (ChildCode64 nilOL (RegVirtual $ mkVirtualReg vu II32))
+
+-- we handle addition, but rather badly
+iselExpr64 (CmmMachOp (MO_Add _) [e1, CmmLit (CmmInt i _)]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r = fromIntegral (fromIntegral i :: Word32)
+ q = fromIntegral (fromIntegral (i `shiftR` 32) :: Word32)
+ r1hi = getHiVRegFromLo r1lo
+ code = code1 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpImm (ImmInteger r)) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpImm (ImmInteger q)) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_Add _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ ADD II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ ADC II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
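+
+-- So a 64-bit add on i386 comes out as an add/adc pair over the two
+-- halves, roughly (a sketch, register names invented):
+--
+--   movl %r1lo, %rlo
+--   addl %r2lo, %rlo      -- low word; sets the carry flag
+--   movl %r1hi, %rhi
+--   adcl %r2hi, %rhi      -- high word plus carry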
+
+iselExpr64 (CmmMachOp (MO_Sub _) [e1,e2]) = do
+ ChildCode64 code1 r1lo <- iselExpr64 e1
+ ChildCode64 code2 r2lo <- iselExpr64 e2
+ (rlo,rhi) <- getNewRegPairNat II32
+ let
+ r1hi = getHiVRegFromLo r1lo
+ r2hi = getHiVRegFromLo r2lo
+ code = code1 `appOL`
+ code2 `appOL`
+ toOL [ MOV II32 (OpReg r1lo) (OpReg rlo),
+ SUB II32 (OpReg r2lo) (OpReg rlo),
+ MOV II32 (OpReg r1hi) (OpReg rhi),
+ SBB II32 (OpReg r2hi) (OpReg rhi) ]
+ return (ChildCode64 code rlo)
+
+iselExpr64 (CmmMachOp (MO_UU_Conv _ W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpImm (ImmInt 0)) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 (CmmMachOp (MO_SS_Conv W32 W64) [expr]) = do
+ fn <- getAnyReg expr
+ r_dst_lo <- getNewRegNat II32
+ let r_dst_hi = getHiVRegFromLo r_dst_lo
+ code = fn r_dst_lo
+ return (
+ ChildCode64 (code `snocOL`
+ MOV II32 (OpReg r_dst_lo) (OpReg eax) `snocOL`
+ CLTD II32 `snocOL`
+ MOV II32 (OpReg eax) (OpReg r_dst_lo) `snocOL`
+ MOV II32 (OpReg edx) (OpReg r_dst_hi))
+ r_dst_lo
+ )
+
+iselExpr64 expr
+ = pprPanic "iselExpr64(i386)" (ppr expr)
+
+
+--------------------------------------------------------------------------------
+getRegister :: CmmExpr -> NatM Register
+getRegister e = do dflags <- getDynFlags
+ is32Bit <- is32BitPlatform
+ getRegister' dflags is32Bit e
+
+getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register
+
+getRegister' dflags is32Bit (CmmReg reg)
+ = case reg of
+ CmmGlobal PicBaseReg
+ | is32Bit ->
+ -- on x86_64, we have %rip for PicBaseReg, but it's not
+ -- a full-featured register, it can only be used for
+ -- rip-relative addressing.
+ do reg' <- getPicBaseNat (archWordFormat is32Bit)
+ return (Fixed (archWordFormat is32Bit) reg' nilOL)
+ _ ->
+ do
+ let format = cmmTypeFormat (cmmRegType dflags reg)
+ let platform = targetPlatform dflags
+ return (Fixed format
+ (getRegisterReg platform reg)
+ nilOL)
+
+
+getRegister' dflags is32Bit (CmmRegOff r n)
+ = getRegister' dflags is32Bit $ mangleIndexTree dflags r n
+
+getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e])
+ = addAlignmentCheck align <$> getRegister' dflags is32Bit e
+
+-- for 32-bit architectures, support some 64 -> 32 bit conversions:
+-- TO_W_(x), TO_W_(x >> 32)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32)
+ [CmmMachOp (MO_U_Shr W64) [x,CmmLit (CmmInt 32 _)]])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 (getHiVRegFromLo rlo) code
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
+ | is32Bit = do
+ ChildCode64 code rlo <- iselExpr64 x
+ return $ Fixed II32 rlo code
+
+getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
+ float_const_sse2 where
+ float_const_sse2
+ | f == 0.0 = do
+ let
+ format = floatFormat w
+ code dst = unitOL (XOR format (OpReg dst) (OpReg dst))
+ -- I don't know why there are xorpd, xorps, and pxor instructions.
+ -- They all appear to do the same thing --SDM
+ return (Any format code)
+
+ | otherwise = do
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ loadFloatAmode w addr code
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W8 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_UU_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II32 code)
+
+getRegister' _ _ (CmmMachOp (MO_SS_Conv W16 W32) [CmmLoad addr _]) = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II32 code)
+
+-- catch simple cases of zero- or sign-extended load
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W8 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II8) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVZxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W16 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II16) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_UU_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOV II32) addr -- 32-bit loads zero-extend
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W32 W64) [CmmLoad addr _])
+ | not is32Bit = do
+ code <- intLoadCode (MOVSxL II32) addr
+ return (Any II64 code)
+
+getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit = do
+ return $ Any II64 (\dst -> unitOL $
+ LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
+
+getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
+ case mop of
+ MO_F_Neg w -> sse2NegCode w x
+
+
+ MO_S_Neg w -> triv_ucode NEGI (intFormat w)
+ MO_Not w -> triv_ucode NOT (intFormat w)
+
+ -- Nop conversions
+ MO_UU_Conv W32 W8 -> toI8Reg W32 x
+ MO_SS_Conv W32 W8 -> toI8Reg W32 x
+ MO_XX_Conv W32 W8 -> toI8Reg W32 x
+ MO_UU_Conv W16 W8 -> toI8Reg W16 x
+ MO_SS_Conv W16 W8 -> toI8Reg W16 x
+ MO_XX_Conv W16 W8 -> toI8Reg W16 x
+ MO_UU_Conv W32 W16 -> toI16Reg W32 x
+ MO_SS_Conv W32 W16 -> toI16Reg W32 x
+ MO_XX_Conv W32 W16 -> toI16Reg W32 x
+
+ MO_UU_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_SS_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_XX_Conv W64 W32 | not is32Bit -> conversionNop II64 x
+ MO_UU_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_SS_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_XX_Conv W64 W16 | not is32Bit -> toI16Reg W64 x
+ MO_UU_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_SS_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+ MO_XX_Conv W64 W8 | not is32Bit -> toI8Reg W64 x
+
+ MO_UU_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+ MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
+
+ -- widenings
+ MO_UU_Conv W8 W32 -> integerExtend W8 W32 MOVZxL x
+ MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
+ MO_UU_Conv W8 W16 -> integerExtend W8 W16 MOVZxL x
+
+ MO_SS_Conv W8 W32 -> integerExtend W8 W32 MOVSxL x
+ MO_SS_Conv W16 W32 -> integerExtend W16 W32 MOVSxL x
+ MO_SS_Conv W8 W16 -> integerExtend W8 W16 MOVSxL x
+
+ -- We don't care about the upper bits for MO_XX_Conv, so MOV is enough. However, on 32-bit only
+ -- a few registers have 8-bit versions (as opposed to x86-64, where every register has an
+ -- 8-bit version). So for 32-bit code, we'll just zero-extend.
+ MO_XX_Conv W8 W32
+ | is32Bit -> integerExtend W8 W32 MOVZxL x
+ | otherwise -> integerExtend W8 W32 MOV x
+ MO_XX_Conv W8 W16
+ | is32Bit -> integerExtend W8 W16 MOVZxL x
+ | otherwise -> integerExtend W8 W16 MOV x
+ MO_XX_Conv W16 W32 -> integerExtend W16 W32 MOV x
+
+ MO_UU_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVZxL x
+ MO_UU_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVZxL x
+ MO_UU_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVZxL x
+ MO_SS_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOVSxL x
+ MO_SS_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOVSxL x
+ MO_SS_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOVSxL x
+ -- For 32-to-64 bit zero extension, amd64 uses an ordinary movl.
+ -- However, we don't want the register allocator to throw it
+ -- away as an unnecessary reg-to-reg move, so we keep it in
+ -- the form of a movzl and print it as a movl later.
+ -- This doesn't apply to MO_XX_Conv since in this case we don't care about
+ -- the upper bits. So we can just use MOV.
+ MO_XX_Conv W8 W64 | not is32Bit -> integerExtend W8 W64 MOV x
+ MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
+ MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
+
+ MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
+
+
+ MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
+
+ MO_FS_Conv from to -> coerceFP2Int from to x
+ MO_SF_Conv from to -> coerceInt2FP from to x
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VU_Quot {} -> needLlvm
+ MO_VU_Rem {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister" (pprMachOp mop)
+ where
+ triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register
+ triv_ucode instr format = trivialUCode format (instr format) x
+
+ -- signed or unsigned extension.
+ integerExtend :: Width -> Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> NatM Register
+ integerExtend from to instr expr = do
+ (reg,e_code) <- if from == W8 then getByteReg expr
+ else getSomeReg expr
+ let
+ code dst =
+ e_code `snocOL`
+ instr (intFormat from) (OpReg reg) (OpReg dst)
+ return (Any (intFormat to) code)
+
+ toI8Reg :: Width -> CmmExpr -> NatM Register
+ toI8Reg new_rep expr
+ = do codefn <- getAnyReg expr
+ return (Any (intFormat new_rep) codefn)
+ -- HACK: use getAnyReg to get a byte-addressable register.
+ -- If the source was a Fixed register, this will add the
+ -- mov instruction to put it into the desired destination.
+ -- We're assuming that the destination won't be a fixed
+ -- non-byte-addressable register; it won't be, because all
+ -- fixed registers are word-sized.
+
+ toI16Reg = toI8Reg -- for now
+
+ conversionNop :: Format -> CmmExpr -> NatM Register
+ conversionNop new_format expr
+ = do e_code <- getRegister' dflags is32Bit expr
+ return (swizzleRegisterRep e_code new_format)
+
+
+getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
+ case mop of
+ MO_F_Eq _ -> condFltReg is32Bit EQQ x y
+ MO_F_Ne _ -> condFltReg is32Bit NE x y
+ MO_F_Gt _ -> condFltReg is32Bit GTT x y
+ MO_F_Ge _ -> condFltReg is32Bit GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt _ -> condFltReg is32Bit GTT y x
+ MO_F_Le _ -> condFltReg is32Bit GE y x
+
+ MO_Eq _ -> condIntReg EQQ x y
+ MO_Ne _ -> condIntReg NE x y
+
+ MO_S_Gt _ -> condIntReg GTT x y
+ MO_S_Ge _ -> condIntReg GE x y
+ MO_S_Lt _ -> condIntReg LTT x y
+ MO_S_Le _ -> condIntReg LE x y
+
+ MO_U_Gt _ -> condIntReg GU x y
+ MO_U_Ge _ -> condIntReg GEU x y
+ MO_U_Lt _ -> condIntReg LU x y
+ MO_U_Le _ -> condIntReg LEU x y
+
+ MO_F_Add w -> trivialFCode_sse2 w ADD x y
+
+ MO_F_Sub w -> trivialFCode_sse2 w SUB x y
+
+ MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
+
+ MO_F_Mul w -> trivialFCode_sse2 w MUL x y
+
+
+ MO_Add rep -> add_code rep x y
+ MO_Sub rep -> sub_code rep x y
+
+ MO_S_Quot rep -> div_code rep True True x y
+ MO_S_Rem rep -> div_code rep True False x y
+ MO_U_Quot rep -> div_code rep False True x y
+ MO_U_Rem rep -> div_code rep False False x y
+
+ MO_S_MulMayOflo rep -> imulMayOflo rep x y
+
+ MO_Mul W8 -> imulW8 x y
+ MO_Mul rep -> triv_op rep IMUL
+ MO_And rep -> triv_op rep AND
+ MO_Or rep -> triv_op rep OR
+ MO_Xor rep -> triv_op rep XOR
+
+ {- Shift ops on x86 have constraints on their source: it has to
+ be an immediate, %cl, or 1
+ => trivialCode is not restrictive enough (sigh.)
+ -}
+ MO_Shl rep -> shift_code rep SHL x y {-False-}
+ MO_U_Shr rep -> shift_code rep SHR x y {-False-}
+ MO_S_Shr rep -> shift_code rep SAR x y {-False-}
+
+ MO_V_Insert {} -> needLlvm
+ MO_V_Extract {} -> needLlvm
+ MO_V_Add {} -> needLlvm
+ MO_V_Sub {} -> needLlvm
+ MO_V_Mul {} -> needLlvm
+ MO_VS_Quot {} -> needLlvm
+ MO_VS_Rem {} -> needLlvm
+ MO_VS_Neg {} -> needLlvm
+ MO_VF_Insert {} -> needLlvm
+ MO_VF_Extract {} -> needLlvm
+ MO_VF_Add {} -> needLlvm
+ MO_VF_Sub {} -> needLlvm
+ MO_VF_Mul {} -> needLlvm
+ MO_VF_Quot {} -> needLlvm
+ MO_VF_Neg {} -> needLlvm
+
+ _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop)
+ where
+ --------------------
+ triv_op width instr = trivialCode width op (Just op) x y
+ where op = instr (intFormat width)
+
+ -- Special case for IMUL for bytes, since the result of IMULB will be in
+ -- %ax, the split to %dx/%edx/%rdx and %ax/%eax/%rax happens only for wider
+ -- values.
+ imulW8 :: CmmExpr -> CmmExpr -> NatM Register
+ imulW8 arg_a arg_b = do
+ (a_reg, a_code) <- getNonClobberedReg arg_a
+ b_code <- getAnyReg arg_b
+
+ let code = a_code `appOL` b_code eax `appOL`
+ toOL [ IMUL2 format (OpReg a_reg) ]
+ format = intFormat W8
+
+ return (Fixed format eax code)
+
+
+ imulMayOflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ imulMayOflo rep a b = do
+ (a_reg, a_code) <- getNonClobberedReg a
+ b_code <- getAnyReg b
+ let
+ shift_amt = case rep of
+ W32 -> 31
+ W64 -> 63
+ _ -> panic "shift_amt"
+
+ format = intFormat rep
+ code = a_code `appOL` b_code eax `appOL`
+ toOL [
+ IMUL2 format (OpReg a_reg), -- result in %edx:%eax
+ SAR format (OpImm (ImmInt shift_amt)) (OpReg eax),
+ -- sign extend lower part
+ SUB format (OpReg edx) (OpReg eax)
+ -- compare against upper
+ -- eax==0 if high part == sign extended low part
+ ]
+ return (Fixed format eax code)
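+
+ -- For W32 this boils down to (a sketch in AT&T syntax):
+ --
+ --   imull %a_reg        -- %edx:%eax := %eax * %a_reg
+ --   sarl  $31, %eax     -- %eax := sign-extension bits of the low word
+ --   subl  %edx, %eax    -- zero iff the high word is just the
+ --                       -- sign-extension of the low word (no overflow)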
+
+ --------------------
+ shift_code :: Width
+ -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr
+ -> CmmExpr
+ -> NatM Register
+
+ {- Case 1: shift length as immediate -}
+ shift_code width instr x (CmmLit lit) = do
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ code dst
+ = x_code dst `snocOL`
+ instr format (OpImm (litToImm lit)) (OpReg dst)
+ return (Any format code)
+
+ {- Case 2: shift length is complex (non-immediate)
+ * y must go in %ecx.
+ * we cannot do y first *and* put its result in %ecx, because
+ %ecx might be clobbered by x.
+ * if we do y second, then x cannot be
+ in a clobbered reg. Also, we cannot clobber x's reg
+ with the instruction itself.
+ * so we can either:
+ - do y first, put its result in a fresh tmp, then copy it to %ecx later
+ - do y second and put its result into %ecx. x gets placed in a fresh
+ tmp. This is likely to be better, because the reg alloc can
+ eliminate this reg->reg move here (it won't eliminate the other one,
+ because the move is into the fixed %ecx).
+ -}
+ shift_code width instr x y{-amount-} = do
+ x_code <- getAnyReg x
+ let format = intFormat width
+ tmp <- getNewRegNat format
+ y_code <- getAnyReg y
+ let
+ code = x_code tmp `appOL`
+ y_code ecx `snocOL`
+ instr format (OpReg ecx) (OpReg tmp)
+ return (Fixed format tmp code)
+
+ --------------------
+ add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ add_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger y = add_int rep x y
+ add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y
+ where format = intFormat rep
+ -- TODO: There are other interesting patterns we want to replace
+ -- with a LEA, e.g. `(x + offset) + (y << shift)`.
+
+ --------------------
+ sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register
+ sub_code rep x (CmmLit (CmmInt y _))
+ | is32BitInteger (-y) = add_int rep x (-y)
+ sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y
+
+ -- our three-operand add instruction:
+ add_int width x y = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ format = intFormat width
+ imm = ImmInt (fromInteger y)
+ code dst
+ = x_code `snocOL`
+ LEA format
+ (OpAddr (AddrBaseIndex (EABaseReg x_reg) EAIndexNone imm))
+ (OpReg dst)
+ --
+ return (Any format code)
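+
+ -- E.g. @x + 4@ becomes @leal 4(%x_reg), %dst@ (a sketch): LEA gives
+ -- us a three-operand add that leaves %x_reg intact.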
+
+ ----------------------
+
+ -- See Note [DIV/IDIV for bytes]
+ div_code W8 signed quotient x y = do
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ div_code
+ W16
+ signed
+ quotient
+ (CmmMachOp widen [x])
+ (CmmMachOp widen [y])
+
+ div_code width signed quotient x y = do
+ (y_op, y_code) <- getRegOrMem y -- cannot be clobbered
+ x_code <- getAnyReg x
+ let
+ format = intFormat width
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg edx) (OpReg edx)
+
+ instr | signed = IDIV
+ | otherwise = DIV
+
+ code = y_code `appOL`
+ x_code eax `appOL`
+ toOL [widen, instr format y_op]
+
+ result | quotient = eax
+ | otherwise = edx
+
+ return (Fixed format result code)
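+
+ -- E.g. a signed W32 quotient comes out roughly as (a sketch):
+ --
+ --   <compute y into y_op>
+ --   <compute x into %eax>
+ --   cltd                -- sign-extend %eax into %edx
+ --   idivl y_op          -- quotient in %eax, remainder in %edx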
+
+
+getRegister' _ _ (CmmLoad mem pk)
+ | isFloatType pk
+ = do
+ Amode addr mem_code <- getAmode mem
+ loadFloatAmode (typeWidth pk) addr mem_code
+
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | is32Bit && not (isWord64 pk)
+ = do
+ code <- intLoadCode instr mem
+ return (Any format code)
+ where
+ width = typeWidth pk
+ format = intFormat width
+ instr = case width of
+ W8 -> MOVZxL II8
+ _other -> MOV format
+ -- We always zero-extend 8-bit loads, if we
+ -- can't think of anything better. This is because
+ -- we can't guarantee access to an 8-bit variant of every register
+ -- (esi and edi don't have 8-bit variants), so to make things
+ -- simpler we do our 8-bit arithmetic with full 32-bit registers.
+
+-- Simpler memory load code on x86_64
+getRegister' _ is32Bit (CmmLoad mem pk)
+ | not is32Bit
+ = do
+ code <- intLoadCode (MOV format) mem
+ return (Any format code)
+ where format = intFormat $ typeWidth pk
+
+getRegister' _ is32Bit (CmmLit (CmmInt 0 width))
+ = let
+ format = intFormat width
+
+ -- x86_64: 32-bit xor is one byte shorter, and zero-extends to 64 bits
+ format1 = if is32Bit then format
+ else case format of
+ II64 -> II32
+ _ -> format
+ code dst
+ = unitOL (XOR format1 (OpReg dst) (OpReg dst))
+ in
+ return (Any format code)
+
+ -- optimisation for loading small literals on x86_64: take advantage
+ -- of the automatic zero-extension from 32 to 64 bits, because the 32-bit
+ -- instruction forms are shorter.
+getRegister' dflags is32Bit (CmmLit lit)
+ | not is32Bit, isWord64 (cmmLitType dflags lit), not (isBigLit lit)
+ = let
+ imm = litToImm lit
+ code dst = unitOL (MOV II32 (OpImm imm) (OpReg dst))
+ in
+ return (Any II64 code)
+ where
+ isBigLit (CmmInt i _) = i < 0 || i > 0xffffffff
+ isBigLit _ = False
+ -- note1: not the same as (not.is32BitLit), because that checks for
+ -- signed literals that fit in 32 bits, but we want unsigned
+ -- literals here.
+ -- note2: all labels are small, because we're assuming the
+ -- small memory model (see gcc docs, -mcmodel=small).
+
+getRegister' dflags _ (CmmLit lit)
+ = do let format = cmmTypeFormat (cmmLitType dflags lit)
+ imm = litToImm lit
+ code dst = unitOL (MOV format (OpImm imm) (OpReg dst))
+ return (Any format code)
+
+getRegister' _ _ other
+ | isVecExpr other = needLlvm
+ | otherwise = pprPanic "getRegister(x86)" (ppr other)
+
+
+intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr
+ -> NatM (Reg -> InstrBlock)
+intLoadCode instr mem = do
+ Amode src mem_code <- getAmode mem
+ return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst))
+
+-- Compute an expression into *any* register, adding the appropriate
+-- move instruction if necessary.
+getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock)
+getAnyReg expr = do
+ r <- getRegister expr
+ anyReg r
+
+anyReg :: Register -> NatM (Reg -> InstrBlock)
+anyReg (Any _ code) = return code
+anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst)
+
+-- A bit like getSomeReg, but we want a reg that can be byte-addressed.
+-- Fixed registers might not be byte-addressable, so we make sure we've
+-- got a temporary, inserting an extra reg copy if necessary.
+getByteReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getByteReg expr = do
+ is32Bit <- is32BitPlatform
+ if is32Bit
+ then do r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ | isVirtualReg reg -> return (reg,code)
+ | otherwise -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ -- ToDo: could optimise slightly by checking for
+ -- byte-addressable real registers, but that will
+ -- happen very rarely if at all.
+ else getSomeReg expr -- all regs are byte-addressable on x86_64
+
+-- Another variant: this time we want the result in a register that cannot
+-- be modified by code to evaluate an arbitrary expression.
+getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock)
+getNonClobberedReg expr = do
+ dflags <- getDynFlags
+ r <- getRegister expr
+ case r of
+ Any rep code -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code tmp)
+ Fixed rep reg code
+ -- only certain regs can be clobbered
+ | reg `elem` instrClobberedRegs (targetPlatform dflags)
+ -> do
+ tmp <- getNewRegNat rep
+ return (tmp, code `snocOL` reg2reg rep reg tmp)
+ | otherwise ->
+ return (reg, code)
+
+reg2reg :: Format -> Reg -> Reg -> Instr
+reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
+
+
+--------------------------------------------------------------------------------
+getAmode :: CmmExpr -> NatM Amode
+getAmode e = do is32Bit <- is32BitPlatform
+ getAmode' is32Bit e
+
+getAmode' :: Bool -> CmmExpr -> NatM Amode
+getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags
+ getAmode $ mangleIndexTree dflags r n
+
+getAmode' is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
+ CmmLit displacement])
+ | not is32Bit
+ = return $ Amode (ripRel (litToImm displacement)) nilOL
+
+
+-- This is all just ridiculous, since it carefully undoes
+-- what mangleIndexTree has just done.
+getAmode' is32Bit (CmmMachOp (MO_Sub _rep) [x, CmmLit lit@(CmmInt i _)])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = ImmInt (-(fromInteger i))
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+getAmode' is32Bit (CmmMachOp (MO_Add _rep) [x, CmmLit lit])
+ | is32BitLit is32Bit lit
+ -- ASSERT(rep == II32)???
+ = do (x_reg, x_code) <- getSomeReg x
+ let off = litToImm lit
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) EAIndexNone off) x_code)
+
+-- Turn (lit1 << n + lit2) into (lit2 + lit1 << n) so it will be
+-- recognised by the next rule.
+getAmode' is32Bit (CmmMachOp (MO_Add rep) [a@(CmmMachOp (MO_Shl _) _),
+ b@(CmmLit _)])
+ = getAmode' is32Bit (CmmMachOp (MO_Add rep) [b,a])
+
+-- Matches: (x + offset) + (y << shift)
+getAmode' _ (CmmMachOp (MO_Add _) [CmmRegOff x offset,
+ CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode (CmmReg x) y shift (fromIntegral offset)
+
+getAmode' _ (CmmMachOp (MO_Add _) [x, CmmMachOp (MO_Shl _)
+ [y, CmmLit (CmmInt shift _)]])
+ | shift == 0 || shift == 1 || shift == 2 || shift == 3
+ = x86_complex_amode x y shift 0
+
+getAmode' _ (CmmMachOp (MO_Add _)
+ [x, CmmMachOp (MO_Add _)
+ [CmmMachOp (MO_Shl _) [y, CmmLit (CmmInt shift _)],
+ CmmLit (CmmInt offset _)]])
+ | (shift == 0 || shift == 1 || shift == 2 || shift == 3)
+ && is32BitInteger offset
+ = x86_complex_amode x y shift offset
+
+getAmode' _ (CmmMachOp (MO_Add _) [x,y])
+ = x86_complex_amode x y 0 0
+
+getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (Amode (ImmAddr (litToImm lit) 0) nilOL)
+
+getAmode' _ expr = do
+ (reg,code) <- getSomeReg expr
+ return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
+
+-- | Like 'getAmode', but on 32-bit use simple register addressing
+-- (i.e. no index register). This stops us from running out of
+-- registers on x86 when using instructions such as cmpxchg, which can
+-- use up to three virtual registers and one fixed register.
+getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
+getSimpleAmode dflags is32Bit addr
+ | is32Bit = do
+ addr_code <- getAnyReg addr
+ addr_r <- getNewRegNat (intFormat (wordWidth dflags))
+ let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
+ return $! Amode amode (addr_code addr_r)
+ | otherwise = getAmode addr
+
+x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
+x86_complex_amode base index shift offset
+ = do (x_reg, x_code) <- getNonClobberedReg base
+ -- x must be in a temp, because it has to stay live over y_code
+ -- we could compare x_reg and y_reg and do something better here...
+ (y_reg, y_code) <- getSomeReg index
+ let
+ code = x_code `appOL` y_code
+ scale = case shift of 0 -> 1; 1 -> 2; 2 -> 4; 3 -> 8;
+ n -> panic $ "x86_complex_amode: unhandled shift! (" ++ show n ++ ")"
+ return (Amode (AddrBaseIndex (EABaseReg x_reg) (EAIndex y_reg scale) (ImmInt (fromIntegral offset)))
+ code)
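+
+-- For instance (a sketch): @x86_complex_amode x y 3 16@ yields the
+-- address @16(%x_reg,%y_reg,8)@, i.e. @x + 8*y + 16@.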
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- getOperand: sometimes any operand will do.
+
+-- getNonClobberedOperand: the value of the operand will remain valid across
+-- the computation of an arbitrary expression, unless the expression
+-- is computed directly into a register which the operand refers to
+-- (see trivialCode where this function is used for an example).
+
+getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand (CmmLit lit) = do
+ if isSuitableFloatingPointLit lit
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getNonClobberedOperand_generic (CmmLit lit)
+
+getNonClobberedOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ -- this logic could be simplified
+ -- TODO FIXME
+ if (if is32Bit then not (isWord64 pk) else True)
+ -- if we are on 32-bit and pk is a float/double/simd value,
+ -- or if we are on 64-bit
+ -- this could use some eyeballs, or I'll need to stare at it more later
+ then do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ Amode src mem_code <- getAmode mem
+ (src',save_code) <-
+ if (amodeCouldBeClobbered platform src)
+ then do
+ tmp <- getNewRegNat (archWordFormat is32Bit)
+ return (AddrBaseIndex (EABaseReg tmp) EAIndexNone (ImmInt 0),
+ unitOL (LEA (archWordFormat is32Bit)
+ (OpAddr src)
+ (OpReg tmp)))
+ else
+ return (src, nilOL)
+ return (OpAddr src', mem_code `appOL` save_code)
+ else do
+ -- if it's a word or gcptr on 32-bit?
+ getNonClobberedOperand_generic (CmmLoad mem pk)
+
+getNonClobberedOperand e = getNonClobberedOperand_generic e
+
+getNonClobberedOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getNonClobberedOperand_generic e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+amodeCouldBeClobbered :: Platform -> AddrMode -> Bool
+amodeCouldBeClobbered platform amode = any (regClobbered platform) (addrModeRegs amode)
+
+regClobbered :: Platform -> Reg -> Bool
+regClobbered platform (RegReal (RealRegSingle rr)) = freeReg platform rr
+regClobbered _ _ = False
+
+-- getOperand: the operand is not required to remain valid across the
+-- computation of an arbitrary expression.
+getOperand :: CmmExpr -> NatM (Operand, InstrBlock)
+
+getOperand (CmmLit lit) = do
+ use_sse2 <- sse2Enabled
+ if (use_sse2 && isSuitableFloatingPointLit lit)
+ then do
+ let CmmFloat _ w = lit
+ Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
+ return (OpAddr addr, code)
+ else do
+
+ is32Bit <- is32BitPlatform
+ dflags <- getDynFlags
+ if is32BitLit is32Bit lit && not (isFloatType (cmmLitType dflags lit))
+ then return (OpImm (litToImm lit), nilOL)
+ else getOperand_generic (CmmLit lit)
+
+getOperand (CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else
+ getOperand_generic (CmmLoad mem pk)
+
+getOperand e = getOperand_generic e
+
+getOperand_generic :: CmmExpr -> NatM (Operand, InstrBlock)
+getOperand_generic e = do
+ (reg, code) <- getSomeReg e
+ return (OpReg reg, code)
+
+isOperand :: Bool -> CmmExpr -> Bool
+isOperand _ (CmmLoad _ _) = True
+isOperand is32Bit (CmmLit lit) = is32BitLit is32Bit lit
+ || isSuitableFloatingPointLit lit
+isOperand _ _ = False
+
+-- | Given a 'Register', produce a new 'Register' with an instruction block
+-- which will check the value for alignment. Used for @-falignment-sanitisation@.
+addAlignmentCheck :: Int -> Register -> Register
+addAlignmentCheck align reg =
+ case reg of
+ Fixed fmt reg code -> Fixed fmt reg (code `appOL` check fmt reg)
+ Any fmt f -> Any fmt (\reg -> f reg `appOL` check fmt reg)
+ where
+ check :: Format -> Reg -> InstrBlock
+ check fmt reg =
+ ASSERT(not $ isFloatFormat fmt)
+ toOL [ TEST fmt (OpImm $ ImmInt $ align-1) (OpReg reg)
+ , JXX_GBL NE $ ImmCLbl mkBadAlignmentLabel
+ ]
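+
+-- E.g. checking a W64 value for 8-byte alignment emits roughly
+-- (a sketch):
+--
+--   testq $7, %reg
+--   jne   <bad-alignment label>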
+
+memConstant :: Alignment -> CmmLit -> NatM Amode
+memConstant align lit = do
+ lbl <- getNewLabelNat
+ let rosection = Section ReadOnlyData lbl
+ dflags <- getDynFlags
+ (addr, addr_code) <- if target32Bit (targetPlatform dflags)
+ then do dynRef <- cmmMakeDynamicReference
+ dflags
+ DataReference
+ lbl
+ Amode addr addr_code <- getAmode dynRef
+ return (addr, addr_code)
+ else return (ripRel (ImmCLbl lbl), nilOL)
+ let code =
+ LDATA rosection (align, RawCmmStatics lbl [CmmStaticLit lit])
+ `consOL` addr_code
+ return (Amode addr code)
+
+
+loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
+loadFloatAmode w addr addr_code = do
+ let format = floatFormat w
+ code dst = addr_code `snocOL`
+ MOV format (OpAddr addr) (OpReg dst)
+
+ return (Any format code)
+
+
+-- if we want a floating-point literal as an operand, we can
+-- use it directly from memory. However, if the literal is
+-- zero, we're better off generating it into a register using
+-- xor.
+isSuitableFloatingPointLit :: CmmLit -> Bool
+isSuitableFloatingPointLit (CmmFloat f _) = f /= 0.0
+isSuitableFloatingPointLit _ = False
+
+getRegOrMem :: CmmExpr -> NatM (Operand, InstrBlock)
+getRegOrMem e@(CmmLoad mem pk) = do
+ is32Bit <- is32BitPlatform
+ use_sse2 <- sse2Enabled
+ if (not (isFloatType pk) || use_sse2) && (if is32Bit then not (isWord64 pk) else True)
+ then do
+ Amode src mem_code <- getAmode mem
+ return (OpAddr src, mem_code)
+ else do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+getRegOrMem e = do
+ (reg, code) <- getNonClobberedReg e
+ return (OpReg reg, code)
+
+is32BitLit :: Bool -> CmmLit -> Bool
+is32BitLit is32Bit (CmmInt i W64)
+ | not is32Bit
+ = -- assume that labels are in the range 0-2^31-1: this assumes the
+ -- small memory model (see gcc docs, -mcmodel=small).
+ is32BitInteger i
+is32BitLit _ _ = True
+
+
+
+
+-- Set up a condition code for a conditional branch.
+
+getCondCode :: CmmExpr -> NatM CondCode
+
+-- yes, they really do seem to want exactly the same!
+
+getCondCode (CmmMachOp mop [x, y])
+ =
+ case mop of
+ MO_F_Eq W32 -> condFltCode EQQ x y
+ MO_F_Ne W32 -> condFltCode NE x y
+ MO_F_Gt W32 -> condFltCode GTT x y
+ MO_F_Ge W32 -> condFltCode GE x y
+ -- Invert comparison condition and swap operands
+ -- See Note [SSE Parity Checks]
+ MO_F_Lt W32 -> condFltCode GTT y x
+ MO_F_Le W32 -> condFltCode GE y x
+
+ MO_F_Eq W64 -> condFltCode EQQ x y
+ MO_F_Ne W64 -> condFltCode NE x y
+ MO_F_Gt W64 -> condFltCode GTT x y
+ MO_F_Ge W64 -> condFltCode GE x y
+ MO_F_Lt W64 -> condFltCode GTT y x
+ MO_F_Le W64 -> condFltCode GE y x
+
+ _ -> condIntCode (machOpToCond mop) x y
+
+getCondCode other = pprPanic "getCondCode(2)(x86,x86_64)" (ppr other)
+
+machOpToCond :: MachOp -> Cond
+machOpToCond mo = case mo of
+ MO_Eq _ -> EQQ
+ MO_Ne _ -> NE
+ MO_S_Gt _ -> GTT
+ MO_S_Ge _ -> GE
+ MO_S_Lt _ -> LTT
+ MO_S_Le _ -> LE
+ MO_U_Gt _ -> GU
+ MO_U_Ge _ -> GEU
+ MO_U_Lt _ -> LU
+ MO_U_Le _ -> LEU
+ _other -> pprPanic "machOpToCond" (pprMachOp mo)
+
+
+-- @cond(Int|Flt)Code@: Turn a boolean expression into a condition, to be
+-- passed back up the tree.
+
+condIntCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+condIntCode cond x y = do is32Bit <- is32BitPlatform
+ condIntCode' is32Bit cond x y
+
+condIntCode' :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+-- memory vs immediate
+condIntCode' is32Bit cond (CmmLoad x pk) (CmmLit lit)
+ | is32BitLit is32Bit lit = do
+ Amode x_addr x_code <- getAmode x
+ let
+ imm = litToImm lit
+ code = x_code `snocOL`
+ CMP (cmmTypeFormat pk) (OpImm imm) (OpAddr x_addr)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero, using a mask
+-- TODO: Add some sanity checking!!!!
+condIntCode' is32Bit cond (CmmMachOp (MO_And _) [x,o2]) (CmmLit (CmmInt 0 pk))
+ | (CmmLit lit@(CmmInt mask _)) <- o2, is32BitLit is32Bit lit
+ = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpImm (ImmInteger mask)) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs zero
+condIntCode' _ cond x (CmmLit (CmmInt 0 pk)) = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code = x_code `snocOL`
+ TEST (intFormat pk) (OpReg x_reg) (OpReg x_reg)
+ --
+ return (CondCode False cond code)
+
+-- anything vs operand
+condIntCode' is32Bit cond x y
+ | isOperand is32Bit y = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL` y_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) y_op (OpReg x_reg)
+ return (CondCode False cond code)
+-- operand vs. anything: invert the comparison so that we can use a
+-- single comparison instruction.
+ | isOperand is32Bit x
+ , Just revcond <- maybeFlipCond cond = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getOperand x
+ let
+ code = y_code `appOL` x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) x_op (OpReg y_reg)
+ return (CondCode False revcond code)
+
+-- anything vs anything
+condIntCode' _ cond x y = do
+ dflags <- getDynFlags
+ (y_reg, y_code) <- getNonClobberedReg y
+ (x_op, x_code) <- getRegOrMem x
+ let
+ code = y_code `appOL`
+ x_code `snocOL`
+ CMP (cmmTypeFormat (cmmExprType dflags x)) (OpReg y_reg) x_op
+ return (CondCode False cond code)
+
+
+
+--------------------------------------------------------------------------------
+condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
+
+condFltCode cond x y
+ = condFltCode_sse2
+ where
+
+
+ -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
+ -- an operand, but the right must be a reg. We can probably do better
+ -- than this general case...
+ condFltCode_sse2 = do
+ dflags <- getDynFlags
+ (x_reg, x_code) <- getNonClobberedReg x
+ (y_op, y_code) <- getOperand y
+ let
+ code = x_code `appOL`
+ y_code `snocOL`
+ CMP (floatFormat $ cmmExprWidth dflags x) y_op (OpReg x_reg)
+ -- NB(1): we need to use the unsigned comparison operators on the
+ -- result of this comparison.
+ return (CondCode True (condToUnsigned cond) code)
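+
+-- E.g. @x > y :: Double@ comes out roughly as (a sketch):
+--
+--   ucomisd y_op, %x_reg
+--
+-- with the condition read back as the unsigned GU, which is what the
+-- flag encoding of ucomisd calls for (see Note [SSE Parity Checks]).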
+
+-- -----------------------------------------------------------------------------
+-- Generating assignments
+
+-- Assignments are really at the heart of the whole code generation
+-- business. Almost all top-level nodes of any real importance are
+-- assignments, which correspond to loads, stores, or register
+-- transfers. If we're really lucky, some of the register transfers
+-- will go away, because we can use the destination register to
+-- complete the code generation for the right hand side. This only
+-- fails when the right hand side is forced into a fixed register
+-- (e.g. the result of a call).
+
+assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
+assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock
+
+
+-- integer assignment to memory
+
+-- specific case of adding/subtracting an integer to a particular address.
+-- ToDo: catch other cases where we can use an operation directly on a memory
+-- address.
+assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _,
+ CmmLit (CmmInt i _)])
+ | addr == addr2, pk /= II64 || is32BitInteger i,
+ Just instr <- check op
+ = do Amode amode code_addr <- getAmode addr
+ let code = code_addr `snocOL`
+ instr pk (OpImm (ImmInt (fromIntegral i))) (OpAddr amode)
+ return code
+ where
+ check (MO_Add _) = Just ADD
+ check (MO_Sub _) = Just SUB
+ check _ = Nothing
+ -- ToDo: more?
+
+-- general case
+assignMem_IntCode pk addr src = do
+ is32Bit <- is32BitPlatform
+ Amode addr code_addr <- getAmode addr
+ (code_src, op_src) <- get_op_RI is32Bit src
+ let
+ code = code_src `appOL`
+ code_addr `snocOL`
+ MOV pk op_src (OpAddr addr)
+ -- NOTE: op_src is stable, so it will still be valid
+ -- after code_addr. This may involve the introduction
+ -- of an extra MOV to a temporary register, but we hope
+ -- the register allocator will get rid of it.
+ --
+ return code
+ where
+      get_op_RI :: Bool -> CmmExpr -> NatM (InstrBlock,Operand) -- code, operand
+ get_op_RI is32Bit (CmmLit lit) | is32BitLit is32Bit lit
+ = return (nilOL, OpImm (litToImm lit))
+ get_op_RI _ op
+ = do (reg,code) <- getNonClobberedReg op
+ return (code, OpReg reg)
+
+
+-- Assign; dst is a reg, rhs is mem
+assignReg_IntCode pk reg (CmmLoad src _) = do
+ load_code <- intLoadCode (MOV pk) src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (load_code (getRegisterReg platform reg))
+
+-- dst is a reg, but src could be anything
+assignReg_IntCode _ reg src = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ code <- getAnyReg src
+ return (code (getRegisterReg platform reg))
+
+
+-- Floating point assignment to memory
+assignMem_FltCode pk addr src = do
+ (src_reg, src_code) <- getNonClobberedReg src
+ Amode addr addr_code <- getAmode addr
+ let
+ code = src_code `appOL`
+ addr_code `snocOL`
+ MOV pk (OpReg src_reg) (OpAddr addr)
+
+ return code
+
+-- Floating point assignment to a register/temporary
+assignReg_FltCode _ reg src = do
+ src_code <- getAnyReg src
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ return (src_code (getRegisterReg platform reg))
+
+
+genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
+
+genJump (CmmLoad mem _) regs = do
+ Amode target code <- getAmode mem
+ return (code `snocOL` JMP (OpAddr target) regs)
+
+genJump (CmmLit lit) regs = do
+ return (unitOL (JMP (OpImm (litToImm lit)) regs))
+
+genJump expr regs = do
+ (reg,code) <- getSomeReg expr
+ return (code `snocOL` JMP (OpReg reg) regs)
+
+
+-- -----------------------------------------------------------------------------
+-- Unconditional branches
+
+genBranch :: BlockId -> InstrBlock
+genBranch = toOL . mkJumpInstr
+
+
+
+-- -----------------------------------------------------------------------------
+-- Conditional jumps/branches
+
+{-
+Conditional jumps are always to local labels, so we can use branch
+instructions. We peek at the arguments to decide what kind of
+comparison to do.
+
+I386: First, we have to ensure that the condition
+codes are set according to the supplied comparison operation.
+-}
+
+
+genCondBranch
+ :: BlockId -- the source of the jump
+ -> BlockId -- the true branch target
+ -> BlockId -- the false branch target
+ -> CmmExpr -- the condition on which to branch
+ -> NatM InstrBlock -- Instructions
+
+genCondBranch bid id false expr = do
+ is32Bit <- is32BitPlatform
+ genCondBranch' is32Bit bid id false expr
+
+-- | We return the instructions generated.
+genCondBranch' :: Bool -> BlockId -> BlockId -> BlockId -> CmmExpr
+ -> NatM InstrBlock
+
+-- 64-bit integer comparisons on 32-bit
+genCondBranch' is32Bit _bid true false (CmmMachOp mop [e1,e2])
+ | is32Bit, Just W64 <- maybeIntComparison mop = do
+ ChildCode64 code1 r1_lo <- iselExpr64 e1
+ ChildCode64 code2 r2_lo <- iselExpr64 e2
+ let r1_hi = getHiVRegFromLo r1_lo
+ r2_hi = getHiVRegFromLo r2_lo
+ cond = machOpToCond mop
+ Just cond' = maybeFlipCond cond
+ --TODO: Update CFG for x86
+ let code = code1 `appOL` code2 `appOL` toOL [
+ CMP II32 (OpReg r2_hi) (OpReg r1_hi),
+ JXX cond true,
+ JXX cond' false,
+ CMP II32 (OpReg r2_lo) (OpReg r1_lo),
+ JXX cond true] `appOL` genBranch false
+ return code
+
+genCondBranch' _ bid id false bool = do
+ CondCode is_float cond cond_code <- getCondCode bool
+ use_sse2 <- sse2Enabled
+ if not is_float || not use_sse2
+ then
+ return (cond_code `snocOL` JXX cond id `appOL` genBranch false)
+ else do
+ -- See Note [SSE Parity Checks]
+ let jmpFalse = genBranch false
+ code
+ = case cond of
+ NE -> or_unordered
+ GU -> plain_test
+ GEU -> plain_test
+ -- Use ASSERT so we don't break releases if
+ -- LTT/LE creep in somehow.
+ LTT ->
+ ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered
+ LE ->
+ ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered
+ _ -> and_ordered
+
+ plain_test = unitOL (
+ JXX cond id
+ ) `appOL` jmpFalse
+ or_unordered = toOL [
+ JXX cond id,
+ JXX PARITY id
+ ] `appOL` jmpFalse
+ and_ordered = toOL [
+ JXX PARITY false,
+ JXX cond id,
+ JXX ALWAYS false
+ ]
+ updateCfgNat (\cfg -> adjustEdgeWeight cfg (+3) bid false)
+ return (cond_code `appOL` code)
+
+{- Note [Introducing cfg edges inside basic blocks]
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ During instruction selection a statement `s`
+ in a block B with control of the sort: B -> C
+ will sometimes result in control
+ flow of the sort:
+
+ ┌ < ┐
+ v ^
+ B -> B1 ┴ -> C
+
+ as is the case for some atomic operations.
+
+ Now to keep the CFG in sync when introducing B1 we clearly
+ want to insert it between B and C. However there is
+ a catch when we have to deal with self loops.
+
+ We might start with code and a CFG of these forms:
+
+ loop:
+ stmt1 ┌ < ┐
+ .... v ^
+ stmtX loop ┘
+ stmtY
+ ....
+ goto loop:
+
+ Now we introduce B1:
+ ┌ ─ ─ ─ ─ ─┐
+ loop: │ ┌ < ┐ │
+ instrs v │ │ ^
+ .... loop ┴ B1 ┴ ┘
+ instrsFromX
+ stmtY
+ goto loop:
+
+   This is simple: all outgoing edges from loop now simply
+   start from B1 instead, and the code generator knows which
+   new edges it introduced for the self loop of B1.
+
+   Disaster strikes if the statement Y follows the same pattern.
+   If we apply the same rule, namely that all outgoing edges move to
+   the new block, then we end up with:
+
+ loop ─> B1 ─> B2 ┬─┐
+ │ │ └─<┤ │
+ │ └───<───┘ │
+ └───────<────────┘
+
+   This is problematic. The edge B1->B1 is redirected just as the
+   rule demands. However, the resulting CFG no longer matches the
+   generated code!
+
+ The assembly in this case looked like this:
+
+ _loop:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+ cmpxchgq ...
+ jne _B2
+ <instrs>
+ jmp loop
+
+ There is no edge _B2 -> _B1 here. It's still a self loop onto _B1.
+
+ The problem here is that really B1 should be two basic blocks.
+ Otherwise we have control flow in the *middle* of a basic block.
+ A contradiction!
+
+ So to account for this we add yet another basic block marker:
+
+ _B:
+ <instrs>
+ _B1:
+ ...
+ cmpxchgq ...
+ jne _B1
+ jmp _B1'
+ _B1':
+ <instrs>
+ <end _B1>
+ _B2:
+ ...
+
+ Now when inserting B2 we will only look at the outgoing edges of B1' and
+ everything will work out nicely.
+
+ You might also wonder why we don't insert jumps at the end of _B1'. There is
+ no way another block ends up jumping to the labels _B1 or _B2 since they are
+ essentially invisible to other blocks. View them as control flow labels local
+ to the basic block if you'd like.
+
+ Not doing this ultimately caused (part 2 of) #17334.
+-}
+
+
+-- -----------------------------------------------------------------------------
+-- Generating C calls
+
+-- Now the biggest nightmare---calls. Most of the nastiness is buried in
+-- @get_arg@, which moves the arguments to the correct registers/stack
+-- locations. Apart from that, the code is easy.
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+--
+-- See Note [Keeping track of the current block] for information why we need
+-- to take/return a block id.
+
+genCCall
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM (InstrBlock, Maybe BlockId)
+
+-- First we deal with cases which might introduce new blocks in the stream.
+
+genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
+ [dst] [addr, n] bid = do
+ Amode amode addr_code <-
+ if amop `elem` [AMO_Add, AMO_Sub]
+ then getAmode addr
+ else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
+ arg <- getNewRegNat format
+ arg_code <- getAnyReg n
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ (code, lbl) <- op_code dst_r arg amode
+ return (addr_code `appOL` arg_code arg `appOL` code, Just lbl)
+ where
+ -- Code for the operation
+ op_code :: Reg -- Destination reg
+ -> Reg -- Register containing argument
+ -> AddrMode -- Address of location to mutate
+ -> NatM (OrdList Instr,BlockId) -- TODO: Return Maybe BlockId
+ op_code dst_r arg amode = case amop of
+ -- In the common case where dst_r is a virtual register the
+ -- final move should go away, because it's the last use of arg
+ -- and the first use of dst_r.
+ AMO_Add -> return $ (toOL [ LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ AMO_Sub -> return $ (toOL [ NEGI format (OpReg arg)
+ , LOCK (XADD format (OpReg arg) (OpAddr amode))
+ , MOV format (OpReg arg) (OpReg dst_r)
+ ], bid)
+ -- In these cases we need a new block id, and have to return it so
+ -- that later instruction selection can reference it.
+ AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND format src dst)
+ AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND format src dst
+ , NOT format dst
+ ])
+ AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR format src dst)
+ AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR format src dst)
+ where
+        -- Simulate an operation that lacks a dedicated instruction using
+        -- cmpxchg.
+ cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
+ -> NatM (OrdList Instr, BlockId)
+ cmpxchg_code instrs = do
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ tmp <- getNewRegNat format
+
+        -- Record inserted blocks
+ -- We turn A -> B into A -> A' -> A'' -> B
+ -- with a self loop on A'.
+ addImmediateSuccessorNat bid lbl1
+ addImmediateSuccessorNat lbl1 lbl2
+ updateCfgNat (addWeightEdge lbl1 lbl1 0)
+
+ return $ (toOL
+ [ MOV format (OpAddr amode) (OpReg eax)
+ , JXX ALWAYS lbl1
+ , NEWBLOCK lbl1
+ -- Keep old value so we can return it:
+ , MOV format (OpReg eax) (OpReg dst_r)
+ , MOV format (OpReg eax) (OpReg tmp)
+ ]
+ `appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
+ [ LOCK (CMPXCHG format (OpReg tmp) (OpAddr amode))
+ , JXX NE lbl1
+ -- See Note [Introducing cfg edges inside basic blocks]
+ -- why this basic block is required.
+ , JXX ALWAYS lbl2
+ , NEWBLOCK lbl2
+ ],
+ lbl2)
+ format = intFormat width
+
+genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
+ | is32Bit, width == W64 = do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let rhi = getHiVRegFromLo rlo
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ lbl1 <- getBlockIdNat
+ lbl2 <- getBlockIdNat
+ let format = if width == W8 then II16 else intFormat width
+ tmp_r <- getNewRegNat format
+
+ -- New CFG Edges:
+ -- bid -> lbl2
+ -- bid -> lbl1 -> lbl2
+    --  We also change edges originating at bid to start at lbl2 instead.
+ updateCfgNat (addWeightEdge bid lbl1 110 .
+ addWeightEdge lbl1 lbl2 110 .
+ addImmediateSuccessor bid lbl2)
+
+ -- The following instruction sequence corresponds to the pseudo-code
+ --
+ -- if (src) {
+ -- dst = src.lo32 ? BSF(src.lo32) : (BSF(src.hi32) + 32);
+ -- } else {
+ -- dst = 64;
+ -- }
+ let !instrs = vcode `appOL` toOL
+ ([ MOV II32 (OpReg rhi) (OpReg tmp_r)
+ , OR II32 (OpReg rlo) (OpReg tmp_r)
+ , MOV II32 (OpImm (ImmInt 64)) (OpReg dst_r)
+ , JXX EQQ lbl2
+ , JXX ALWAYS lbl1
+
+ , NEWBLOCK lbl1
+ , BSF II32 (OpReg rhi) dst_r
+ , ADD II32 (OpImm (ImmInt 32)) (OpReg dst_r)
+ , BSF II32 (OpReg rlo) tmp_r
+ , CMOV NE II32 (OpReg tmp_r) dst_r
+ , JXX ALWAYS lbl2
+
+ , NEWBLOCK lbl2
+ ])
+ return (instrs, Just lbl2)
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ let instrs = appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ OR II32 (OpImm (ImmInteger 0xFFFFFF00)) (OpReg src_r)
+ , TZCNT II32 (OpReg src_r) dst_r
+ ]
+ W16 -> toOL
+ [ TZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
+ ]
+ _ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
+ return (instrs, Nothing)
+ else do
+ -- The following insn sequence makes sure 'ctz 0' has a defined value.
+        -- Starting with Haswell, one could use the TZCNT insn instead.
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ let !instrs = code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSF format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ return (instrs, Nothing)
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+
+genCCall dflags bits mop dst args bid = do
+ instr <- genCCall' dflags bits mop dst args bid
+ return (instr, Nothing)
+
+-- genCCall' handles cases not introducing new code blocks.
+genCCall'
+ :: DynFlags
+ -> Bool -- 32 bit platform?
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> BlockId -- The block we are in
+ -> NatM InstrBlock
+
+-- Unroll memcpy calls if the number of bytes to copy isn't too
+-- large. Otherwise, call C's memcpy.
+genCCall' dflags _ (PrimTarget (MO_Memcpy align)) _
+ [dst, src, CmmLit (CmmInt n _)] _
+ | fromInteger insns <= maxInlineMemcpyInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_dst dst_r `appOL` code_src src_r `appOL`
+ go dst_r src_r tmp_r (fromInteger n)
+ where
+ -- The number of instructions we will generate (approx). We need 2
+ -- instructions per move.
+ insns = 2 * ((n + sizeBytes - 1) `div` sizeBytes)
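+    -- For example (illustrative, assuming 8-byte moves on a 64-bit
+    -- target): for n = 18 we get insns = 2 * ceil(18/8) = 6, and 'go'
+    -- below emits two 8-byte moves followed by one 2-byte move.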
+
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+ go :: Reg -> Reg -> Reg -> Integer -> OrdList Instr
+ go dst src tmp i
+ | i >= sizeBytes =
+ unitOL (MOV format (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV format (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - sizeBytes)
+ -- Deal with remaining bytes.
+ | i >= 4 = -- Will never happen on 32-bit
+ unitOL (MOV II32 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II32 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 4)
+ | i >= 2 =
+ unitOL (MOVZxL II16 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II16 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 2)
+ | i >= 1 =
+ unitOL (MOVZxL II8 (OpAddr src_addr) (OpReg tmp)) `appOL`
+ unitOL (MOV II8 (OpReg tmp) (OpAddr dst_addr)) `appOL`
+ go dst src tmp (i - 1)
+ | otherwise = nilOL
+ where
+ src_addr = AddrBaseIndex (EABaseReg src) EAIndexNone
+ (ImmInteger (n - i))
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
+ (ImmInteger (n - i))
+
+genCCall' dflags _ (PrimTarget (MO_Memset align)) _
+ [dst,
+ CmmLit (CmmInt c _),
+ CmmLit (CmmInt n _)]
+ _
+ | fromInteger insns <= maxInlineMemsetInsns dflags = do
+ code_dst <- getAnyReg dst
+ dst_r <- getNewRegNat format
+ if format == II64 && n >= 8 then do
+ code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+ imm8byte_r <- getNewRegNat II64
+ return $ code_dst dst_r `appOL`
+ code_imm8byte imm8byte_r `appOL`
+ go8 dst_r imm8byte_r (fromInteger n)
+ else
+ return $ code_dst dst_r `appOL`
+ go4 dst_r (fromInteger n)
+ where
+ maxAlignment = wordAlignment dflags -- only machine word wide MOVs are supported
+ effectiveAlignment = min (alignmentOf align) maxAlignment
+ format = intFormat . widthFromBytes $ alignmentBytes effectiveAlignment
+ c2 = c `shiftL` 8 .|. c
+ c4 = c2 `shiftL` 16 .|. c2
+ c8 = c4 `shiftL` 32 .|. c4
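+    -- For example (illustrative): with fill byte c = 0xAB,
+    --   c2 = 0xABAB, c4 = 0xABABABAB, c8 = 0xABABABABABABABAB,
+    -- so a single wide MOV stores the fill byte 2, 4 or 8 times at once.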
+
+    -- The number of instructions we will generate (approx). We need 1
+    -- instruction per move.
+ insns = (n + sizeBytes - 1) `div` sizeBytes
+
+ -- The size of each move, in bytes.
+ sizeBytes :: Integer
+ sizeBytes = fromIntegral (formatInBytes format)
+
+    -- Depending on the remaining size, returns the widest possible MOV
+    -- instruction and its width.
+ gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+ gen4 addr size
+ | size >= 4 =
+ (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+ | size >= 2 =
+ (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+ | size >= 1 =
+ (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+ | otherwise = (nilOL, 0)
+
+ -- Generates a 64-bit wide MOV instruction from REG to MEM.
+ gen8 :: AddrMode -> Reg -> InstrBlock
+ gen8 addr reg8byte =
+ unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+ -- Unrolls memset when the widest MOV is <= 4 bytes.
+ go4 :: Reg -> Integer -> InstrBlock
+ go4 dst left =
+ if left <= 0 then nilOL
+ else curMov `appOL` go4 dst (left - curWidth)
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+ (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+ -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+ -- argument). Falls back to go4 when all 8 byte moves are
+ -- exhausted.
+ go8 :: Reg -> Reg -> Integer -> InstrBlock
+ go8 dst reg8byte left =
+ if possibleWidth >= 8 then
+ let curMov = gen8 dst_addr reg8byte
+ in curMov `appOL` go8 dst reg8byte (left - 8)
+ else go4 dst left
+ where
+ possibleWidth = minimum [left, sizeBytes]
+ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+
+genCCall' _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL
+genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
+ -- barriers compile to no code on x86/x86-64;
+ -- we keep them around this long in order to prevent earlier
+ -- optimisations from reordering code across them.
+
+genCCall' _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
+
+genCCall' _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
+ case n of
+ 0 -> genPrefetch src $ PREFETCH NTA format
+ 1 -> genPrefetch src $ PREFETCH Lvl2 format
+ 2 -> genPrefetch src $ PREFETCH Lvl1 format
+ 3 -> genPrefetch src $ PREFETCH Lvl0 format
+ l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
+ -- the C / LLVM prefetch convention is 0, 1, 2, and 3;
+ -- the corresponding x86 names are: NTA, 2, 1, and 0
+ where
+ format = archWordFormat is32bit
+        -- we need to know the register width for pointers!
+ genPrefetch inRegSrc prefetchCTor =
+ do
+ code_src <- getAnyReg inRegSrc
+ src_r <- getNewRegNat format
+ return $ code_src src_r `appOL`
+ (unitOL (prefetchCTor (OpAddr
+ ((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
+ -- prefetch always takes an address
+
+genCCall' dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
+ let platform = targetPlatform dflags
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ case width of
+ W64 | is32Bit -> do
+ ChildCode64 vcode rlo <- iselExpr64 src
+ let dst_rhi = getHiVRegFromLo dst_r
+ rhi = getHiVRegFromLo rlo
+ return $ vcode `appOL`
+ toOL [ MOV II32 (OpReg rlo) (OpReg dst_rhi),
+ MOV II32 (OpReg rhi) (OpReg dst_r),
+ BSWAP II32 dst_rhi,
+ BSWAP II32 dst_r ]
+ W16 -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL`
+ unitOL (BSWAP II32 dst_r) `appOL`
+ unitOL (SHR II32 (OpImm $ ImmInt 16) (OpReg dst_r))
+ _ -> do code_src <- getAnyReg src
+ return $ code_src dst_r `appOL` unitOL (BSWAP format dst_r)
+ where
+ format = intFormat width
+
+genCCall' dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
+ args@[src] bid = do
+ sse4_2 <- sse4_2Enabled
+ let platform = targetPlatform dflags
+ if sse4_2
+ then do code_src <- getAnyReg src
+ src_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL`
+ (if width == W8 then
+                         -- The POPCNT instruction doesn't take an r/m8
+ unitOL (MOVZxL II8 (OpReg src_r) (OpReg src_r)) `appOL`
+ unitOL (POPCNT II16 (OpReg src_r) dst_r)
+ else
+ unitOL (POPCNT format (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+                       -- The PDEP instruction doesn't take an r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
+ else
+ unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
+ args@[src, mask] bid = do
+ let platform = targetPlatform dflags
+ if isBmi2Enabled dflags
+ then do code_src <- getAnyReg src
+ code_mask <- getAnyReg mask
+ src_r <- getNewRegNat format
+ mask_r <- getNewRegNat format
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ return $ code_src src_r `appOL` code_mask mask_r `appOL`
+ (if width == W8 then
+                         -- The PEXT instruction doesn't take an r/m8
+ unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
+ unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
+ unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
+ else
+ unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
+ (if width == W8 || width == W16 then
+ -- We used a 16-bit destination register above,
+ -- so zero-extend
+ unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
+ else nilOL)
+ else do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ format = intFormat width
+ lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] bid
+ | is32Bit && width == W64 = do
+    -- Fall back to `hs_clz64` on i386
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+
+ | otherwise = do
+ code_src <- getAnyReg src
+ let dst_r = getRegisterReg platform (CmmLocal dst)
+ if isBmi2Enabled dflags
+ then do
+ src_r <- getNewRegNat (intFormat width)
+ return $ appOL (code_src src_r) $ case width of
+ W8 -> toOL
+ [ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
+ , LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
+ , SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
+ ]
+ W16 -> toOL
+ [ LZCNT II16 (OpReg src_r) dst_r
+ , MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
+ ]
+ _ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
+ else do
+ let format = if width == W8 then II16 else intFormat width
+ src_r <- getNewRegNat format
+ tmp_r <- getNewRegNat format
+ return $ code_src src_r `appOL` toOL
+ ([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
+ [ BSR format (OpReg src_r) tmp_r
+ , MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
+ , CMOV NE format (OpReg tmp_r) dst_r
+ , XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
+ ]) -- NB: We don't need to zero-extend the result for the
+ -- W8/W16 cases because the 'MOV' insn already
+ -- took care of implicitly clearing the upper bits
+ where
+ bw = widthInBits width
+ platform = targetPlatform dflags
+ lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
+
+genCCall' dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
+ targetExpr <- cmmMakeDynamicReference dflags
+ CallReference lbl
+ let target = ForeignTarget targetExpr (ForeignConvention CCallConv
+ [NoHint] [NoHint]
+ CmmMayReturn)
+ genCCall' dflags is32Bit target dest_regs args bid
+ where
+ lbl = mkCmmCodeLabel primUnitId (fsLit (word2FloatLabel width))
+
+genCCall' dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
+ load_code <- intLoadCode (MOV (intFormat width)) addr
+ let platform = targetPlatform dflags
+
+ return (load_code (getRegisterReg platform (CmmLocal dst)))
+
+genCCall' _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
+ code <- assignMem_IntCode (intFormat width) addr val
+ return $ code `snocOL` MFENCE
+
+genCCall' dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ = do
+ -- On x86 we don't have enough registers to use cmpxchg with a
+ -- complicated addressing mode, so on that architecture we
+ -- pre-compute the address first.
+ Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
+ newval <- getNewRegNat format
+ newval_code <- getAnyReg new
+ oldval <- getNewRegNat format
+ oldval_code <- getAnyReg old
+ let platform = targetPlatform dflags
+ dst_r = getRegisterReg platform (CmmLocal dst)
+ code = toOL
+ [ MOV format (OpReg oldval) (OpReg eax)
+ , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
+ , MOV format (OpReg eax) (OpReg dst_r)
+ ]
+ return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
+ `appOL` code
+ where
+ format = intFormat width
+
+genCCall' _ is32Bit target dest_regs args bid = do
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+ case (target, dest_regs) of
+ -- void return type prim op
+ (PrimTarget op, []) ->
+ outOfLineCmmOp bid op Nothing args
+ -- we only cope with a single result for foreign calls
+ (PrimTarget op, [r]) -> case op of
+ MO_F32_Fabs -> case args of
+ [x] -> sse2FabsCode W32 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+ MO_F64_Fabs -> case args of
+ [x] -> sse2FabsCode W64 x
+ _ -> panic "genCCall: Wrong number of arguments for fabs"
+
+ MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
+ MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
+ _other_op -> outOfLineCmmOp bid op (Just r) args
+
+ where
+ actuallyInlineSSE2Op = actuallyInlineFloatOp'
+
+ actuallyInlineFloatOp' instr format [x]
+ = do res <- trivialUFCode format (instr format) x
+ any <- anyReg res
+ return (any (getRegisterReg platform (CmmLocal r)))
+
+ actuallyInlineFloatOp' _ _ args
+ = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
+ ++ show (length args) ++ ")"
+
+ sse2FabsCode :: Width -> CmmExpr -> NatM InstrBlock
+ sse2FabsCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ let
+ const | FF32 <- fmt = CmmInt 0x7fffffff W32
+ | otherwise = CmmInt 0x7fffffffffffffff W64
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ AND fmt (OpReg tmp) (OpReg dst)
+ ]
+
+ return $ code (getRegisterReg platform (CmmLocal r))
+
+ (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
+ (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
+ (PrimTarget (MO_U_QuotRem2 width), _) -> divOp2 platform False width dest_regs args
+ (PrimTarget (MO_Add2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do hCode <- getAnyReg (CmmLit (CmmInt 0 width))
+ let format = intFormat width
+ lCode <- anyReg =<< trivialCode width (ADD_CC format)
+ (Just (ADD_CC format)) arg_x arg_y
+ let reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ code = hCode reg_h `appOL`
+ lCode reg_l `snocOL`
+ ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for add2"
+ (PrimTarget (MO_AddWordC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_SubWordC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) CARRY width res_r res_c args
+ (PrimTarget (MO_AddIntC width), [res_r, res_c]) ->
+ addSubIntC platform ADD_CC (Just . ADD_CC) OFLO width res_r res_c args
+ (PrimTarget (MO_SubIntC width), [res_r, res_c]) ->
+ addSubIntC platform SUB_CC (const Nothing) OFLO width res_r res_c args
+ (PrimTarget (MO_U_Mul2 width), [res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [MUL2 format y_reg,
+ MOV format (OpReg rdx) (OpReg reg_h),
+ MOV format (OpReg rax) (OpReg reg_l)]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for mul2"
+ (PrimTarget (MO_S_Mul2 width), [res_c, res_h, res_l]) ->
+ case args of
+ [arg_x, arg_y] ->
+ do (y_reg, y_code) <- getRegOrMem arg_y
+ x_code <- getAnyReg arg_x
+ reg_tmp <- getNewRegNat II8
+ let format = intFormat width
+ reg_h = getRegisterReg platform (CmmLocal res_h)
+ reg_l = getRegisterReg platform (CmmLocal res_l)
+ reg_c = getRegisterReg platform (CmmLocal res_c)
+ code = y_code `appOL`
+ x_code rax `appOL`
+ toOL [ IMUL2 format y_reg
+ , MOV format (OpReg rdx) (OpReg reg_h)
+ , MOV format (OpReg rax) (OpReg reg_l)
+ , SETCC CARRY (OpReg reg_tmp)
+ , MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+ ]
+ return code
+ _ -> panic "genCCall: Wrong number of arguments/results for imul2"
+
+ _ -> if is32Bit
+ then genCCall32' dflags target dest_regs args
+ else genCCall64' dflags target dest_regs args
+
+ where divOp1 platform signed width results [arg_x, arg_y]
+ = divOp platform signed width results Nothing arg_x arg_y
+ divOp1 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp1"
+ divOp2 platform signed width results [arg_x_high, arg_x_low, arg_y]
+ = divOp platform signed width results (Just arg_x_high) arg_x_low arg_y
+ divOp2 _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments for divOp2"
+
+ -- See Note [DIV/IDIV for bytes]
+ divOp platform signed W8 [res_q, res_r] m_arg_x_high arg_x_low arg_y =
+ let widen | signed = MO_SS_Conv W8 W16
+ | otherwise = MO_UU_Conv W8 W16
+ arg_x_low_16 = CmmMachOp widen [arg_x_low]
+ arg_y_16 = CmmMachOp widen [arg_y]
+ m_arg_x_high_16 = (\p -> CmmMachOp widen [p]) <$> m_arg_x_high
+ in divOp
+ platform signed W16 [res_q, res_r]
+ m_arg_x_high_16 arg_x_low_16 arg_y_16
+
+ divOp platform signed width [res_q, res_r]
+ m_arg_x_high arg_x_low arg_y
+ = do let format = intFormat width
+ reg_q = getRegisterReg platform (CmmLocal res_q)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ widen | signed = CLTD format
+ | otherwise = XOR format (OpReg rdx) (OpReg rdx)
+ instr | signed = IDIV
+ | otherwise = DIV
+ (y_reg, y_code) <- getRegOrMem arg_y
+ x_low_code <- getAnyReg arg_x_low
+ x_high_code <- case m_arg_x_high of
+ Just arg_x_high ->
+ getAnyReg arg_x_high
+ Nothing ->
+ return $ const $ unitOL widen
+ return $ y_code `appOL`
+ x_low_code rax `appOL`
+ x_high_code rdx `appOL`
+ toOL [instr format y_reg,
+ MOV format (OpReg rax) (OpReg reg_q),
+ MOV format (OpReg rdx) (OpReg reg_r)]
+ divOp _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of results for divOp"
+
+ addSubIntC platform instr mrevinstr cond width
+ res_r res_c [arg_x, arg_y]
+ = do let format = intFormat width
+ rCode <- anyReg =<< trivialCode width (instr format)
+ (mrevinstr format) arg_x arg_y
+ reg_tmp <- getNewRegNat II8
+ let reg_c = getRegisterReg platform (CmmLocal res_c)
+ reg_r = getRegisterReg platform (CmmLocal res_r)
+ code = rCode reg_r `snocOL`
+ SETCC cond (OpReg reg_tmp) `snocOL`
+ MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
+
+ return code
+ addSubIntC _ _ _ _ _ _ _ _
+ = panic "genCCall: Wrong number of arguments/results for addSubIntC"
+
+-- Note [DIV/IDIV for bytes]
+--
+-- IDIV reminder:
+-- Size Dividend Divisor Quotient Remainder
+-- byte %ax r/m8 %al %ah
+-- word %dx:%ax r/m16 %ax %dx
+-- dword %edx:%eax r/m32 %eax %edx
+-- qword %rdx:%rax r/m64 %rax %rdx
+--
+-- We do a special case for the byte division because the current
+-- codegen doesn't deal well with accessing the %ah register (also,
+-- accessing %ah in 64-bit mode is complicated because it cannot be an
+-- operand of many instructions). So we just widen operands to 16 bits
+-- and get the results from %al, %dl. This is not optimal, but a few
+-- register moves are probably not a huge deal when doing division.
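+--
+-- For example (an illustrative sketch): an unsigned W8 quotRem is
+-- rewritten by the first 'divOp' equation above into
+--
+--   divOp platform False W16 [res_q, res_r] Nothing
+--         (CmmMachOp (MO_UU_Conv W8 W16) [arg_x_low])
+--         (CmmMachOp (MO_UU_Conv W8 W16) [arg_y])
+--
+-- so the quotient and remainder come out of %ax and %dx (whose low
+-- bytes hold the W8 results) and %ah is never touched.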
+
+genCCall32' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall32' dflags target dest_regs args = do
+ let
+ prom_args = map (maybePromoteCArg dflags W32) args
+
+ -- Align stack to 16n for calls, assuming a starting stack
+        -- alignment of 16n - word_size on procedure entry, which we
+        -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ sizes = map (arg_size_bytes . cmmExprType dflags) (reverse args)
+ raw_arg_size = sum sizes + wORD_SIZE dflags
+ arg_pad_size = (roundTo 16 $ raw_arg_size) - raw_arg_size
+ tot_arg_size = raw_arg_size + arg_pad_size - wORD_SIZE dflags
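+        -- Worked example (illustrative): with two 4-byte arguments,
+        -- raw_arg_size = 8 + 4 = 12 (the 4 being the return address
+        -- already on the stack), arg_pad_size = 16 - 12 = 4 and
+        -- tot_arg_size = 12; pushing 4 bytes of padding plus 8 bytes
+        -- of arguments leaves %esp 16-byte aligned at the call site.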
+ delta0 <- getDeltaNat
+ setDeltaNat (delta0 - arg_pad_size)
+
+ push_codes <- mapM push_arg (reverse prom_args)
+ delta <- getDeltaNat
+ MASSERT(delta == delta0 - tot_arg_size)
+
+ -- deal with static vs dynamic call targets
+ (callinsns,cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) []), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do { (dyn_r, dyn_c) <- getSomeReg expr
+ ; ASSERT( isWord32 (cmmExprType dflags expr) )
+ return (dyn_c `snocOL` CALL (Right dyn_r) [], conv) }
+ PrimTarget _
+            -> panic $ "genCCall: Can't handle PrimTarget call type here; "
+                       ++ "this is probably because there are too many return values."
+
+ let push_code
+ | arg_pad_size /= 0
+ = toOL [SUB II32 (OpImm (ImmInt arg_pad_size)) (OpReg esp),
+ DELTA (delta0 - arg_pad_size)]
+ `appOL` concatOL push_codes
+ | otherwise
+ = concatOL push_codes
+
+ -- Deallocate parameters after call for ccall;
+ -- but not for stdcall (callee does it)
+ --
+ -- We have to pop any stack padding we added
+ -- even if we are doing stdcall, though (#5052)
+ pop_size
+ | ForeignConvention StdCallConv _ _ _ <- cconv = arg_pad_size
+ | otherwise = tot_arg_size
+
+ call = callinsns `appOL`
+ toOL (
+ (if pop_size==0 then [] else
+ [ADD II32 (OpImm (ImmInt pop_size)) (OpReg esp)])
+ ++
+ [DELTA delta0]
+ )
+ setDeltaNat delta0
+
+ dflags <- getDynFlags
+ let platform = targetPlatform dflags
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest]
+ | isFloatType ty =
+ -- we assume SSE2
+ let tmp_amode = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ fmt = floatFormat w
+ in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA (delta0 - b),
+ X87Store fmt tmp_amode,
+                    -- X87Store is only supported for the CDECL ABI
+                    -- NB: This code will need to be
+                    -- revisited once GHC does more work around
+                    -- SIGFPE handling
+ MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
+ ADD II32 (OpImm (ImmInt b)) (OpReg esp),
+ DELTA delta0]
+ | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
+ MOV II32 (OpReg edx) (OpReg r_dest_hi)]
+ | otherwise = unitOL (MOV (intFormat w)
+ (OpReg eax)
+ (OpReg r_dest))
+ where
+ ty = localRegType dest
+ w = typeWidth ty
+ b = widthInBytes w
+ r_dest_hi = getHiVRegFromLo r_dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
+
+ return (push_code `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where
+ -- If the size is smaller than the word, we widen things (see maybePromoteCArg)
+ arg_size_bytes :: CmmType -> Int
+ arg_size_bytes ty = max (widthInBytes (typeWidth ty)) (widthInBytes (wordWidth dflags))
+
+ roundTo a x | x `mod` a == 0 = x
+ | otherwise = x + a - (x `mod` a)
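+        -- e.g. roundTo 16 12 == 16, roundTo 16 16 == 16, roundTo 16 20 == 32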
+
+ push_arg :: CmmActual {-current argument-}
+ -> NatM InstrBlock -- code
+
+ push_arg arg -- we don't need the hints on x86
+ | isWord64 arg_ty = do
+ ChildCode64 code r_lo <- iselExpr64 arg
+ delta <- getDeltaNat
+ setDeltaNat (delta - 8)
+ let r_hi = getHiVRegFromLo r_lo
+ return ( code `appOL`
+ toOL [PUSH II32 (OpReg r_hi), DELTA (delta - 4),
+ PUSH II32 (OpReg r_lo), DELTA (delta - 8),
+ DELTA (delta-8)]
+ )
+
+ | isFloatType arg_ty = do
+ (reg, code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `appOL`
+ toOL [SUB II32 (OpImm (ImmInt size)) (OpReg esp),
+ DELTA (delta-size),
+ let addr = AddrBaseIndex (EABaseReg esp)
+ EAIndexNone
+ (ImmInt 0)
+ format = floatFormat (typeWidth arg_ty)
+ in
+
+ -- assume SSE2
+ MOV format (OpReg reg) (OpAddr addr)
+
+ ]
+ )
+
+ | otherwise = do
+ -- Arguments can be smaller than 32-bit, but we still use @PUSH
+ -- II32@ - the usual calling conventions expect integers to be
+ -- 4-byte aligned.
+ ASSERT((typeWidth arg_ty) <= W32) return ()
+ (operand, code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-size)
+ return (code `snocOL`
+ PUSH II32 operand `snocOL`
+ DELTA (delta-size))
+
+ where
+ arg_ty = cmmExprType dflags arg
+ size = arg_size_bytes arg_ty -- Byte size
+
+genCCall64' :: DynFlags
+ -> ForeignTarget -- function to call
+ -> [CmmFormal] -- where to put the result
+ -> [CmmActual] -- arguments (of mixed type)
+ -> NatM InstrBlock
+genCCall64' dflags target dest_regs args = do
+ -- load up the register arguments
+ let prom_args = map (maybePromoteCArg dflags W32) args
+
+ (stack_args, int_regs_used, fp_regs_used, load_args_code, assign_args_code)
+ <-
+ if platformOS platform == OSMinGW32
+ then load_args_win prom_args [] [] (allArgRegs platform) nilOL
+ else do
+ (stack_args, aregs, fregs, load_args_code, assign_args_code)
+ <- load_args prom_args (allIntArgRegs platform)
+ (allFPArgRegs platform)
+ nilOL nilOL
+ let used_regs rs as = reverse (drop (length rs) (reverse as))
+ fregs_used = used_regs fregs (allFPArgRegs platform)
+ aregs_used = used_regs aregs (allIntArgRegs platform)
+ return (stack_args, aregs_used, fregs_used, load_args_code
+ , assign_args_code)
+
+ let
+ arg_regs_used = int_regs_used ++ fp_regs_used
+ arg_regs = [eax] ++ arg_regs_used
+        -- for annotating the call instruction
+ sse_regs = length fp_regs_used
+ arg_stack_slots = if platformOS platform == OSMinGW32
+ then length stack_args + length (allArgRegs platform)
+ else length stack_args
+ tot_arg_size = arg_size * arg_stack_slots
+
+
+ -- Align stack to 16n for calls, assuming a starting stack
+ -- alignment of 16n - word_size on procedure entry. Which we
+ -- maintain. See Note [rts/StgCRun.c : Stack Alignment on X86]
+ (real_size, adjust_rsp) <-
+ if (tot_arg_size + wORD_SIZE dflags) `rem` 16 == 0
+ then return (tot_arg_size, nilOL)
+ else do -- we need to adjust...
+ delta <- getDeltaNat
+ setDeltaNat (delta - wORD_SIZE dflags)
+ return (tot_arg_size + wORD_SIZE dflags, toOL [
+ SUB II64 (OpImm (ImmInt (wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - wORD_SIZE dflags) ])
+
+ -- push the stack args, right to left
+ push_code <- push_args (reverse stack_args) nilOL
+ -- On Win64, we also have to leave stack space for the arguments
+ -- that we are passing in registers
+ lss_code <- if platformOS platform == OSMinGW32
+ then leaveStackSpace (length (allArgRegs platform))
+ else return nilOL
+ delta <- getDeltaNat
+
+ -- deal with static vs dynamic call targets
+ (callinsns,_cconv) <-
+ case target of
+ ForeignTarget (CmmLit (CmmLabel lbl)) conv
+ -> -- ToDo: stdcall arg sizes
+ return (unitOL (CALL (Left fn_imm) arg_regs), conv)
+ where fn_imm = ImmCLbl lbl
+ ForeignTarget expr conv
+ -> do (dyn_r, dyn_c) <- getSomeReg expr
+ return (dyn_c `snocOL` CALL (Right dyn_r) arg_regs, conv)
+ PrimTarget _
+           -> panic $ "genCCall: Can't handle PrimTarget call type here; "
+                      ++ "this is probably because there are too many return values."
+
+ let
+ -- The x86_64 ABI requires us to set %al to the number of SSE2
+ -- registers that contain arguments, if the called routine
+ -- is a varargs function. We don't know whether it's a
+ -- varargs function or not, so we have to assume it is.
+ --
+ -- It's not safe to omit this assignment, even if the number
+ -- of SSE2 regs in use is zero. If %al is larger than 8
+ -- on entry to a varargs function, seg faults ensue.
+ assign_eax n = unitOL (MOV II32 (OpImm (ImmInt n)) (OpReg eax))
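+        -- For example (illustrative): a call that passes one double in
+        -- %xmm0 to a possibly-varargs function is preceded by
+        --   movl $1,%eax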
+
+ let call = callinsns `appOL`
+ toOL (
+ -- Deallocate parameters after call for ccall;
+ -- stdcall has callee do it, but is not supported on
+ -- x86_64 target (see #3336)
+ (if real_size==0 then [] else
+ [ADD (intFormat (wordWidth dflags)) (OpImm (ImmInt real_size)) (OpReg esp)])
+ ++
+ [DELTA (delta + real_size)]
+ )
+ setDeltaNat (delta + real_size)
+
+ let
+ -- assign the results, if necessary
+ assign_code [] = nilOL
+ assign_code [dest] =
+ case typeWidth rep of
+ W32 | isFloatType rep -> unitOL (MOV (floatFormat W32)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ W64 | isFloatType rep -> unitOL (MOV (floatFormat W64)
+ (OpReg xmm0)
+ (OpReg r_dest))
+ _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
+ where
+ rep = localRegType dest
+ r_dest = getRegisterReg platform (CmmLocal dest)
+ assign_code _many = panic "genCCall.assign_code many"
+
+ return (adjust_rsp `appOL`
+ push_code `appOL`
+ load_args_code `appOL`
+ assign_args_code `appOL`
+ lss_code `appOL`
+ assign_eax sse_regs `appOL`
+ call `appOL`
+ assign_code dest_regs)
+
+ where platform = targetPlatform dflags
+        arg_size = 8 -- always, at the moment
+
+
+ load_args :: [CmmExpr]
+ -> [Reg] -- int regs avail for args
+ -> [Reg] -- FP regs avail for args
+ -> InstrBlock -- code computing args
+ -> InstrBlock -- code assigning args to ABI regs
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ -- no more regs to use
+ load_args args [] [] code acode =
+ return (args, [], [], code, acode)
+
+ -- no more args to push
+ load_args [] aregs fregs code acode =
+ return ([], aregs, fregs, code, acode)
+
+ load_args (arg : rest) aregs fregs code acode
+ | isFloatType arg_rep = case fregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest aregs rs code' acode'
+ | otherwise = case aregs of
+ [] -> push_this_arg
+ (r:rs) -> do
+ (code',acode') <- reg_this_arg r
+ load_args rest rs fregs code' acode'
+ where
+
+ -- put arg into the list of stack pushed args
+ push_this_arg = do
+ (args',ars,frs,code',acode')
+ <- load_args rest aregs fregs code acode
+ return (arg:args', ars, frs, code', acode')
+
+ -- pass the arg into the given register
+ reg_this_arg r
+ -- "operand" args can be directly assigned into r
+ | isOperand False arg = do
+ arg_code <- getAnyReg arg
+ return (code, (acode `appOL` arg_code r))
+ -- The last non-operand arg can be directly assigned after its
+ -- computation without going into a temporary register
+ | all (isOperand False) rest = do
+ arg_code <- getAnyReg arg
+ return (code `appOL` arg_code r,acode)
+
+ -- other args need to be computed beforehand to avoid clobbering
+ -- previously assigned registers used to pass parameters (see
+ -- #11792, #12614). They are assigned into temporary registers
+ -- and get assigned to proper call ABI registers after they all
+ -- have been computed.
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ tmp <- getNewRegNat arg_fmt
+ let
+ code' = code `appOL` arg_code tmp
+ acode' = acode `snocOL` reg2reg arg_fmt tmp r
+ return (code',acode')
+
+ arg_rep = cmmExprType dflags arg
+ arg_fmt = cmmTypeFormat arg_rep
+
+ load_args_win :: [CmmExpr]
+ -> [Reg] -- used int regs
+ -> [Reg] -- used FP regs
+ -> [(Reg, Reg)] -- (int, FP) regs avail for args
+ -> InstrBlock
+ -> NatM ([CmmExpr],[Reg],[Reg],InstrBlock,InstrBlock)
+ load_args_win args usedInt usedFP [] code
+ = return (args, usedInt, usedFP, code, nilOL)
+ -- no more regs to use
+ load_args_win [] usedInt usedFP _ code
+ = return ([], usedInt, usedFP, code, nilOL)
+ -- no more args to push
+ load_args_win (arg : rest) usedInt usedFP
+ ((ireg, freg) : regs) code
+ | isFloatType arg_rep = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) (freg : usedFP) regs
+ (code `appOL`
+ arg_code freg `snocOL`
+ -- If we are calling a varargs function
+ -- then we need to define ireg as well
+ -- as freg
+ MOV II64 (OpReg freg) (OpReg ireg))
+ | otherwise = do
+ arg_code <- getAnyReg arg
+ load_args_win rest (ireg : usedInt) usedFP regs
+ (code `appOL` arg_code ireg)
+ where
+ arg_rep = cmmExprType dflags arg
+
+ push_args [] code = return code
+ push_args (arg:rest) code
+ | isFloatType arg_rep = do
+ (arg_reg, arg_code) <- getSomeReg arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ SUB (intFormat (wordWidth dflags)) (OpImm (ImmInt arg_size)) (OpReg rsp),
+ DELTA (delta-arg_size),
+ MOV (floatFormat width) (OpReg arg_reg) (OpAddr (spRel dflags 0))]
+ push_args rest code'
+
+ | otherwise = do
+ -- Arguments can be smaller than 64-bit, but we still use @PUSH
+ -- II64@ - the usual calling conventions expect integers to be
+ -- 8-byte aligned.
+ ASSERT(width <= W64) return ()
+ (arg_op, arg_code) <- getOperand arg
+ delta <- getDeltaNat
+ setDeltaNat (delta-arg_size)
+ let code' = code `appOL` arg_code `appOL` toOL [
+ PUSH II64 arg_op,
+ DELTA (delta-arg_size)]
+ push_args rest code'
+ where
+ arg_rep = cmmExprType dflags arg
+ width = typeWidth arg_rep
+
+ leaveStackSpace n = do
+ delta <- getDeltaNat
+ setDeltaNat (delta - n * arg_size)
+ return $ toOL [
+ SUB II64 (OpImm (ImmInt (n * wORD_SIZE dflags))) (OpReg rsp),
+ DELTA (delta - n * arg_size)]
+
+maybePromoteCArg :: DynFlags -> Width -> CmmExpr -> CmmExpr
+maybePromoteCArg dflags wto arg
+ | wfrom < wto = CmmMachOp (MO_UU_Conv wfrom wto) [arg]
+ | otherwise = arg
+ where
+ wfrom = cmmExprWidth dflags arg
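+-- For example (illustrative): an 8-bit argument to a C call becomes
+--
+--   CmmMachOp (MO_UU_Conv W8 W32) [arg]
+--
+-- so it occupies a full 32-bit slot, mirroring C's integer promotion
+-- of small arguments.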
+
+outOfLineCmmOp :: BlockId -> CallishMachOp -> Maybe CmmFormal -> [CmmActual]
+ -> NatM InstrBlock
+outOfLineCmmOp bid mop res args
+ = do
+ dflags <- getDynFlags
+ targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
+ let target = ForeignTarget targetExpr
+ (ForeignConvention CCallConv [] [] CmmMayReturn)
+
+    -- We know foreign calls result in no new basic blocks, so we can ignore
+ -- the returned block id.
+ (instrs, _) <- stmtToInstrs bid (CmmUnsafeForeignCall target (catMaybes [res]) args)
+ return instrs
+ where
+ -- Assume we can call these functions directly, and that they're not in a dynamic library.
+ -- TODO: Why is this ok? Under linux this code will be in libm.so
+ -- Is it because they're really implemented as a primitive instruction by the assembler?? -- BL 2009/12/31
+ lbl = mkForeignLabel fn Nothing ForeignLabelInThisPackage IsFunction
+
+ fn = case mop of
+ MO_F32_Sqrt -> fsLit "sqrtf"
+ MO_F32_Fabs -> fsLit "fabsf"
+ MO_F32_Sin -> fsLit "sinf"
+ MO_F32_Cos -> fsLit "cosf"
+ MO_F32_Tan -> fsLit "tanf"
+ MO_F32_Exp -> fsLit "expf"
+ MO_F32_ExpM1 -> fsLit "expm1f"
+ MO_F32_Log -> fsLit "logf"
+ MO_F32_Log1P -> fsLit "log1pf"
+
+ MO_F32_Asin -> fsLit "asinf"
+ MO_F32_Acos -> fsLit "acosf"
+ MO_F32_Atan -> fsLit "atanf"
+
+ MO_F32_Sinh -> fsLit "sinhf"
+ MO_F32_Cosh -> fsLit "coshf"
+ MO_F32_Tanh -> fsLit "tanhf"
+ MO_F32_Pwr -> fsLit "powf"
+
+ MO_F32_Asinh -> fsLit "asinhf"
+ MO_F32_Acosh -> fsLit "acoshf"
+ MO_F32_Atanh -> fsLit "atanhf"
+
+ MO_F64_Sqrt -> fsLit "sqrt"
+ MO_F64_Fabs -> fsLit "fabs"
+ MO_F64_Sin -> fsLit "sin"
+ MO_F64_Cos -> fsLit "cos"
+ MO_F64_Tan -> fsLit "tan"
+ MO_F64_Exp -> fsLit "exp"
+ MO_F64_ExpM1 -> fsLit "expm1"
+ MO_F64_Log -> fsLit "log"
+ MO_F64_Log1P -> fsLit "log1p"
+
+ MO_F64_Asin -> fsLit "asin"
+ MO_F64_Acos -> fsLit "acos"
+ MO_F64_Atan -> fsLit "atan"
+
+ MO_F64_Sinh -> fsLit "sinh"
+ MO_F64_Cosh -> fsLit "cosh"
+ MO_F64_Tanh -> fsLit "tanh"
+ MO_F64_Pwr -> fsLit "pow"
+
+ MO_F64_Asinh -> fsLit "asinh"
+ MO_F64_Acosh -> fsLit "acosh"
+ MO_F64_Atanh -> fsLit "atanh"
+
+ MO_Memcpy _ -> fsLit "memcpy"
+ MO_Memset _ -> fsLit "memset"
+ MO_Memmove _ -> fsLit "memmove"
+ MO_Memcmp _ -> fsLit "memcmp"
+
+ MO_PopCnt _ -> fsLit "popcnt"
+ MO_BSwap _ -> fsLit "bswap"
+ {- Here the C implementation is used as there is no x86
+ instruction to reverse a word's bit order.
+ -}
+ MO_BRev w -> fsLit $ bRevLabel w
+ MO_Clz w -> fsLit $ clzLabel w
+ MO_Ctz _ -> unsupported
+
+ MO_Pdep w -> fsLit $ pdepLabel w
+ MO_Pext w -> fsLit $ pextLabel w
+
+ MO_AtomicRMW _ _ -> fsLit "atomicrmw"
+ MO_AtomicRead _ -> fsLit "atomicread"
+ MO_AtomicWrite _ -> fsLit "atomicwrite"
+ MO_Cmpxchg _ -> fsLit "cmpxchg"
+
+ MO_UF_Conv _ -> unsupported
+
+ MO_S_Mul2 {} -> unsupported
+ MO_S_QuotRem {} -> unsupported
+ MO_U_QuotRem {} -> unsupported
+ MO_U_QuotRem2 {} -> unsupported
+ MO_Add2 {} -> unsupported
+ MO_AddIntC {} -> unsupported
+ MO_SubIntC {} -> unsupported
+ MO_AddWordC {} -> unsupported
+ MO_SubWordC {} -> unsupported
+ MO_U_Mul2 {} -> unsupported
+ MO_ReadBarrier -> unsupported
+ MO_WriteBarrier -> unsupported
+ MO_Touch -> unsupported
+ (MO_Prefetch_Data _ ) -> unsupported
+ unsupported = panic ("outOfLineCmmOp: " ++ show mop
+ ++ " not supported here")
+
+-- -----------------------------------------------------------------------------
+-- Generating a table-branch
+
+genSwitch :: DynFlags -> CmmExpr -> SwitchTargets -> NatM InstrBlock
+
+genSwitch dflags expr targets
+ | positionIndependent dflags
+ = do
+ (reg,e_code) <- getNonClobberedReg (cmmOffset dflags expr offset)
+ -- getNonClobberedReg because it needs to survive across t_code
+ lbl <- getNewLabelNat
+ dflags <- getDynFlags
+ let is32bit = target32Bit (targetPlatform dflags)
+ os = platformOS (targetPlatform dflags)
+ -- Might want to use .rodata.<function we're in> instead, but as
+ -- long as it's something unique it'll work out since the
+ -- references to the jump table are in the appropriate section.
+ rosection = case os of
+ -- on Mac OS X/x86_64, put the jump table in the text section to
+ -- work around a limitation of the linker.
+ -- ld64 is unable to handle the relocations for
+ -- .quad L1 - L0
+ -- if L0 is not preceded by a non-anonymous label in its section.
+ OSDarwin | not is32bit -> Section Text lbl
+ _ -> Section ReadOnlyData lbl
+ dynRef <- cmmMakeDynamicReference dflags DataReference lbl
+ (tableReg,t_code) <- getSomeReg $ dynRef
+ let op = OpAddr (AddrBaseIndex (EABaseReg tableReg)
+ (EAIndex reg (wORD_SIZE dflags)) (ImmInt 0))
+
+ offsetReg <- getNewRegNat (intFormat (wordWidth dflags))
+ return $ if is32bit || os == OSDarwin
+ then e_code `appOL` t_code `appOL` toOL [
+ ADD (intFormat (wordWidth dflags)) op (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ else -- HACK: On x86_64 binutils<2.17 is only able to generate
+ -- PC32 relocations, hence we only get 32-bit offsets in
+ -- the jump table. As these offsets are always negative
+ -- we need to properly sign extend them to 64-bit. This
+ -- hack should be removed in conjunction with the hack in
+ -- PprMach.hs/pprDataItem once binutils 2.17 is standard.
+ e_code `appOL` t_code `appOL` toOL [
+ MOVSxL II32 op (OpReg offsetReg),
+ ADD (intFormat (wordWidth dflags))
+ (OpReg offsetReg)
+ (OpReg tableReg),
+ JMP_TBL (OpReg tableReg) ids rosection lbl
+ ]
+ | otherwise
+ = do
+ (reg,e_code) <- getSomeReg (cmmOffset dflags expr offset)
+ lbl <- getNewLabelNat
+ let op = OpAddr (AddrBaseIndex EABaseNone (EAIndex reg (wORD_SIZE dflags)) (ImmCLbl lbl))
+ code = e_code `appOL` toOL [
+ JMP_TBL op ids (Section ReadOnlyData lbl) lbl
+ ]
+ return code
+ where
+ (offset, blockIds) = switchTargetsToTable targets
+ ids = map (fmap DestBlockId) blockIds
+
+generateJumpTableForInstr :: DynFlags -> Instr -> Maybe (NatCmmDecl (Alignment, RawCmmStatics) Instr)
+generateJumpTableForInstr dflags (JMP_TBL _ ids section lbl)
+ = let getBlockId (DestBlockId id) = id
+ getBlockId _ = panic "Non-Label target in Jump Table"
+ blockIds = map (fmap getBlockId) ids
+ in Just (createJumpTable dflags blockIds section lbl)
+generateJumpTableForInstr _ _ = Nothing
+
+createJumpTable :: DynFlags -> [Maybe BlockId] -> Section -> CLabel
+ -> GenCmmDecl (Alignment, RawCmmStatics) h g
+createJumpTable dflags ids section lbl
+ = let jumpTable
+ | positionIndependent dflags =
+ let ww = wordWidth dflags
+ jumpTableEntryRel Nothing
+ = CmmStaticLit (CmmInt 0 ww)
+ jumpTableEntryRel (Just blockid)
+ = CmmStaticLit (CmmLabelDiffOff blockLabel lbl 0 ww)
+ where blockLabel = blockLbl blockid
+ in map jumpTableEntryRel ids
+ | otherwise = map (jumpTableEntry dflags) ids
+ in CmmData section (mkAlignment 1, RawCmmStatics lbl jumpTable)
+
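+-- For example (an illustrative sketch): a position-independent table
+-- for the targets [Just b1, Nothing, Just b2] is emitted roughly as
+--
+--   .quad _b1 - _tbl
+--   .quad 0
+--   .quad _b2 - _tbl
+--
+-- where _tbl is the label of the table itself and _b1, _b2 are
+-- hypothetical block labels.
+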
+extractUnwindPoints :: [Instr] -> [UnwindPoint]
+extractUnwindPoints instrs =
+ [ UnwindPoint lbl unwinds | UNWIND lbl unwinds <- instrs]
+
+-- -----------------------------------------------------------------------------
+-- 'condIntReg' and 'condFltReg': condition codes into registers
+
+-- Turn those condition codes into integers now (when they appear on
+-- the right hand side of an assignment).
+--
+-- (If applicable) Do not fill the delay slots here; you will confuse the
+-- register allocator.
+
+condIntReg :: Cond -> CmmExpr -> CmmExpr -> NatM Register
+
+condIntReg cond x y = do
+ CondCode _ cond cond_code <- condIntCode cond x y
+ tmp <- getNewRegNat II8
+ let
+ code dst = cond_code `appOL` toOL [
+ SETCC cond (OpReg tmp),
+ MOVZxL II8 (OpReg tmp) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-----------------------------------------------------------
+--- Note [SSE Parity Checks] ---
+-----------------------------------------------------------
+
+-- We have to worry about unordered operands (eg. comparisons
+-- against NaN). If the operands are unordered, the comparison
+-- sets the parity flag, carry flag and zero flag.
+-- All comparisons are supposed to return false for unordered
+-- operands except for !=, which returns true.
+--
+-- Optimisation: we don't have to test the parity flag if we
+-- know the test has already excluded the unordered case: eg >
+-- and >= test for a zero carry flag, which can only occur for
+-- ordered operands.
+--
+-- By reversing comparisons we can avoid testing the parity
+-- for < and <= as well. If any of the arguments is a NaN we
+-- return false either way. If both arguments are valid then
+-- x <= y <-> y >= x holds. So it's safe to swap these.
+--
+-- We invert the condition inside getRegister' and getCondCode,
+-- which should cover all invertible cases.
+-- All other functions translating FP comparisons to assembly
+-- use these two to generate the comparison code.
+--
+-- As an example consider a simple check:
+--
+-- func :: Float -> Float -> Int
+-- func x y = if x < y then 1 else 0
+--
+-- In Cmm this gives the following floating point comparison:
+--
+-- if (%MO_F_Lt_W32(F1, F2)) goto c2gg; else goto c2gf;
+--
+-- We used to compile this to an assembly code block like this:
+-- _c2gh:
+-- ucomiss %xmm2,%xmm1
+-- jp _c2gf
+-- jb _c2gg
+-- jmp _c2gf
+--
+-- where we have to introduce an explicit
+-- check for unordered results (jp, jump on parity).
+--
+-- We can avoid this by exchanging the arguments and inverting the direction
+-- of the comparison. This results in the sequence:
+--
+-- ucomiss %xmm1,%xmm2
+-- ja _c2g2
+-- jmp _c2g1
+--
+-- Removing the jump reduces the pressure on the branch prediction system
+-- and plays better with the uOP cache.
+
+condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
+condFltReg is32Bit cond x y = condFltReg_sse2
+ where
+
+
+ condFltReg_sse2 = do
+ CondCode _ cond cond_code <- condFltCode cond x y
+ tmp1 <- getNewRegNat (archWordFormat is32Bit)
+ tmp2 <- getNewRegNat (archWordFormat is32Bit)
+ let -- See Note [SSE Parity Checks]
+ code dst =
+ cond_code `appOL`
+ (case cond of
+ NE -> or_unordered dst
+ GU -> plain_test dst
+ GEU -> plain_test dst
+ -- Use ASSERT so we don't break releases if these creep in.
+ LTT -> ASSERT2(False, ppr "Should have been turned into >")
+ and_ordered dst
+ LE -> ASSERT2(False, ppr "Should have been turned into >=")
+ and_ordered dst
+ _ -> and_ordered dst)
+
+ plain_test dst = toOL [
+ SETCC cond (OpReg tmp1),
+ MOVZxL II8 (OpReg tmp1) (OpReg dst)
+ ]
+ or_unordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC PARITY (OpReg tmp2),
+ OR II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ and_ordered dst = toOL [
+ SETCC cond (OpReg tmp1),
+ SETCC NOTPARITY (OpReg tmp2),
+ AND II8 (OpReg tmp1) (OpReg tmp2),
+ MOVZxL II8 (OpReg tmp2) (OpReg dst)
+ ]
+ return (Any II32 code)
+
+
+-- -----------------------------------------------------------------------------
+-- 'trivial*Code': deal with trivial instructions
+
+-- Trivial (dyadic: 'trivialCode', floating-point: 'trivialFCode',
+-- unary: 'trivialUCode', unary fl-pt: 'trivialUFCode') instructions.
+-- Only look for constants on the right hand side, because that's
+-- where the generic optimizer will have put them.
+
+-- Similarly, for unary instructions, we don't have to worry about
+-- matching an StInt as the argument, because genericOpt will already
+-- have handled the constant-folding.
+
+
+{-
+The Rules of the Game are:
+
+* You cannot assume anything about the destination register dst;
+ it may be anything, including a fixed reg.
+
+* You may compute an operand into a fixed reg, but you may not
+ subsequently change the contents of that fixed reg. If you
+ want to do so, first copy the value either to a temporary
+ or into dst. You are free to modify dst even if it happens
+ to be a fixed reg -- that's not your problem.
+
+* You cannot assume that a fixed reg will stay live over an
+ arbitrary computation. The same applies to the dst reg.
+
+* Temporary regs obtained from getNewRegNat are distinct from
+ each other and from all other regs, and stay live over
+ arbitrary computations.
+
+--------------------
+
+SDM's version of The Rules:
+
+* If getRegister returns Any, that means it can generate correct
+ code which places the result in any register, period. Even if that
+ register happens to be read during the computation.
+
+ Corollary #1: this means that if you are generating code for an
+ operation with two arbitrary operands, you cannot assign the result
+ of the first operand into the destination register before computing
+ the second operand. The second operand might require the old value
+ of the destination register.
+
+ Corollary #2: A function might be able to generate more efficient
+ code if it knows the destination register is a new temporary (and
+ therefore not read by any of the sub-computations).
+
+* If getRegister returns Any, then the code it generates may modify only:
+ (a) fresh temporaries
+ (b) the destination register
+ (c) known registers (eg. %ecx is used by shifts)
+ In particular, it may *not* modify global registers, unless the global
+ register happens to be the destination register.
+-}
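+
+{-
+An illustrative sketch (not code from this module) of Corollary #1: when
+generating @a - b@ into an 'Any' destination @dst@, the order
+
+    code dst = a_code dst                        -- clobbers dst ...
+               `appOL` b_code                    -- ... which b may still read
+               `snocOL` SUB fmt b_op (OpReg dst)
+
+is WRONG. 'genTrivialCode' below therefore computes @b@ first, and saves
+it in a fresh temporary when it clashes with @dst@.
+-}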
+
+trivialCode :: Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode width instr m a b
+ = do is32Bit <- is32BitPlatform
+ trivialCode' is32Bit width instr m a b
+
+trivialCode' :: Bool -> Width -> (Operand -> Operand -> Instr)
+ -> Maybe (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialCode' is32Bit width _ (Just revinstr) (CmmLit lit_a) b
+ | is32BitLit is32Bit lit_a = do
+ b_code <- getAnyReg b
+ let
+ code dst
+ = b_code dst `snocOL`
+ revinstr (OpImm (litToImm lit_a)) (OpReg dst)
+ return (Any (intFormat width) code)
+
+trivialCode' _ width instr _ a b
+ = genTrivialCode (intFormat width) instr a b
+
+-- This is re-used for floating pt instructions too.
+genTrivialCode :: Format -> (Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+genTrivialCode rep instr a b = do
+ (b_op, b_code) <- getNonClobberedOperand b
+ a_code <- getAnyReg a
+ tmp <- getNewRegNat rep
+ let
+ -- We want the value of b to stay alive across the computation of a.
+ -- But, we want to calculate a straight into the destination register,
+ -- because the instruction only has two operands (dst := dst `op` src).
+ -- The troublesome case is when the result of b is in the same register
+ -- as the destination reg. In this case, we have to save b in a
+ -- new temporary across the computation of a.
+ code dst
+ | dst `regClashesWithOp` b_op =
+ b_code `appOL`
+ unitOL (MOV rep b_op (OpReg tmp)) `appOL`
+ a_code dst `snocOL`
+ instr (OpReg tmp) (OpReg dst)
+ | otherwise =
+ b_code `appOL`
+ a_code dst `snocOL`
+ instr b_op (OpReg dst)
+ return (Any rep code)
+
+regClashesWithOp :: Reg -> Operand -> Bool
+reg `regClashesWithOp` OpReg reg2 = reg == reg2
+reg `regClashesWithOp` OpAddr amode = any (==reg) (addrModeRegs amode)
+_ `regClashesWithOp` _ = False
+
+-----------
+
+trivialUCode :: Format -> (Operand -> Instr)
+ -> CmmExpr -> NatM Register
+trivialUCode rep instr x = do
+ x_code <- getAnyReg x
+ let
+ code dst =
+ x_code dst `snocOL`
+ instr (OpReg dst)
+ return (Any rep code)
+
+-----------
+
+
+trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
+ -> CmmExpr -> CmmExpr -> NatM Register
+trivialFCode_sse2 pk instr x y
+ = genTrivialCode format (instr format) x y
+ where format = floatFormat pk
+
+
+trivialUFCode :: Format -> (Reg -> Reg -> Instr) -> CmmExpr -> NatM Register
+trivialUFCode format instr x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ code dst =
+ x_code `snocOL`
+ instr x_reg dst
+ return (Any format code)
+
+
+--------------------------------------------------------------------------------
+coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
+coerceInt2FP from to x = coerce_sse2
+ where
+
+ coerce_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case to of W32 -> CVTSI2SS; W64 -> CVTSI2SD
+ n -> panic $ "coerceInt2FP.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat from) x_op dst
+ return (Any (floatFormat to) code)
+ -- works even if the destination rep is <II32
+
+--------------------------------------------------------------------------------
+coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
+coerceFP2Int from to x = coerceFP2Int_sse2
+ where
+ coerceFP2Int_sse2 = do
+ (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
+ let
+ opc = case from of W32 -> CVTTSS2SIQ; W64 -> CVTTSD2SIQ;
+                 n -> panic $ "coerceFP2Int.sse: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc (intFormat to) x_op dst
+ return (Any (intFormat to) code)
+ -- works even if the destination rep is <II32
+
+
+--------------------------------------------------------------------------------
+coerceFP2FP :: Width -> CmmExpr -> NatM Register
+coerceFP2FP to x = do
+ (x_reg, x_code) <- getSomeReg x
+ let
+ opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
+ n -> panic $ "coerceFP2FP: unhandled width ("
+ ++ show n ++ ")"
+ code dst = x_code `snocOL` opc x_reg dst
+ return (Any ( floatFormat to) code)
+
+--------------------------------------------------------------------------------
+
+sse2NegCode :: Width -> CmmExpr -> NatM Register
+sse2NegCode w x = do
+ let fmt = floatFormat w
+ x_code <- getAnyReg x
+ -- This is how gcc does it, so it can't be that bad:
+ let
+ const = case fmt of
+ FF32 -> CmmInt 0x80000000 W32
+ FF64 -> CmmInt 0x8000000000000000 W64
+ x@II8 -> wrongFmt x
+ x@II16 -> wrongFmt x
+ x@II32 -> wrongFmt x
+ x@II64 -> wrongFmt x
+
+ where
+ wrongFmt x = panic $ "sse2NegCode: " ++ show x
+ Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
+ tmp <- getNewRegNat fmt
+ let
+ code dst = x_code dst `appOL` amode_code `appOL` toOL [
+ MOV fmt (OpAddr amode) (OpReg tmp),
+ XOR fmt (OpReg tmp) (OpReg dst)
+ ]
+ --
+ return (Any fmt code)
+
+isVecExpr :: CmmExpr -> Bool
+isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True
+isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True
+isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True
+isVecExpr (CmmMachOp _ [e]) = isVecExpr e
+isVecExpr _ = False
+
+needLlvm :: NatM a
+needLlvm =
+ sorry $ unlines ["The native code generator does not support vector"
+ ,"instructions. Please use -fllvm."]
+
+-- | This works on the invariant that all jumps in the given blocks are required.
+-- Starting from there we try to make a few more jumps redundant by reordering
+-- them.
+-- We depend on the information in the CFG to do so; without a given CFG
+-- we do nothing.
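+--
+-- For example (labels hypothetical), a block ending in
+--
+--   JXX GTT L2          -- conditional jump to the fall-through block
+--   JXX ALWAYS L3
+-- L2: ...
+--
+-- can, when L2 needs no info table, be rewritten to the single jump
+--
+--   JXX LE L3
+-- L2: ...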
+invertCondBranches :: Maybe CFG -- ^ CFG if present
+ -> LabelMap a -- ^ Blocks with info tables
+ -> [NatBasicBlock Instr] -- ^ List of basic blocks
+ -> [NatBasicBlock Instr]
+invertCondBranches Nothing _ bs = bs
+invertCondBranches (Just cfg) keep bs =
+ invert bs
+ where
+ invert :: [NatBasicBlock Instr] -> [NatBasicBlock Instr]
+ invert ((BasicBlock lbl1 ins@(_:_:_xs)):b2@(BasicBlock lbl2 _):bs)
+ | --pprTrace "Block" (ppr lbl1) True,
+ (jmp1,jmp2) <- last2 ins
+ , JXX cond1 target1 <- jmp1
+ , target1 == lbl2
+ --, pprTrace "CutChance" (ppr b1) True
+ , JXX ALWAYS target2 <- jmp2
+ -- We have enough information to check if we can perform the inversion
+         -- TODO: We could also check for the last asm instruction which sets
+         -- status flags instead, which I suspect is worse in terms of compiler
+         -- performance but might be applicable to more cases.
+ , Just edgeInfo1 <- getEdgeInfo lbl1 target1 cfg
+ , Just edgeInfo2 <- getEdgeInfo lbl1 target2 cfg
+ -- Both jumps come from the same cmm statement
+ , transitionSource edgeInfo1 == transitionSource edgeInfo2
+ , CmmSource {trans_cmmNode = cmmCondBranch} <- transitionSource edgeInfo1
+
+         -- Int comparisons are invertible
+ , CmmCondBranch (CmmMachOp op _args) _ _ _ <- cmmCondBranch
+ , Just _ <- maybeIntComparison op
+ , Just invCond <- maybeInvertCond cond1
+
+         -- Swap the last two jumps and invert the conditional jump's condition.
+ = let jumps =
+ case () of
+                -- We are free to eliminate the jmp, so we do so.
+ _ | not (mapMember target1 keep)
+ -> [JXX invCond target2]
+ -- If the conditional target is unlikely we put the other
+ -- target at the front.
+ | edgeWeight edgeInfo2 > edgeWeight edgeInfo1
+ -> [JXX invCond target2, JXX ALWAYS target1]
+ -- Keep things as-is otherwise
+ | otherwise
+ -> [jmp1, jmp2]
+ in --pprTrace "Cutable" (ppr [jmp1,jmp2] <+> text "=>" <+> ppr jumps) $
+ (BasicBlock lbl1
+ (dropTail 2 ins ++ jumps))
+ : invert (b2:bs)
+ invert (b:bs) = b : invert bs
+ invert [] = []
diff --git a/compiler/GHC/CmmToAsm/X86/Cond.hs b/compiler/GHC/CmmToAsm/X86/Cond.hs
new file mode 100644
index 0000000000..bb8f61438b
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Cond.hs
@@ -0,0 +1,109 @@
+module GHC.CmmToAsm.X86.Cond (
+ Cond(..),
+ condUnsigned,
+ condToSigned,
+ condToUnsigned,
+ maybeFlipCond,
+ maybeInvertCond
+)
+
+where
+
+import GhcPrelude
+
+data Cond
+ = ALWAYS -- What's really used? ToDo
+ | EQQ
+ | GE
+ | GEU
+ | GTT
+ | GU
+ | LE
+ | LEU
+ | LTT
+ | LU
+ | NE
+ | NEG
+ | POS
+ | CARRY
+ | OFLO
+ | PARITY
+ | NOTPARITY
+ deriving Eq
+
+condUnsigned :: Cond -> Bool
+condUnsigned GU = True
+condUnsigned LU = True
+condUnsigned GEU = True
+condUnsigned LEU = True
+condUnsigned _ = False
+
+
+condToSigned :: Cond -> Cond
+condToSigned GU = GTT
+condToSigned LU = LTT
+condToSigned GEU = GE
+condToSigned LEU = LE
+condToSigned x = x
+
+
+condToUnsigned :: Cond -> Cond
+condToUnsigned GTT = GU
+condToUnsigned LTT = LU
+condToUnsigned GE = GEU
+condToUnsigned LE = LEU
+condToUnsigned x = x
+
+-- | @maybeFlipCond c@ returns @Just c'@ if it is possible to flip the
+-- arguments to the conditional @c@, and the new condition should be @c'@.
+maybeFlipCond :: Cond -> Maybe Cond
+maybeFlipCond cond = case cond of
+ EQQ -> Just EQQ
+ NE -> Just NE
+ LU -> Just GU
+ GU -> Just LU
+ LEU -> Just GEU
+ GEU -> Just LEU
+ LTT -> Just GTT
+ GTT -> Just LTT
+ LE -> Just GE
+ GE -> Just LE
+ _other -> Nothing
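+
+-- For example, @x < y@ (LTT) holds exactly when @y > x@ (GTT) does, so
+-- @maybeFlipCond LTT == Just GTT@. This is what lets us swap comparison
+-- operands instead of testing the parity flag; see
+-- Note [SSE Parity Checks] in GHC.CmmToAsm.X86.CodeGen.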
+
+-- | If we apply @maybeInvertCond@ to the condition of a jump we turn
+-- jumps taken into jumps not taken and vice versa.
+--
+-- Careful! If the comparison used and the conditional jump
+-- don't match, the above behaviour will NOT hold.
+-- When used for FP comparisons this does not consider unordered
+-- numbers.
+-- Also inverting twice might return a synonym for the original condition.
+maybeInvertCond :: Cond -> Maybe Cond
+maybeInvertCond cond = case cond of
+ ALWAYS -> Nothing
+ EQQ -> Just NE
+ NE -> Just EQQ
+
+ NEG -> Just POS
+ POS -> Just NEG
+
+ GEU -> Just LU
+ LU -> Just GEU
+
+ GE -> Just LTT
+ LTT -> Just GE
+
+ GTT -> Just LE
+ LE -> Just GTT
+
+ GU -> Just LEU
+ LEU -> Just GU
+
+  -- GEU "==" NOTCARRY; they are synonyms
+  -- at the assembly level.
+ CARRY -> Just GEU
+
+ OFLO -> Nothing
+
+ PARITY -> Just NOTPARITY
+ NOTPARITY -> Just PARITY
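+
+-- For example, inverting twice need not give back the same constructor:
+-- @maybeInvertCond CARRY == Just GEU@ but @maybeInvertCond GEU == Just LU@,
+-- and LU ("unsigned less than", i.e. carry set) is merely the
+-- assembly-level synonym of CARRY.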
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
new file mode 100644
index 0000000000..4171806695
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -0,0 +1,1056 @@
+{-# LANGUAGE CPP, TypeFamilies #-}
+
+-----------------------------------------------------------------------------
+--
+-- Machine-dependent assembly language
+--
+-- (c) The University of Glasgow 1993-2004
+--
+-----------------------------------------------------------------------------
+
+module GHC.CmmToAsm.X86.Instr
+ ( Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..)
+ , getJumpDestBlockId, canShortcut, shortcutStatics
+ , shortcutJump, allocMoreStack
+ , maxSpillSlots, archWordFormat
+ )
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg.Class
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Reg.Target
+
+import GHC.Cmm.BlockId
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import GHC.Platform.Regs
+import GHC.Cmm
+import FastString
+import Outputable
+import GHC.Platform
+
+import BasicTypes (Alignment)
+import GHC.Cmm.CLabel
+import GHC.Driver.Session
+import UniqSet
+import Unique
+import UniqSupply
+import GHC.Cmm.DebugBlock (UnwindTable)
+
+import Control.Monad
+import Data.Maybe (fromMaybe)
+
+-- Format of a native machine word on x86 (II32) / x86_64 (II64).
+--
+archWordFormat :: Bool -> Format
+archWordFormat is32Bit
+ | is32Bit = II32
+ | otherwise = II64
+
+-- | Instruction instance for x86 instruction set.
+instance Instruction Instr where
+ regUsageOfInstr = x86_regUsageOfInstr
+ patchRegsOfInstr = x86_patchRegsOfInstr
+ isJumpishInstr = x86_isJumpishInstr
+ jumpDestsOfInstr = x86_jumpDestsOfInstr
+ patchJumpInstr = x86_patchJumpInstr
+ mkSpillInstr = x86_mkSpillInstr
+ mkLoadInstr = x86_mkLoadInstr
+ takeDeltaInstr = x86_takeDeltaInstr
+ isMetaInstr = x86_isMetaInstr
+ mkRegRegMoveInstr = x86_mkRegRegMoveInstr
+ takeRegRegMoveInstr = x86_takeRegRegMoveInstr
+ mkJumpInstr = x86_mkJumpInstr
+ mkStackAllocInstr = x86_mkStackAllocInstr
+ mkStackDeallocInstr = x86_mkStackDeallocInstr
+
+
+-- -----------------------------------------------------------------------------
+-- Intel x86 instructions
+
+{-
+Intel, in their infinite wisdom, selected a stack model for floating
+point registers on x86. That might have made sense back in 1979 --
+nowadays we can see it for the nonsense it really is. A stack model
+fits poorly with the existing nativeGen infrastructure, which assumes
+flat integer and FP register sets. Prior to this commit, nativeGen
+could not generate correct x86 FP code -- to do so would have meant
+somehow working the register-stack paradigm into the register
+allocator and spiller, which sounds very difficult.
+
+We have decided to cheat, and go for a simple fix which requires no
+infrastructure modifications, at the expense of generating ropey but
+correct FP code. All notions of the x86 FP stack and its insns have
+been removed. Instead, we pretend (to the instruction selector and
+register allocator) that x86 has six floating point registers, %fake0
+.. %fake5, which can be used in the usual flat manner. We further
+claim that x86 has floating point instructions very similar to SPARC
+and Alpha, that is, a simple 3-operand register-register arrangement.
+Code generation and register allocation proceed on this basis.
+
+When we come to print out the final assembly, our convenient fiction
+is converted to dismal reality. Each fake instruction is
+independently converted to a series of real x86 instructions.
+%fake0 .. %fake5 are mapped to %st(0) .. %st(5). To do reg-reg
+arithmetic operations, the two operands are pushed onto the top of the
+FP stack, the operation done, and the result copied back into the
+relevant register. There are only six %fake registers because 2 are
+needed for the translation, and x86 has 8 in total.
+
+The translation is inefficient but is simple and it works. A cleverer
+translation would handle a sequence of insns, simulating the FP stack
+contents, would not impose a fixed mapping from %fake to %st regs, and
+hopefully could avoid most of the redundant reg-reg moves of the
+current translation.
+
+We might as well make use of whatever unique FP facilities Intel have
+chosen to bless us with (let's not be churlish, after all).
+Hence GLDZ and GLD1. Bwahahahahahahaha!
+-}
+
+{-
+Note [x86 Floating point precision]
+
+Intel's internal floating point registers are by default 80 bit
+extended precision. This means that all operations done on values in
+registers are done at 80 bits, and unless the intermediate values are
+truncated to the appropriate size (32 or 64 bits) by storing in
+memory, calculations in registers will give different results from
+calculations which pass intermediate values in memory (eg. via
+function calls).
+
+One solution is to set the FPU into 64 bit precision mode. Some OSs
+do this (eg. FreeBSD) and some don't (eg. Linux). The problem here is
+that this will only affect 64-bit precision arithmetic; 32-bit
+calculations will still be done at 64-bit precision in registers. So
+it doesn't solve the whole problem.
+
+There's also the issue of what the C library is expecting in terms of
+precision. It seems to be the case that glibc on Linux expects the
+FPU to be set to 80 bit precision, so setting it to 64 bit could have
+unexpected effects. Changing the default could have undesirable
+effects on other 3rd-party library code too, so the right thing would
+be to save/restore the FPU control word across Haskell code if we were
+to do this.
+
+gcc's -ffloat-store gives consistent results by always storing the
+results of floating-point calculations in memory, which works for both
+32 and 64-bit precision. However, it only affects the values of
+user-declared floating point variables in C, not intermediate results.
+GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
+flag).
+
+Another problem is how to spill floating point registers in the
+register allocator. Should we spill the whole 80 bits, or just 64?
+On an OS which is set to 64 bit precision, spilling 64 is fine. On
+Linux, spilling 64 bits will round the results of some operations.
+This is what gcc does. Spilling at 80 bits requires taking up a full
+128 bit slot (so we get alignment). We spill at 80-bits and ignore
+the alignment problems.
+
+In the future [edit: now available in GHC 7.0.1, with the -msse2
+flag], we'll use the SSE registers for floating point. This requires
+a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
+float ops), which means P4 or Xeon and above. Using SSE will solve
+all these problems, because the SSE registers use fixed 32 bit or 64
+bit precision.
+
+--SDM 1/2003
+-}
+
+data Instr
+ -- comment pseudo-op
+ = COMMENT FastString
+
+ -- location pseudo-op (file, line, col, name)
+ | LOCATION Int Int Int String
+
+ -- some static data spat out during code
+ -- generation. Will be extracted before
+ -- pretty-printing.
+ | LDATA Section (Alignment, RawCmmStatics)
+
+ -- start a new basic block. Useful during
+ -- codegen, removed later. Preceding
+ -- instruction should be a jump, as per the
+ -- invariants for a BasicBlock (see Cmm).
+ | NEWBLOCK BlockId
+
+ -- unwinding information
+ -- See Note [Unwinding information in the NCG].
+ | UNWIND CLabel UnwindTable
+
+        -- specify current stack offset for the benefit of subsequent passes
+ | DELTA Int
+
+ -- Moves.
+ | MOV Format Operand Operand
+ | CMOV Cond Format Operand Reg
+ | MOVZxL Format Operand Operand -- format is the size of operand 1
+ | MOVSxL Format Operand Operand -- format is the size of operand 1
+ -- x86_64 note: plain mov into a 32-bit register always zero-extends
+ -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
+ -- don't affect the high bits of the register.
+
+ -- Load effective address (also a very useful three-operand add instruction :-)
+ | LEA Format Operand Operand
+
+ -- Int Arithmetic.
+ | ADD Format Operand Operand
+ | ADC Format Operand Operand
+ | SUB Format Operand Operand
+ | SBB Format Operand Operand
+
+ | MUL Format Operand Operand
+        | MUL2   Format Operand         -- %edx:%eax = operand * %eax
+ | IMUL Format Operand Operand -- signed int mul
+ | IMUL2 Format Operand -- %edx:%eax = operand * %eax
+
+ | DIV Format Operand -- eax := eax:edx/op, edx := eax:edx%op
+ | IDIV Format Operand -- ditto, but signed
+
+ -- Int Arithmetic, where the effects on the condition register
+ -- are important. Used in specialized sequences such as MO_Add2.
+ -- Do not rewrite these instructions to "equivalent" ones that
+ -- have different effect on the condition register! (See #9013.)
+ | ADD_CC Format Operand Operand
+ | SUB_CC Format Operand Operand
+
+ -- Simple bit-twiddling.
+ | AND Format Operand Operand
+ | OR Format Operand Operand
+ | XOR Format Operand Operand
+ | NOT Format Operand
+ | NEGI Format Operand -- NEG instruction (name clash with Cond)
+ | BSWAP Format Reg
+
+ -- Shifts (amount may be immediate or %cl only)
+ | SHL Format Operand{-amount-} Operand
+ | SAR Format Operand{-amount-} Operand
+ | SHR Format Operand{-amount-} Operand
+
+ | BT Format Imm Operand
+ | NOP
+
+
+ -- We need to support the FSTP (x87 store and pop) instruction
+ -- so that we can correctly read off the return value of an
+    -- x86 CDECL C function call when it's floating point.
+    -- We don't include a register argument, and just use st(0);
+    -- this instruction is used ONLY for return values of C FFI calls
+    -- in the x86_32 ABI.
+ | X87Store Format AddrMode -- st(0), dst
+
+
+ -- SSE2 floating point: we use a restricted set of the available SSE2
+ -- instructions for floating-point.
+ -- use MOV for moving (either movss or movsd (movlpd better?))
+ | CVTSS2SD Reg Reg -- F32 to F64
+ | CVTSD2SS Reg Reg -- F64 to F32
+ | CVTTSS2SIQ Format Operand Reg -- F32 to I32/I64 (with truncation)
+ | CVTTSD2SIQ Format Operand Reg -- F64 to I32/I64 (with truncation)
+ | CVTSI2SS Format Operand Reg -- I32/I64 to F32
+ | CVTSI2SD Format Operand Reg -- I32/I64 to F64
+
+ -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands
+ -- are Operand Reg.
+
+ -- SSE2 floating-point division:
+ | FDIV Format Operand Operand -- divisor, dividend(dst)
+
+ -- use CMP for comparisons. ucomiss and ucomisd instructions
+ -- compare single/double prec floating point respectively.
+
+ | SQRT Format Operand Reg -- src, dst
+
+
+ -- Comparison
+ | TEST Format Operand Operand
+ | CMP Format Operand Operand
+ | SETCC Cond Operand
+
+ -- Stack Operations.
+ | PUSH Format Operand
+ | POP Format Operand
+ -- both unused (SDM):
+ -- | PUSHA
+ -- | POPA
+
+ -- Jumping around.
+ | JMP Operand [Reg] -- including live Regs at the call
+ | JXX Cond BlockId -- includes unconditional branches
+ | JXX_GBL Cond Imm -- non-local version of JXX
+ -- Table jump
+ | JMP_TBL Operand -- Address to jump to
+ [Maybe JumpDest] -- Targets of the jump table
+ Section -- Data section jump table should be put in
+ CLabel -- Label of jump table
+ -- | X86 call instruction
+ | CALL (Either Imm Reg) -- ^ Jump target
+ [Reg] -- ^ Arguments (required for register allocation)
+
+ -- Other things.
+ | CLTD Format -- sign extend %eax into %edx:%eax
+
+ | FETCHGOT Reg -- pseudo-insn for ELF position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+ -- addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
+ | FETCHPC Reg -- pseudo-insn for Darwin position-independent code
+ -- pretty-prints as
+ -- call 1f
+ -- 1: popl %reg
+
+ -- bit counting instructions
+ | POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1
+        | LZCNT  Format Operand Reg -- [ABM] count number of leading zeros
+        | TZCNT  Format Operand Reg -- [BMI1] count number of trailing zeros
+ | BSF Format Operand Reg -- bit scan forward
+ | BSR Format Operand Reg -- bit scan reverse
+
+ -- bit manipulation instructions
+ | PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
+ | PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
+
+ -- prefetch
+ | PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
+ -- variant can be NTA, Lvl0, Lvl1, or Lvl2
+
+ | LOCK Instr -- lock prefix
+ | XADD Format Operand Operand -- src (r), dst (r/m)
+ | CMPXCHG Format Operand Operand -- src (r), dst (r/m), eax implicit
+ | MFENCE
+
+data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
+
+
+data Operand
+ = OpReg Reg -- register
+ | OpImm Imm -- immediate value
+ | OpAddr AddrMode -- memory reference
+
+
+
+-- | Returns which registers are read and written as a (read, written)
+-- pair.
+x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
+x86_regUsageOfInstr platform instr
+ = case instr of
+ MOV _ src dst -> usageRW src dst
+ CMOV _ _ src dst -> mkRU (use_R src [dst]) [dst]
+ MOVZxL _ src dst -> usageRW src dst
+ MOVSxL _ src dst -> usageRW src dst
+ LEA _ src dst -> usageRW src dst
+ ADD _ src dst -> usageRM src dst
+ ADC _ src dst -> usageRM src dst
+ SUB _ src dst -> usageRM src dst
+ SBB _ src dst -> usageRM src dst
+ IMUL _ src dst -> usageRM src dst
+
+    -- Result of IMULB will be just in %ax
+ IMUL2 II8 src -> mkRU (eax:use_R src []) [eax]
+    -- Result of IMUL for wider values will be split between %dx/%edx/%rdx and
+ -- %ax/%eax/%rax.
+ IMUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+
+ MUL _ src dst -> usageRM src dst
+ MUL2 _ src -> mkRU (eax:use_R src []) [eax,edx]
+ DIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ IDIV _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
+ ADD_CC _ src dst -> usageRM src dst
+ SUB_CC _ src dst -> usageRM src dst
+ AND _ src dst -> usageRM src dst
+ OR _ src dst -> usageRM src dst
+
+ XOR _ (OpReg src) (OpReg dst)
+ | src == dst -> mkRU [] [dst]
+
+ XOR _ src dst -> usageRM src dst
+ NOT _ op -> usageM op
+ BSWAP _ reg -> mkRU [reg] [reg]
+ NEGI _ op -> usageM op
+ SHL _ imm dst -> usageRM imm dst
+ SAR _ imm dst -> usageRM imm dst
+ SHR _ imm dst -> usageRM imm dst
+ BT _ _ src -> mkRUR (use_R src [])
+
+ PUSH _ op -> mkRUR (use_R op [])
+ POP _ op -> mkRU [] (def_W op)
+ TEST _ src dst -> mkRUR (use_R src $! use_R dst [])
+ CMP _ src dst -> mkRUR (use_R src $! use_R dst [])
+ SETCC _ op -> mkRU [] (def_W op)
+ JXX _ _ -> mkRU [] []
+ JXX_GBL _ _ -> mkRU [] []
+ JMP op regs -> mkRUR (use_R op regs)
+ JMP_TBL op _ _ _ -> mkRUR (use_R op [])
+ CALL (Left _) params -> mkRU params (callClobberedRegs platform)
+ CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
+ CLTD _ -> mkRU [eax] [edx]
+ NOP -> mkRU [] []
+
+ X87Store _ dst -> mkRUR ( use_EA dst [])
+
+ CVTSS2SD src dst -> mkRU [src] [dst]
+ CVTSD2SS src dst -> mkRU [src] [dst]
+ CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SS _ src dst -> mkRU (use_R src []) [dst]
+ CVTSI2SD _ src dst -> mkRU (use_R src []) [dst]
+ FDIV _ src dst -> usageRM src dst
+ SQRT _ src dst -> mkRU (use_R src []) [dst]
+
+ FETCHGOT reg -> mkRU [] [reg]
+ FETCHPC reg -> mkRU [] [reg]
+
+ COMMENT _ -> noUsage
+ LOCATION{} -> noUsage
+ UNWIND{} -> noUsage
+ DELTA _ -> noUsage
+
+ POPCNT _ src dst -> mkRU (use_R src []) [dst]
+ LZCNT _ src dst -> mkRU (use_R src []) [dst]
+ TZCNT _ src dst -> mkRU (use_R src []) [dst]
+ BSF _ src dst -> mkRU (use_R src []) [dst]
+ BSR _ src dst -> mkRU (use_R src []) [dst]
+
+ PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+ PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
+
+    -- note: there might be a better way to do this
+ PREFETCH _ _ src -> mkRU (use_R src []) []
+ LOCK i -> x86_regUsageOfInstr platform i
+ XADD _ src dst -> usageMM src dst
+ CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
+ MFENCE -> noUsage
+
+ _other -> panic "regUsage: unrecognised instr"
+ where
+ -- # Definitions
+ --
+ -- Written: If the operand is a register, it's written. If it's an
+ -- address, registers mentioned in the address are read.
+ --
+ -- Modified: If the operand is a register, it's both read and
+ -- written. If it's an address, registers mentioned in the address
+ -- are read.
+
+ -- 2 operand form; first operand Read; second Written
+ usageRW :: Operand -> Operand -> RegUsage
+ usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
+ usageRW op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRW _ _ = panic "X86.RegInfo.usageRW: no match"
+
+ -- 2 operand form; first operand Read; second Modified
+ usageRM :: Operand -> Operand -> RegUsage
+ usageRM op (OpReg reg) = mkRU (use_R op [reg]) [reg]
+ usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
+ usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
+
+ -- 2 operand form; first operand Modified; second Modified
+ usageMM :: Operand -> Operand -> RegUsage
+ usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
+ usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
+ usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
+
+ -- 3 operand form; first operand Read; second Modified; third Modified
+ usageRMM :: Operand -> Operand -> Operand -> RegUsage
+ usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
+ usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
+ usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
+
+ -- 1 operand form; operand Modified
+ usageM :: Operand -> RegUsage
+ usageM (OpReg reg) = mkRU [reg] [reg]
+ usageM (OpAddr ea) = mkRUR (use_EA ea [])
+ usageM _ = panic "X86.RegInfo.usageM: no match"
+
+ -- Registers defd when an operand is written.
+ def_W (OpReg reg) = [reg]
+ def_W (OpAddr _ ) = []
+ def_W _ = panic "X86.RegInfo.def_W: no match"
+
+ -- Registers used when an operand is read.
+ use_R (OpReg reg) tl = reg : tl
+ use_R (OpImm _) tl = tl
+ use_R (OpAddr ea) tl = use_EA ea tl
+
+ -- Registers used to compute an effective address.
+ use_EA (ImmAddr _ _) tl = tl
+ use_EA (AddrBaseIndex base index _) tl =
+ use_base base $! use_index index tl
+ where use_base (EABaseReg r) tl = r : tl
+ use_base _ tl = tl
+ use_index EAIndexNone tl = tl
+ use_index (EAIndex i _) tl = i : tl
+
+ mkRUR src = src' `seq` RU src' []
+ where src' = filter (interesting platform) src
+
+ mkRU src dst = src' `seq` dst' `seq` RU src' dst'
+ where src' = filter (interesting platform) src
+ dst' = filter (interesting platform) dst
+
+-- | Is this register interesting for the register allocator?
+interesting :: Platform -> Reg -> Bool
+interesting _ (RegVirtual _) = True
+interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
+interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no reg pairs on this arch"
+
+
+
+-- | Applies the supplied function to all registers in instructions.
+-- Typically used to change virtual registers to real registers.
+x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
+x86_patchRegsOfInstr instr env
+ = case instr of
+ MOV fmt src dst -> patch2 (MOV fmt) src dst
+ CMOV cc fmt src dst -> CMOV cc fmt (patchOp src) (env dst)
+ MOVZxL fmt src dst -> patch2 (MOVZxL fmt) src dst
+ MOVSxL fmt src dst -> patch2 (MOVSxL fmt) src dst
+ LEA fmt src dst -> patch2 (LEA fmt) src dst
+ ADD fmt src dst -> patch2 (ADD fmt) src dst
+ ADC fmt src dst -> patch2 (ADC fmt) src dst
+ SUB fmt src dst -> patch2 (SUB fmt) src dst
+ SBB fmt src dst -> patch2 (SBB fmt) src dst
+ IMUL fmt src dst -> patch2 (IMUL fmt) src dst
+ IMUL2 fmt src -> patch1 (IMUL2 fmt) src
+ MUL fmt src dst -> patch2 (MUL fmt) src dst
+ MUL2 fmt src -> patch1 (MUL2 fmt) src
+ IDIV fmt op -> patch1 (IDIV fmt) op
+ DIV fmt op -> patch1 (DIV fmt) op
+ ADD_CC fmt src dst -> patch2 (ADD_CC fmt) src dst
+ SUB_CC fmt src dst -> patch2 (SUB_CC fmt) src dst
+ AND fmt src dst -> patch2 (AND fmt) src dst
+ OR fmt src dst -> patch2 (OR fmt) src dst
+ XOR fmt src dst -> patch2 (XOR fmt) src dst
+ NOT fmt op -> patch1 (NOT fmt) op
+ BSWAP fmt reg -> BSWAP fmt (env reg)
+ NEGI fmt op -> patch1 (NEGI fmt) op
+ SHL fmt imm dst -> patch1 (SHL fmt imm) dst
+ SAR fmt imm dst -> patch1 (SAR fmt imm) dst
+ SHR fmt imm dst -> patch1 (SHR fmt imm) dst
+ BT fmt imm src -> patch1 (BT fmt imm) src
+ TEST fmt src dst -> patch2 (TEST fmt) src dst
+ CMP fmt src dst -> patch2 (CMP fmt) src dst
+ PUSH fmt op -> patch1 (PUSH fmt) op
+ POP fmt op -> patch1 (POP fmt) op
+ SETCC cond op -> patch1 (SETCC cond) op
+ JMP op regs -> JMP (patchOp op) regs
+ JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl
+
+ -- literally only support storing the top x87 stack value st(0)
+ X87Store fmt dst -> X87Store fmt (lookupAddr dst)
+
+ CVTSS2SD src dst -> CVTSS2SD (env src) (env dst)
+ CVTSD2SS src dst -> CVTSD2SS (env src) (env dst)
+ CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
+ CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
+ CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
+ CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
+ FDIV fmt src dst -> FDIV fmt (patchOp src) (patchOp dst)
+ SQRT fmt src dst -> SQRT fmt (patchOp src) (env dst)
+
+ CALL (Left _) _ -> instr
+ CALL (Right reg) p -> CALL (Right (env reg)) p
+
+ FETCHGOT reg -> FETCHGOT (env reg)
+ FETCHPC reg -> FETCHPC (env reg)
+
+ NOP -> instr
+ COMMENT _ -> instr
+ LOCATION {} -> instr
+ UNWIND {} -> instr
+ DELTA _ -> instr
+
+ JXX _ _ -> instr
+ JXX_GBL _ _ -> instr
+ CLTD _ -> instr
+
+ POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
+ LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst)
+ TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst)
+ PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
+ PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
+ BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
+ BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
+
+ PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)
+
+ LOCK i -> LOCK (x86_patchRegsOfInstr i env)
+ XADD fmt src dst -> patch2 (XADD fmt) src dst
+ CMPXCHG fmt src dst -> patch2 (CMPXCHG fmt) src dst
+ MFENCE -> instr
+
+ _other -> panic "patchRegs: unrecognised instr"
+
+ where
+ patch1 :: (Operand -> a) -> Operand -> a
+ patch1 insn op = insn $! patchOp op
+ patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
+ patch2 insn src dst = (insn $! patchOp src) $! patchOp dst
+
+ patchOp (OpReg reg) = OpReg $! env reg
+ patchOp (OpImm imm) = OpImm imm
+ patchOp (OpAddr ea) = OpAddr $! lookupAddr ea
+
+ lookupAddr (ImmAddr imm off) = ImmAddr imm off
+ lookupAddr (AddrBaseIndex base index disp)
+ = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
+ where
+ lookupBase EABaseNone = EABaseNone
+ lookupBase EABaseRip = EABaseRip
+ lookupBase (EABaseReg r) = EABaseReg $! env r
+
+ lookupIndex EAIndexNone = EAIndexNone
+ lookupIndex (EAIndex r i) = (EAIndex $! env r) i
+
+
+--------------------------------------------------------------------------------
+x86_isJumpishInstr
+ :: Instr -> Bool
+
+x86_isJumpishInstr instr
+ = case instr of
+ JMP{} -> True
+ JXX{} -> True
+ JXX_GBL{} -> True
+ JMP_TBL{} -> True
+ CALL{} -> True
+ _ -> False
+
+
+x86_jumpDestsOfInstr
+ :: Instr
+ -> [BlockId]
+
+x86_jumpDestsOfInstr insn
+ = case insn of
+ JXX _ id -> [id]
+ JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
+ _ -> []
+
+
+x86_patchJumpInstr
+ :: Instr -> (BlockId -> BlockId) -> Instr
+
+x86_patchJumpInstr insn patchF
+ = case insn of
+ JXX cc id -> JXX cc (patchF id)
+ JMP_TBL op ids section lbl
+ -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
+ _ -> insn
+ where
+ patchJumpDest f (DestBlockId id) = DestBlockId (f id)
+ patchJumpDest _ dest = dest
+
+
+
+
+
+-- -----------------------------------------------------------------------------
+-- | Make a spill instruction.
+x86_mkSpillInstr
+ :: DynFlags
+ -> Reg -- register to spill
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkSpillInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpReg reg) (OpAddr (spRel dflags off))
+ RcDouble -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
+ _ -> panic "X86.mkSpillInstr: no match"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+-- | Make a spill reload instruction.
+x86_mkLoadInstr
+ :: DynFlags
+ -> Reg -- register to load
+ -> Int -- current stack delta
+ -> Int -- spill slot to use
+ -> Instr
+
+x86_mkLoadInstr dflags reg delta slot
+ = let off = spillSlotToOffset platform slot - delta
+ in
+ case targetClassOfReg platform reg of
+ RcInteger -> MOV (archWordFormat is32Bit)
+ (OpAddr (spRel dflags off)) (OpReg reg)
+ RcDouble -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
+ _ -> panic "X86.x86_mkLoadInstr"
+ where platform = targetPlatform dflags
+ is32Bit = target32Bit platform
+
+spillSlotSize :: Platform -> Int
+spillSlotSize platform = if is32Bit then 12 else 8
+    where is32Bit = target32Bit platform
+
+maxSpillSlots :: DynFlags -> Int
+maxSpillSlots dflags
+ = ((rESERVED_C_STACK_BYTES dflags - 64) `div` spillSlotSize (targetPlatform dflags)) - 1
+-- = 0 -- useful for testing allocMoreStack
+
+-- number of bytes that the stack pointer should be aligned to
+stackAlign :: Int
+stackAlign = 16
+
+-- convert a spill slot number to a *byte* offset, with no sign:
+-- decide on a per arch basis whether you are spilling above or below
+-- the C stack pointer.
+spillSlotToOffset :: Platform -> Int -> Int
+spillSlotToOffset platform slot
+ = 64 + spillSlotSize platform * slot
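+
+-- For example, on x86_64 (8-byte slots) slot 3 is at byte offset
+-- 64 + 8 * 3 = 88; on 32-bit (12-byte slots) it is at 64 + 12 * 3 = 100.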
+
+--------------------------------------------------------------------------------
+
+-- | See if this instruction is telling us the current C stack delta
+x86_takeDeltaInstr
+ :: Instr
+ -> Maybe Int
+
+x86_takeDeltaInstr instr
+ = case instr of
+ DELTA i -> Just i
+ _ -> Nothing
+
+
+x86_isMetaInstr
+ :: Instr
+ -> Bool
+
+x86_isMetaInstr instr
+ = case instr of
+ COMMENT{} -> True
+ LOCATION{} -> True
+ LDATA{} -> True
+ NEWBLOCK{} -> True
+ UNWIND{} -> True
+ DELTA{} -> True
+ _ -> False
+
+
+
+--- TODO: why is there a SPARC note in the x86 backend?
+-- | Make a reg-reg move instruction.
+-- On SPARC v8 there are no instructions to move directly between
+-- floating point and integer regs. If we need to do that then we
+-- have to go via memory.
+--
+x86_mkRegRegMoveInstr
+ :: Platform
+ -> Reg
+ -> Reg
+ -> Instr
+
+x86_mkRegRegMoveInstr platform src dst
+ = case targetClassOfReg platform src of
+ RcInteger -> case platformArch platform of
+ ArchX86 -> MOV II32 (OpReg src) (OpReg dst)
+ ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
+ _ -> panic "x86_mkRegRegMoveInstr: Bad arch"
+ RcDouble -> MOV FF64 (OpReg src) (OpReg dst)
+        -- This code is the lie we tell ourselves because both float and double
+        -- use the same register class on x86_64 and on x86 32-bit with SSE2;
+        -- more plainly, both use the XMM registers.
+ _ -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
+
+-- | Check whether an instruction represents a reg-reg move.
+-- The register allocator attempts to eliminate reg->reg moves whenever it can,
+-- by assigning the src and dest temporaries to the same real register.
+--
+x86_takeRegRegMoveInstr
+ :: Instr
+ -> Maybe (Reg,Reg)
+
+x86_takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
+ = Just (r1,r2)
+
+x86_takeRegRegMoveInstr _ = Nothing
+
+
+-- | Make an unconditional branch instruction.
+x86_mkJumpInstr
+ :: BlockId
+ -> [Instr]
+
+x86_mkJumpInstr id
+ = [JXX ALWAYS id]
+
+-- Note [Windows stack layout]
+-- | On most OSes the kernel will place a guard page after the current stack
+-- page. If you allocate larger than a page worth you may jump over this
+-- guard page. Not only is this a security issue, but on certain OSes such
+-- as Windows a new page won't be allocated if you don't hit the guard. This
+-- will cause a segfault or access fault.
+--
+-- This function determines whether the current allocation amount requires a probe.
+-- On Windows (for now) we emit a call to _chkstk for this. For other OSes
+-- this is not yet implemented.
+-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
+-- The Windows stack looks like this:
+--
+-- +-------------------+
+-- | SP |
+-- +-------------------+
+-- | |
+-- | GUARD PAGE |
+-- | |
+-- +-------------------+
+-- | |
+-- | |
+-- | UNMAPPED |
+-- | |
+-- | |
+-- +-------------------+
+--
+-- In essence each allocation larger than a page size needs to be chunked and
+-- a probe emitted after each page allocation. You have to hit the guard
+-- page so the kernel can map in the next page, otherwise you'll segfault.
+--
+needs_probe_call :: Platform -> Int -> Bool
+needs_probe_call platform amount
+ = case platformOS platform of
+ OSMinGW32 -> case platformArch platform of
+ ArchX86 -> amount > (4 * 1024)
+ ArchX86_64 -> amount > (8 * 1024)
+ _ -> False
+ _ -> False
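+
+-- For example, allocating 16384 bytes on Windows/x86_64 exceeds the 8k
+-- threshold above, so a probe is required and 'x86_mkStackAllocInstr'
+-- below emits the @___chkstk_ms@ call instead of a bare @sub@.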
+
+x86_mkStackAllocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackAllocInstr platform amount
+ = case platformOS platform of
+ OSMinGW32 ->
+ -- These will clobber AX but this should be ok because
+ --
+ -- 1. It is the first thing we do when entering the closure and AX is
+          --     a caller-saved register on Windows, both on x86_64 and x86.
+ --
+ -- 2. The closures are only entered via a call or longjmp in which case
+ -- there are no expectations for volatile registers.
+ --
+ -- 3. When the target is a local branch point it is re-targeted
+ -- after the dealloc, preserving #2. See note [extra spill slots].
+ --
+ -- We emit a call because the stack probes are quite involved and
+ -- would bloat code size a lot. GHC doesn't really have an -Os.
+ -- __chkstk is guaranteed to leave all nonvolatile registers and AX
+ -- untouched. It's part of the standard prologue code for any Windows
+ -- function dropping the stack more than a page.
+ -- See Note [Windows stack layout]
+ case platformArch platform of
+ ArchX86 | needs_probe_call platform amount ->
+ [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [eax]
+ , SUB II32 (OpReg eax) (OpReg esp)
+ ]
+ | otherwise ->
+ [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
+ , TEST II32 (OpReg esp) (OpReg esp)
+ ]
+ ArchX86_64 | needs_probe_call platform amount ->
+ [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
+ , CALL (Left $ strImmLit "___chkstk_ms") [rax]
+ , SUB II64 (OpReg rax) (OpReg rsp)
+ ]
+ | otherwise ->
+ [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
+ , TEST II64 (OpReg rsp) (OpReg rsp)
+ ]
+ _ -> panic "x86_mkStackAllocInstr"
+ _ ->
+ case platformArch platform of
+ ArchX86 -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
+ ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
+ _ -> panic "x86_mkStackAllocInstr"
+
+x86_mkStackDeallocInstr
+ :: Platform
+ -> Int
+ -> [Instr]
+x86_mkStackDeallocInstr platform amount
+ = case platformArch platform of
+ ArchX86 -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
+ ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
+ _ -> panic "x86_mkStackDeallocInstr"
+
+
+--
+-- Note [extra spill slots]
+--
+-- If the register allocator used more spill slots than we have
+-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
+-- C stack space on entry and exit from this proc. Therefore we
+-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
+-- before every non-local jump.
+--
+-- This became necessary when the new codegen started bundling entire
+-- functions together into one proc, because the register allocator
+-- assigns a different stack slot to each virtual reg within a proc.
+-- To avoid using so many slots we could also:
+--
+--  - split up the proc into connected components before code generation
+--
+-- - rename the virtual regs, so that we re-use vreg names and hence
+-- stack slots for non-overlapping vregs.
+--
+-- Note that when a block is both a non-local entry point (with an
+-- info table) and a local branch target, we have to split it into
+-- two, like so:
+--
+-- <info table>
+-- L:
+-- <code>
+--
+-- becomes
+--
+-- <info table>
+-- L:
+--        sub $N, %rsp
+-- jmp Lnew
+-- Lnew:
+-- <code>
+--
+-- and all branches pointing to L are retargeted to point to Lnew.
+-- Otherwise, we would repeat the $rsp adjustment for each branch to
+-- L.
+--
+-- Returns a list of (L,Lnew) pairs.
+--
+allocMoreStack
+ :: Platform
+ -> Int
+ -> NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr
+ -> UniqSM (NatCmmDecl statics GHC.CmmToAsm.X86.Instr.Instr, [(BlockId,BlockId)])
+
+allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
+allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
+ let entries = entryBlocks proc
+
+ uniqs <- replicateM (length entries) getUniqueM
+
+ let
+ delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
+ where x = slots * spillSlotSize platform -- sp delta
+
+ alloc = mkStackAllocInstr platform delta
+ dealloc = mkStackDeallocInstr platform delta
+
+ retargetList = (zip entries (map mkBlockId uniqs))
+
+ new_blockmap :: LabelMap BlockId
+ new_blockmap = mapFromList retargetList
+
+ insert_stack_insns (BasicBlock id insns)
+ | Just new_blockid <- mapLookup id new_blockmap
+ = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
+ , BasicBlock new_blockid block' ]
+ | otherwise
+ = [ BasicBlock id block' ]
+ where
+ block' = foldr insert_dealloc [] insns
+
+ insert_dealloc insn r = case insn of
+ JMP _ _ -> dealloc ++ (insn : r)
+ JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
+ _other -> x86_patchJumpInstr insn retarget : r
+ where retarget b = fromMaybe b (mapLookup b new_blockmap)
+
+ new_code = concatMap insert_stack_insns code
+ -- in
+ return (CmmProc info lbl live (ListGraph new_code), retargetList)
+
+data JumpDest = DestBlockId BlockId | DestImm Imm
+
+-- Debug Instance
+instance Outputable JumpDest where
+ ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
+ ppr (DestImm _imm) = text "jd<imm>:noShow"
+
+
+getJumpDestBlockId :: JumpDest -> Maybe BlockId
+getJumpDestBlockId (DestBlockId bid) = Just bid
+getJumpDestBlockId _ = Nothing
+
+canShortcut :: Instr -> Maybe JumpDest
+canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
+canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
+canShortcut _ = Nothing
+
+
+-- This helper shortcuts a sequence of branches.
+-- The blockset helps avoid following cycles.
+shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
+shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
+ where
+ shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
+ shortcutJump' fn seen insn@(JXX cc id) =
+ if setMember id seen then insn
+ else case fn id of
+ Nothing -> insn
+ Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
+ Just (DestImm imm) -> shortcutJump' fn seen' (JXX_GBL cc imm)
+ where seen' = setInsert id seen
+ shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
+ let updateBlock (Just (DestBlockId bid)) =
+ case fn bid of
+ Nothing -> Just (DestBlockId bid )
+ Just dest -> Just dest
+ updateBlock dest = dest
+ blocks' = map updateBlock blocks
+ in JMP_TBL addr blocks' section tblId
+ shortcutJump' _ _ other = other
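+
+-- For example (blocks hypothetical): if block L1 is just @jmp L2@ and L2
+-- is just @jmp L3@, the mapping gives @fn L1 == Just (DestBlockId L2)@
+-- and @fn L2 == Just (DestBlockId L3)@, so @shortcutJump fn (JXX cc L1)@
+-- follows the chain and returns @JXX cc L3@. The @seen@ set stops us
+-- looping on mutually recursive jumps such as @L1: jmp L2; L2: jmp L1@.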
+
+-- Here because it knows about JumpDest
+shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, RawCmmStatics) -> (Alignment, RawCmmStatics)
+shortcutStatics fn (align, RawCmmStatics lbl statics)
+ = (align, RawCmmStatics lbl $ map (shortcutStatic fn) statics)
+ -- we need to get the jump tables, so apply the mapping to the entries
+ -- of a CmmData too.
+
+shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
+shortcutLabel fn lab
+ | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
+ | otherwise = lab
+
+shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
+shortcutStatic fn (CmmStaticLit (CmmLabel lab))
+ = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
+shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
+ = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
+ -- slightly dodgy, we're ignoring the second label, but this
+ -- works with the way we use CmmLabelDiffOff for jump tables now.
+shortcutStatic _ other_static
+ = other_static
+
+shortBlockId
+ :: (BlockId -> Maybe JumpDest)
+ -> UniqSet Unique
+ -> BlockId
+ -> CLabel
+
+shortBlockId fn seen blockid =
+ case (elementOfUniqSet uq seen, fn blockid) of
+ (True, _) -> blockLbl blockid
+ (_, Nothing) -> blockLbl blockid
+ (_, Just (DestBlockId blockid')) -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
+ (_, Just (DestImm (ImmCLbl lbl))) -> lbl
+ (_, _other) -> panic "shortBlockId"
+ where uq = getUnique blockid
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
new file mode 100644
index 0000000000..a5b9041974
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -0,0 +1,1014 @@
+{-# LANGUAGE CPP #-}
+
+-----------------------------------------------------------------------------
+--
+-- Pretty-printing assembly language
+--
+-- (c) The University of Glasgow 1993-2005
+--
+-----------------------------------------------------------------------------
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+module GHC.CmmToAsm.X86.Ppr (
+ pprNatCmmDecl,
+ pprData,
+ pprInstr,
+ pprFormat,
+ pprImm,
+ pprDataItem,
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.X86.Regs
+import GHC.CmmToAsm.X86.Instr
+import GHC.CmmToAsm.X86.Cond
+import GHC.CmmToAsm.Instr
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+import GHC.CmmToAsm.Ppr
+
+
+import GHC.Cmm.Dataflow.Collections
+import GHC.Cmm.Dataflow.Label
+import BasicTypes (Alignment, mkAlignment, alignmentBytes)
+import GHC.Driver.Session
+import GHC.Cmm hiding (topInfoTable)
+import GHC.Cmm.BlockId
+import GHC.Cmm.CLabel
+import Unique ( pprUniqueAlways )
+import GHC.Platform
+import FastString
+import Outputable
+
+import Data.Word
+import Data.Bits
+
+-- -----------------------------------------------------------------------------
+-- Printing this stuff out
+--
+--
+-- Note [Subsections Via Symbols]
+--
+-- If we are using the .subsections_via_symbols directive
+-- (available on recent versions of Darwin),
+-- we have to make sure that there is some kind of reference
+-- from the entry code to a label on the _top_ of the info table,
+-- so that the linker will not think it is unreferenced and dead-strip
+-- it. That's why the label is called a DeadStripPreventer (_dsp).
+--
+-- The LLVM code gen already creates `iTableSuf` symbols, where
+-- the X86 would generate the DeadStripPreventer (_dsp) symbol.
+-- Therefore all that is left for the LLVM code gen is to ensure
+-- that all the `iTableSuf` symbols are marked as used.
+-- As of this writing the documentation regarding the
+-- .subsections_via_symbols and -dead_strip can be found at
+-- <https://developer.apple.com/library/mac/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-TPXREF101>
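+--
+-- Schematically (label spelling illustrative), the output for a proc
+-- with info table label @Foo_info@ looks like:
+--
+--   _dsp_Foo_info:                      -- dead-strip preventer
+--       ...info table and entry code...
+--       .long Foo_info - _dsp_Foo_info  -- keeps Foo_info reachable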
+
+pprProcAlignment :: SDoc
+pprProcAlignment = sdocWithDynFlags $ \dflags ->
+ (maybe empty (pprAlign . mkAlignment) (cmmProcAlignment dflags))
+
+pprNatCmmDecl :: NatCmmDecl (Alignment, RawCmmStatics) Instr -> SDoc
+pprNatCmmDecl (CmmData section dats) =
+ pprSectionAlign section $$ pprDatas dats
+
+pprNatCmmDecl proc@(CmmProc top_info lbl _ (ListGraph blocks)) =
+ sdocWithDynFlags $ \dflags ->
+ pprProcAlignment $$
+ case topInfoTable proc of
+ Nothing ->
+ -- special case for code without info table:
+ pprSectionAlign (Section Text lbl) $$
+ pprProcAlignment $$
+ pprLabel lbl $$ -- blocks guaranteed not null, so label needed
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ (if debugLevel dflags > 0
+ then ppr (mkAsmTempEndLabel lbl) <> char ':' else empty) $$
+ pprSizeDecl lbl
+
+ Just (RawCmmStatics info_lbl _) ->
+ sdocWithPlatform $ \platform ->
+ pprSectionAlign (Section Text info_lbl) $$
+ pprProcAlignment $$
+ (if platformHasSubsectionsViaSymbols platform
+ then ppr (mkDeadStripPreventer info_lbl) <> char ':'
+ else empty) $$
+ vcat (map (pprBasicBlock top_info) blocks) $$
+ -- above: Even the first block gets a label, because with branch-chain
+ -- elimination, it might be the target of a goto.
+ (if platformHasSubsectionsViaSymbols platform
+ then -- See Note [Subsections Via Symbols]
+ text "\t.long "
+ <+> ppr info_lbl
+ <+> char '-'
+ <+> ppr (mkDeadStripPreventer info_lbl)
+ else empty) $$
+ pprSizeDecl info_lbl
+
+-- | Output the ELF .size directive.
+pprSizeDecl :: CLabel -> SDoc
+pprSizeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform)
+ then text "\t.size" <+> ppr lbl <> ptext (sLit ", .-") <> ppr lbl
+ else empty
+
+pprBasicBlock :: LabelMap RawCmmStatics -> NatBasicBlock Instr -> SDoc
+pprBasicBlock info_env (BasicBlock blockid instrs)
+ = maybe_infotable $
+ pprLabel asmLbl $$
+ vcat (map pprInstr instrs) $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel asmLbl) <> char ':'
+ else empty
+ )
+ where
+ asmLbl = blockLbl blockid
+ maybe_infotable c = case mapLookup blockid info_env of
+ Nothing -> c
+ Just (RawCmmStatics infoLbl info) ->
+ pprAlignForSection Text $$
+ infoTableLoc $$
+ vcat (map pprData info) $$
+ pprLabel infoLbl $$
+ c $$
+ (sdocOption sdocDebugLevel $ \level ->
+ if level > 0
+ then ppr (mkAsmTempEndLabel infoLbl) <> char ':'
+ else empty
+ )
+ -- Make sure the info table has the right .loc for the block
+    -- coming right after it. See Note [Info Offset].
+ infoTableLoc = case instrs of
+ (l@LOCATION{} : _) -> pprInstr l
+ _other -> empty
+
+
+pprDatas :: (Alignment, RawCmmStatics) -> SDoc
+-- See note [emit-time elimination of static indirections] in CLabel.
+pprDatas (_, RawCmmStatics alias [CmmStaticLit (CmmLabel lbl), CmmStaticLit ind, _, _])
+ | lbl == mkIndStaticInfoLabel
+ , let labelInd (CmmLabelOff l _) = Just l
+ labelInd (CmmLabel l) = Just l
+ labelInd _ = Nothing
+ , Just ind' <- labelInd ind
+ , alias `mayRedirectTo` ind'
+ = pprGloblDecl alias
+ $$ text ".equiv" <+> ppr alias <> comma <> ppr (CmmLabel ind')
+
+pprDatas (align, (RawCmmStatics lbl dats))
+ = vcat (pprAlign align : pprLabel lbl : map pprData dats)
+
+pprData :: CmmStatic -> SDoc
+pprData (CmmString str) = pprBytes str
+
+pprData (CmmUninitialised bytes)
+ = sdocWithPlatform $ \platform ->
+ if platformOS platform == OSDarwin then text ".space " <> int bytes
+ else text ".skip " <> int bytes
+
+pprData (CmmStaticLit lit) = pprDataItem lit
+
+pprGloblDecl :: CLabel -> SDoc
+pprGloblDecl lbl
+ | not (externallyVisibleCLabel lbl) = empty
+ | otherwise = text ".globl " <> ppr lbl
+
+pprLabelType' :: DynFlags -> CLabel -> SDoc
+pprLabelType' dflags lbl =
+ if isCFunctionLabel lbl || functionOkInfoTable then
+ text "@function"
+ else
+ text "@object"
+ where
+ {-
+ NOTE: This is a bit hacky.
+
+ With `tablesNextToCode`, info tables look like this:
+ ```
+ <info table data>
+ label_info:
+ <info table code>
+ ```
+ So the info table label actually points exactly to the code, and we can
+ mark the label as @function. (This is required to make perf and
+ potentially other tools work on Haskell binaries.)
+ This usually works well, but it can cause issues with the linker:
+ a linker uses different relocation algorithms depending on the symbol
+ type. For some reason, the linker will generate a JUMP_SLOT relocation
+ when a constructor info table is referenced from a data section.
+ This only happens with static constructor calls, so we mark _con_info
+ symbols as `@object` to avoid the issue with relocations.
+
+ @SimonMarlow hack explanation:
+ "The reasoning goes like this:
+
+ * The danger when we mark a symbol as `@function` is that the linker will
+ redirect it to point to the PLT and use a `JUMP_SLOT` relocation when
+ the symbol refers to something outside the current shared object.
+ A PLT / JUMP_SLOT reference only works for symbols that we jump to, not
+ for symbols representing data, nor for info table symbol references which
+ we expect to point directly to the info table.
+ * GHC generates code that might refer to any info table symbol from the text
+ segment, but that's OK, because those will be explicit GOT references
+ generated by the code generator.
+ * When we refer to info tables from the data segment, it's either
+ * a FUN_STATIC/THUNK_STATIC local to this module
+ * a `con_info` that could be from anywhere
+
+ So, the only info table symbols that we might refer to from the data segment
+ of another shared object are `con_info` symbols, so those are the ones we
+ need to exclude from getting the @function treatment.
+ "
+
+ A good place to check for more details:
+ https://gitlab.haskell.org/ghc/ghc/wikis/commentary/position-independent-code
+
+ Another possible hack is to create an extra local function symbol for
+ every code-like thing, giving the tools the information they need
+ without interfering with the relocations.
+ https://phabricator.haskell.org/D4730
+ -}
+ functionOkInfoTable = tablesNextToCode dflags &&
+ isInfoTableLabel lbl && not (isConInfoTableLabel lbl)
+
+
+pprTypeDecl :: CLabel -> SDoc
+pprTypeDecl lbl
+ = sdocWithPlatform $ \platform ->
+ if osElfTarget (platformOS platform) && externallyVisibleCLabel lbl
+ then
+ sdocWithDynFlags $ \df ->
+ text ".type " <> ppr lbl <> ptext (sLit ", ") <> pprLabelType' df lbl
+ else empty
+
+pprLabel :: CLabel -> SDoc
+pprLabel lbl = pprGloblDecl lbl
+ $$ pprTypeDecl lbl
+ $$ (ppr lbl <> char ':')
+
+pprAlign :: Alignment -> SDoc
+pprAlign alignment
+ = sdocWithPlatform $ \platform ->
+ text ".align " <> int (alignmentOn platform)
+ where
+ bytes = alignmentBytes alignment
+ alignmentOn platform = if platformOS platform == OSDarwin
+ then log2 bytes
+ else bytes
+
+ log2 :: Int -> Int -- cache the common ones
+ log2 1 = 0
+ log2 2 = 1
+ log2 4 = 2
+ log2 8 = 3
+ log2 n = 1 + log2 (n `quot` 2)
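+ -- For example (illustrative): an 8-byte alignment prints as ".align 3"
+ -- on Darwin (the argument is a power of two) and ".align 8" elsewhere
+ -- (the argument is a byte count for x86 gas).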
+
+-- -----------------------------------------------------------------------------
+-- pprInstr: print an 'Instr'
+
+instance Outputable Instr where
+ ppr instr = pprInstr instr
+
+
+pprReg :: Format -> Reg -> SDoc
+pprReg f r
+ = case r of
+ RegReal (RealRegSingle i) ->
+ sdocWithPlatform $ \platform ->
+ if target32Bit platform then ppr32_reg_no f i
+ else ppr64_reg_no f i
+ RegReal (RealRegPair _ _) -> panic "X86.Ppr: no reg pairs on this arch"
+ RegVirtual (VirtualRegI u) -> text "%vI_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
+ RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
+
+ where
+ ppr32_reg_no :: Format -> Int -> SDoc
+ ppr32_reg_no II8 = ppr32_reg_byte
+ ppr32_reg_no II16 = ppr32_reg_word
+ ppr32_reg_no _ = ppr32_reg_long
+
+ ppr32_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ _ -> sLit $ "very naughty I386 byte register: " ++ show i
+ })
+
+ ppr32_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ _ -> sLit "very naughty I386 word register"
+ })
+
+ ppr32_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ _ -> ppr_reg_float i
+ })
+
+ ppr64_reg_no :: Format -> Int -> SDoc
+ ppr64_reg_no II8 = ppr64_reg_byte
+ ppr64_reg_no II16 = ppr64_reg_word
+ ppr64_reg_no II32 = ppr64_reg_long
+ ppr64_reg_no _ = ppr64_reg_quad
+
+ ppr64_reg_byte i = ptext
+ (case i of {
+ 0 -> sLit "%al"; 1 -> sLit "%bl";
+ 2 -> sLit "%cl"; 3 -> sLit "%dl";
+ 4 -> sLit "%sil"; 5 -> sLit "%dil"; -- new 8-bit regs!
+ 6 -> sLit "%bpl"; 7 -> sLit "%spl";
+ 8 -> sLit "%r8b"; 9 -> sLit "%r9b";
+ 10 -> sLit "%r10b"; 11 -> sLit "%r11b";
+ 12 -> sLit "%r12b"; 13 -> sLit "%r13b";
+ 14 -> sLit "%r14b"; 15 -> sLit "%r15b";
+ _ -> sLit $ "very naughty x86_64 byte register: " ++ show i
+ })
+
+ ppr64_reg_word i = ptext
+ (case i of {
+ 0 -> sLit "%ax"; 1 -> sLit "%bx";
+ 2 -> sLit "%cx"; 3 -> sLit "%dx";
+ 4 -> sLit "%si"; 5 -> sLit "%di";
+ 6 -> sLit "%bp"; 7 -> sLit "%sp";
+ 8 -> sLit "%r8w"; 9 -> sLit "%r9w";
+ 10 -> sLit "%r10w"; 11 -> sLit "%r11w";
+ 12 -> sLit "%r12w"; 13 -> sLit "%r13w";
+ 14 -> sLit "%r14w"; 15 -> sLit "%r15w";
+ _ -> sLit "very naughty x86_64 word register"
+ })
+
+ ppr64_reg_long i = ptext
+ (case i of {
+ 0 -> sLit "%eax"; 1 -> sLit "%ebx";
+ 2 -> sLit "%ecx"; 3 -> sLit "%edx";
+ 4 -> sLit "%esi"; 5 -> sLit "%edi";
+ 6 -> sLit "%ebp"; 7 -> sLit "%esp";
+ 8 -> sLit "%r8d"; 9 -> sLit "%r9d";
+ 10 -> sLit "%r10d"; 11 -> sLit "%r11d";
+ 12 -> sLit "%r12d"; 13 -> sLit "%r13d";
+ 14 -> sLit "%r14d"; 15 -> sLit "%r15d";
+ _ -> sLit "very naughty x86_64 register"
+ })
+
+ ppr64_reg_quad i = ptext
+ (case i of {
+ 0 -> sLit "%rax"; 1 -> sLit "%rbx";
+ 2 -> sLit "%rcx"; 3 -> sLit "%rdx";
+ 4 -> sLit "%rsi"; 5 -> sLit "%rdi";
+ 6 -> sLit "%rbp"; 7 -> sLit "%rsp";
+ 8 -> sLit "%r8"; 9 -> sLit "%r9";
+ 10 -> sLit "%r10"; 11 -> sLit "%r11";
+ 12 -> sLit "%r12"; 13 -> sLit "%r13";
+ 14 -> sLit "%r14"; 15 -> sLit "%r15";
+ _ -> ppr_reg_float i
+ })
+
+ppr_reg_float :: Int -> PtrString
+ppr_reg_float i = case i of
+ 16 -> sLit "%xmm0" ; 17 -> sLit "%xmm1"
+ 18 -> sLit "%xmm2" ; 19 -> sLit "%xmm3"
+ 20 -> sLit "%xmm4" ; 21 -> sLit "%xmm5"
+ 22 -> sLit "%xmm6" ; 23 -> sLit "%xmm7"
+ 24 -> sLit "%xmm8" ; 25 -> sLit "%xmm9"
+ 26 -> sLit "%xmm10"; 27 -> sLit "%xmm11"
+ 28 -> sLit "%xmm12"; 29 -> sLit "%xmm13"
+ 30 -> sLit "%xmm14"; 31 -> sLit "%xmm15"
+ _ -> sLit "very naughty x86 register"
+
+pprFormat :: Format -> SDoc
+pprFormat x
+ = ptext (case x of
+ II8 -> sLit "b"
+ II16 -> sLit "w"
+ II32 -> sLit "l"
+ II64 -> sLit "q"
+ FF32 -> sLit "ss" -- "scalar single-precision float" (SSE2)
+ FF64 -> sLit "sd" -- "scalar double-precision float" (SSE2)
+ )
+
+pprFormat_x87 :: Format -> SDoc
+pprFormat_x87 x
+ = ptext $ case x of
+ FF32 -> sLit "s"
+ FF64 -> sLit "l"
+ _ -> panic "X86.Ppr.pprFormat_x87"
+
+
+pprCond :: Cond -> SDoc
+pprCond c
+ = ptext (case c of {
+ GEU -> sLit "ae"; LU -> sLit "b";
+ EQQ -> sLit "e"; GTT -> sLit "g";
+ GE -> sLit "ge"; GU -> sLit "a";
+ LTT -> sLit "l"; LE -> sLit "le";
+ LEU -> sLit "be"; NE -> sLit "ne";
+ NEG -> sLit "s"; POS -> sLit "ns";
+ CARRY -> sLit "c"; OFLO -> sLit "o";
+ PARITY -> sLit "p"; NOTPARITY -> sLit "np";
+ ALWAYS -> sLit "mp"})
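+
+-- Note that these strings are only suffixes: pprCondInstr glues them onto
+-- a mnemonic prefix, so e.g. ALWAYS -> "mp" makes "j" ++ "mp" print as the
+-- unconditional "jmp".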
+
+
+pprImm :: Imm -> SDoc
+pprImm (ImmInt i) = int i
+pprImm (ImmInteger i) = integer i
+pprImm (ImmCLbl l) = ppr l
+pprImm (ImmIndex l i) = ppr l <> char '+' <> int i
+pprImm (ImmLit s) = s
+
+pprImm (ImmFloat _) = text "naughty float immediate"
+pprImm (ImmDouble _) = text "naughty double immediate"
+
+pprImm (ImmConstantSum a b) = pprImm a <> char '+' <> pprImm b
+pprImm (ImmConstantDiff a b) = pprImm a <> char '-'
+ <> lparen <> pprImm b <> rparen
+
+
+
+pprAddr :: AddrMode -> SDoc
+pprAddr (ImmAddr imm off)
+ = let pp_imm = pprImm imm
+ in
+ if (off == 0) then
+ pp_imm
+ else if (off < 0) then
+ pp_imm <> int off
+ else
+ pp_imm <> char '+' <> int off
+
+pprAddr (AddrBaseIndex base index displacement)
+ = sdocWithPlatform $ \platform ->
+ let
+ pp_disp = ppr_disp displacement
+ pp_off p = pp_disp <> char '(' <> p <> char ')'
+ pp_reg r = pprReg (archWordFormat (target32Bit platform)) r
+ in
+ case (base, index) of
+ (EABaseNone, EAIndexNone) -> pp_disp
+ (EABaseReg b, EAIndexNone) -> pp_off (pp_reg b)
+ (EABaseRip, EAIndexNone) -> pp_off (text "%rip")
+ (EABaseNone, EAIndex r i) -> pp_off (comma <> pp_reg r <> comma <> int i)
+ (EABaseReg b, EAIndex r i) -> pp_off (pp_reg b <> comma <> pp_reg r
+ <> comma <> int i)
+ _ -> panic "X86.Ppr.pprAddr: no match"
+
+ where
+ ppr_disp (ImmInt 0) = empty
+ ppr_disp imm = pprImm imm
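+
+-- For example (illustrative): an AddrBaseIndex with base %rbx, index %rcx
+-- scaled by 4 and displacement 8 prints in AT&T syntax as "8(%rbx,%rcx,4)";
+-- a zero displacement is omitted entirely.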
+
+-- | Print section header and appropriate alignment for that section.
+pprSectionAlign :: Section -> SDoc
+pprSectionAlign (Section (OtherSection _) _) =
+ panic "X86.Ppr.pprSectionAlign: unknown section"
+pprSectionAlign sec@(Section seg _) =
+ sdocWithPlatform $ \platform ->
+ pprSectionHeader platform sec $$
+ pprAlignForSection seg
+
+-- | Print appropriate alignment for the given section type.
+pprAlignForSection :: SectionType -> SDoc
+pprAlignForSection seg =
+ sdocWithPlatform $ \platform ->
+ text ".align " <>
+ case platformOS platform of
+ -- Darwin: alignments are given as shifts.
+ OSDarwin
+ | target32Bit platform ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 2
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 4
+ CString -> int 1
+ _ -> int 3
+ -- Other: alignments are given as bytes.
+ _
+ | target32Bit platform ->
+ case seg of
+ Text -> text "4,0x90"
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 4
+ | otherwise ->
+ case seg of
+ ReadOnlyData16 -> int 16
+ CString -> int 1
+ _ -> int 8
+
+pprDataItem :: CmmLit -> SDoc
+pprDataItem lit = sdocWithDynFlags $ \dflags -> pprDataItem' dflags lit
+
+pprDataItem' :: DynFlags -> CmmLit -> SDoc
+pprDataItem' dflags lit
+ = vcat (ppr_item (cmmTypeFormat $ cmmLitType dflags lit) lit)
+ where
+ platform = targetPlatform dflags
+ imm = litToImm lit
+
+ -- These seem to be common:
+ ppr_item II8 _ = [text "\t.byte\t" <> pprImm imm]
+ ppr_item II16 _ = [text "\t.word\t" <> pprImm imm]
+ ppr_item II32 _ = [text "\t.long\t" <> pprImm imm]
+
+ ppr_item FF32 (CmmFloat r _)
+ = let bs = floatToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item FF64 (CmmFloat r _)
+ = let bs = doubleToBytes (fromRational r)
+ in map (\b -> text "\t.byte\t" <> pprImm (ImmInt b)) bs
+
+ ppr_item II64 _
+ = case platformOS platform of
+ OSDarwin
+ | target32Bit platform ->
+ case lit of
+ CmmInt x _ ->
+ [text "\t.long\t"
+ <> int (fromIntegral (fromIntegral x :: Word32)),
+ text "\t.long\t"
+ <> int (fromIntegral
+ (fromIntegral (x `shiftR` 32) :: Word32))]
+ _ -> panic "X86.Ppr.ppr_item: no match for II64"
+ | otherwise ->
+ [text "\t.quad\t" <> pprImm imm]
+ _
+ | target32Bit platform ->
+ [text "\t.quad\t" <> pprImm imm]
+ | otherwise ->
+ -- x86_64: binutils can't handle the R_X86_64_PC64
+ -- relocation type, which means we can't do
+ -- pc-relative 64-bit addresses. Fortunately we're
+ -- assuming the small memory model, in which all such
+ -- offsets will fit into 32 bits, so we have to stick
+ -- to 32-bit offset fields and modify the RTS
+ -- appropriately
+ --
+ -- See Note [x86-64-relative] in includes/rts/storage/InfoTables.h
+ --
+ case lit of
+ -- A relative relocation:
+ CmmLabelDiffOff _ _ _ _ ->
+ [text "\t.long\t" <> pprImm imm,
+ text "\t.long\t0"]
+ _ ->
+ [text "\t.quad\t" <> pprImm imm]
+
+ ppr_item _ _
+ = panic "X86.Ppr.ppr_item: no match"
+
+
+asmComment :: SDoc -> SDoc
+asmComment c = whenPprDebug $ text "# " <> c
+
+pprInstr :: Instr -> SDoc
+
+pprInstr (COMMENT s)
+ = asmComment (ftext s)
+
+pprInstr (LOCATION file line col _name)
+ = text "\t.loc " <> ppr file <+> ppr line <+> ppr col
+
+pprInstr (DELTA d)
+ = asmComment $ text ("\tdelta = " ++ show d)
+
+pprInstr (NEWBLOCK _)
+ = panic "PprMach.pprInstr: NEWBLOCK"
+
+pprInstr (UNWIND lbl d)
+ = asmComment (text "\tunwind = " <> ppr d)
+ $$ ppr lbl <> colon
+
+pprInstr (LDATA _ _)
+ = panic "PprMach.pprInstr: LDATA"
+
+{-
+pprInstr (SPILL reg slot)
+ = hcat [
+ text "\tSPILL",
+ char ' ',
+ pprUserReg reg,
+ comma,
+ text "SLOT" <> parens (int slot)]
+
+pprInstr (RELOAD slot reg)
+ = hcat [
+ text "\tRELOAD",
+ char ' ',
+ text "SLOT" <> parens (int slot),
+ comma,
+ pprUserReg reg]
+-}
+
+-- Replace 'mov $0x0,%reg' by 'xor %reg,%reg', which is smaller and cheaper.
+-- The code generator catches most of these already, but not all.
+pprInstr (MOV format (OpImm (ImmInt 0)) dst@(OpReg _))
+ = pprInstr (XOR format' dst dst)
+ where format' = case format of
+ II64 -> II32 -- 32-bit version is equivalent, and smaller
+ _ -> format
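+ -- For example (illustrative): 'movq $0x0,%rax' is emitted as
+ -- 'xorl %eax,%eax'; on x86_64, writing a 32-bit register implicitly
+ -- zeroes the upper 32 bits, so the II32 form is safe and smaller.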
+pprInstr (MOV format src dst)
+ = pprFormatOpOp (sLit "mov") format src dst
+
+pprInstr (CMOV cc format src dst)
+ = pprCondOpReg (sLit "cmov") format cc src dst
+
+pprInstr (MOVZxL II32 src dst) = pprFormatOpOp (sLit "mov") II32 src dst
+ -- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
+ -- movl. But we represent it as a MOVZxL instruction rather than as a
+ -- plain MOV, because the reg alloc would tend to throw away a plain
+ -- reg-to-reg move (and we still want it to do that for real moves),
+ -- whereas this one must survive for its zero-extending effect.
+
+pprInstr (MOVZxL formats src dst)
+ = pprFormatOpOpCoerce (sLit "movz") formats II32 src dst
+ -- zero-extension only needs to extend to 32 bits: on x86_64,
+ -- the remaining zero-extension to 64 bits is automatic, and the 32-bit
+ -- instruction is shorter.
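+ -- For example (illustrative): MOVZxL II8 prints as "movzbl",
+ -- i.e. zero-extend a byte operand into a 32-bit destination.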
+
+pprInstr (MOVSxL formats src dst)
+ = sdocWithPlatform $ \platform ->
+ pprFormatOpOpCoerce (sLit "movs") formats (archWordFormat (target32Bit platform)) src dst
+
+-- Here we do some patching, since the physical registers are only known
+-- late in the code generation.
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg2) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) (EAIndex reg2 1) (ImmInt 0))) dst@(OpReg reg3))
+ | reg2 == reg3
+ = pprFormatOpOp (sLit "add") format (OpReg reg1) dst
+
+pprInstr (LEA format (OpAddr (AddrBaseIndex (EABaseReg reg1) EAIndexNone displ)) dst@(OpReg reg3))
+ | reg1 == reg3
+ = pprInstr (ADD format (OpImm displ) dst)
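+
+-- For example (illustrative): 'leaq (%rax,%rbx,1),%rax' is rewritten to
+-- 'addq %rbx,%rax', and 'leaq 16(%rax),%rax' to 'addq $16,%rax', once
+-- register allocation has made source and destination coincide.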
+
+pprInstr (LEA format src dst) = pprFormatOpOp (sLit "lea") format src dst
+
+pprInstr (ADD format (OpImm (ImmInt (-1))) dst)
+ = pprFormatOp (sLit "dec") format dst
+pprInstr (ADD format (OpImm (ImmInt 1)) dst)
+ = pprFormatOp (sLit "inc") format dst
+pprInstr (ADD format src dst) = pprFormatOpOp (sLit "add") format src dst
+pprInstr (ADC format src dst) = pprFormatOpOp (sLit "adc") format src dst
+pprInstr (SUB format src dst) = pprFormatOpOp (sLit "sub") format src dst
+pprInstr (SBB format src dst) = pprFormatOpOp (sLit "sbb") format src dst
+pprInstr (IMUL format op1 op2) = pprFormatOpOp (sLit "imul") format op1 op2
+
+pprInstr (ADD_CC format src dst)
+ = pprFormatOpOp (sLit "add") format src dst
+pprInstr (SUB_CC format src dst)
+ = pprFormatOpOp (sLit "sub") format src dst
+
+{- A hack. The Intel documentation says that "The two and three
+ operand forms [of IMUL] may also be used with unsigned operands
+ because the lower half of the product is the same regardless if
+ (sic) the operands are signed or unsigned. The CF and OF flags,
+ however, cannot be used to determine if the upper half of the
+ result is non-zero." So there.
+-}
+
+-- Use a 32-bit instruction when possible as it saves a byte.
+-- Notably, extracting the tag bits of a pointer has this form.
+-- TODO: we could save a byte in a subsequent CMP instruction too,
+-- but need something like a peephole pass for this
+pprInstr (AND II64 src@(OpImm (ImmInteger mask)) dst)
+ | 0 <= mask && mask < 0xffffffff
+ = pprInstr (AND II32 src dst)
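+-- e.g. (illustrative) 'andq $0x7,%rbx' becomes 'andl $0x7,%ebx';
+-- since the mask's upper 32 bits are zero, the 32-bit AND (which
+-- zero-extends its result) is fully equivalent.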
+pprInstr (AND FF32 src dst) = pprOpOp (sLit "andps") FF32 src dst
+pprInstr (AND FF64 src dst) = pprOpOp (sLit "andpd") FF64 src dst
+pprInstr (AND format src dst) = pprFormatOpOp (sLit "and") format src dst
+pprInstr (OR format src dst) = pprFormatOpOp (sLit "or") format src dst
+
+pprInstr (XOR FF32 src dst) = pprOpOp (sLit "xorps") FF32 src dst
+pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst
+pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst
+
+pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
+pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst)
+pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst)
+pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
+pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
+
+pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
+pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
+
+pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
+pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
+pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
+pprInstr (PREFETCH Lvl2 format src) = pprFormatOp_ (sLit "prefetcht2") format src
+
+pprInstr (NOT format op) = pprFormatOp (sLit "not") format op
+pprInstr (BSWAP format op) = pprFormatOp (sLit "bswap") format (OpReg op)
+pprInstr (NEGI format op) = pprFormatOp (sLit "neg") format op
+
+pprInstr (SHL format src dst) = pprShift (sLit "shl") format src dst
+pprInstr (SAR format src dst) = pprShift (sLit "sar") format src dst
+pprInstr (SHR format src dst) = pprShift (sLit "shr") format src dst
+
+pprInstr (BT format imm src) = pprFormatImmOp (sLit "bt") format imm src
+
+pprInstr (CMP format src dst)
+ | isFloatFormat format = pprFormatOpOp (sLit "ucomi") format src dst -- SSE2
+ | otherwise = pprFormatOpOp (sLit "cmp") format src dst
+
+pprInstr (TEST format src dst) = sdocWithPlatform $ \platform ->
+ let format' = case (src,dst) of
+ -- Match instructions like 'test $0x3,%esi' or 'test $0x7,%rbx'.
+ -- We can replace them by equivalent, but smaller instructions
+ -- by reducing the size of the immediate operand as far as possible.
+ -- (We could handle masks larger than a single byte too,
+ -- but it would complicate the code considerably
+ -- and tag checks are by far the most common case.)
+ -- The mask must have the high bit clear for this smaller encoding
+ -- to be completely equivalent to the original; in particular so
+ -- that the signed comparison condition bits are the same as they
+ -- would be if doing a full word comparison. See #13425.
+ (OpImm (ImmInteger mask), OpReg dstReg)
+ | 0 <= mask && mask < 128 -> minSizeOfReg platform dstReg
+ _ -> format
+ in pprFormatOpOp (sLit "test") format' src dst
+ where
+ minSizeOfReg platform (RegReal (RealRegSingle i))
+ | target32Bit platform && i <= 3 = II8 -- al, bl, cl, dl
+ | target32Bit platform && i <= 7 = II16 -- si, di, bp, sp
+ | not (target32Bit platform) && i <= 15 = II8 -- al .. r15b
+ minSizeOfReg _ _ = format -- other
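+
+-- For example (illustrative): with a mask below 128, 'testq $0x7,%rbx'
+-- can instead be emitted as the shorter 'testb $0x7,%bl'; TEST only sets
+-- flags, and keeping the mask's high bit clear preserves the signed
+-- condition bits (#13425).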
+
+pprInstr (PUSH format op) = pprFormatOp (sLit "push") format op
+pprInstr (POP format op) = pprFormatOp (sLit "pop") format op
+
+-- both unused (SDM):
+-- pprInstr PUSHA = text "\tpushal"
+-- pprInstr POPA = text "\tpopal"
+
+pprInstr NOP = text "\tnop"
+pprInstr (CLTD II8) = text "\tcbtw"
+pprInstr (CLTD II16) = text "\tcwtd"
+pprInstr (CLTD II32) = text "\tcltd"
+pprInstr (CLTD II64) = text "\tcqto"
+pprInstr (CLTD x) = panic $ "pprInstr: " ++ show x
+
+pprInstr (SETCC cond op) = pprCondInstr (sLit "set") cond (pprOperand II8 op)
+
+pprInstr (JXX cond blockid)
+ = pprCondInstr (sLit "j") cond (ppr lab)
+ where lab = blockLbl blockid
+
+pprInstr (JXX_GBL cond imm) = pprCondInstr (sLit "j") cond (pprImm imm)
+
+pprInstr (JMP (OpImm imm) _) = text "\tjmp " <> pprImm imm
+pprInstr (JMP op _) = sdocWithPlatform $ \platform ->
+ text "\tjmp *"
+ <> pprOperand (archWordFormat (target32Bit platform)) op
+pprInstr (JMP_TBL op _ _ _) = pprInstr (JMP op [])
+pprInstr (CALL (Left imm) _) = text "\tcall " <> pprImm imm
+pprInstr (CALL (Right reg) _) = sdocWithPlatform $ \platform ->
+ text "\tcall *"
+ <> pprReg (archWordFormat (target32Bit platform)) reg
+
+pprInstr (IDIV fmt op) = pprFormatOp (sLit "idiv") fmt op
+pprInstr (DIV fmt op) = pprFormatOp (sLit "div") fmt op
+pprInstr (IMUL2 fmt op) = pprFormatOp (sLit "imul") fmt op
+
+-- x86_64 only
+pprInstr (MUL format op1 op2) = pprFormatOpOp (sLit "mul") format op1 op2
+pprInstr (MUL2 format op) = pprFormatOp (sLit "mul") format op
+
+pprInstr (FDIV format op1 op2) = pprFormatOpOp (sLit "div") format op1 op2
+pprInstr (SQRT format op1 op2) = pprFormatOpReg (sLit "sqrt") format op1 op2
+
+pprInstr (CVTSS2SD from to) = pprRegReg (sLit "cvtss2sd") from to
+pprInstr (CVTSD2SS from to) = pprRegReg (sLit "cvtsd2ss") from to
+pprInstr (CVTTSS2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttss2si") FF32 fmt from to
+pprInstr (CVTTSD2SIQ fmt from to) = pprFormatFormatOpReg (sLit "cvttsd2si") FF64 fmt from to
+pprInstr (CVTSI2SS fmt from to) = pprFormatOpReg (sLit "cvtsi2ss") fmt from to
+pprInstr (CVTSI2SD fmt from to) = pprFormatOpReg (sLit "cvtsi2sd") fmt from to
+
+ -- FETCHGOT for PIC on ELF platforms
+pprInstr (FETCHGOT reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ],
+ hcat [ text "\taddl\t$_GLOBAL_OFFSET_TABLE_+(.-1b), ",
+ pprReg II32 reg ]
+ ]
+
+ -- FETCHPC for PIC on Darwin/x86
+ -- get the instruction pointer into a register
+ -- (Terminology note: the IP is called Program Counter on PPC,
+ -- and it's a good thing to use the same name on both platforms)
+pprInstr (FETCHPC reg)
+ = vcat [ text "\tcall 1f",
+ hcat [ text "1:\tpopl\t", pprReg II32 reg ]
+ ]
+
+
+-- X87Store fmt addr ==> FSTP<fmt> addr
+pprInstr g@(X87Store fmt addr)
+ = pprX87 g (hcat [gtab,
+ text "fstp", pprFormat_x87 fmt, gsp, pprAddr addr])
+
+
+-- Atomics
+
+pprInstr (LOCK i) = text "\tlock" $$ pprInstr i
+
+pprInstr MFENCE = text "\tmfence"
+
+pprInstr (XADD format src dst) = pprFormatOpOp (sLit "xadd") format src dst
+
+pprInstr (CMPXCHG format src dst)
+ = pprFormatOpOp (sLit "cmpxchg") format src dst
+
+
+
+--------------------------
+-- some leftover helpers
+
+
+
+gtab :: SDoc
+gtab = char '\t'
+
+gsp :: SDoc
+gsp = char ' '
+
+
+
+pprX87 :: Instr -> SDoc -> SDoc
+pprX87 fake actual
+ = (char '#' <> pprX87Instr fake) $$ actual
+
+pprX87Instr :: Instr -> SDoc
+pprX87Instr (X87Store fmt dst) = pprFormatAddr (sLit "gst") fmt dst
+pprX87Instr _ = panic "X86.Ppr.pprX87Instr: no match"
+
+pprDollImm :: Imm -> SDoc
+pprDollImm i = text "$" <> pprImm i
+
+
+pprOperand :: Format -> Operand -> SDoc
+pprOperand f (OpReg r) = pprReg f r
+pprOperand _ (OpImm i) = pprDollImm i
+pprOperand _ (OpAddr ea) = pprAddr ea
+
+
+pprMnemonic_ :: PtrString -> SDoc
+pprMnemonic_ name =
+ char '\t' <> ptext name <> space
+
+
+pprMnemonic :: PtrString -> Format -> SDoc
+pprMnemonic name format =
+ char '\t' <> ptext name <> pprFormat format <> space
+
+
+pprFormatImmOp :: PtrString -> Format -> Imm -> Operand -> SDoc
+pprFormatImmOp name format imm op1
+ = hcat [
+ pprMnemonic name format,
+ char '$',
+ pprImm imm,
+ comma,
+ pprOperand format op1
+ ]
+
+
+pprFormatOp_ :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp_ name format op1
+ = hcat [
+ pprMnemonic_ name ,
+ pprOperand format op1
+ ]
+
+pprFormatOp :: PtrString -> Format -> Operand -> SDoc
+pprFormatOp name format op1
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1
+ ]
+
+
+pprFormatOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOp name format op1 op2
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+pprOpOp :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprOpOp name format op1 op2
+ = hcat [
+ pprMnemonic_ name,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2
+ ]
+
+
+
+pprRegReg :: PtrString -> Reg -> Reg -> SDoc
+pprRegReg name reg1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic_ name,
+ pprReg (archWordFormat (target32Bit platform)) reg1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+
+pprFormatOpReg :: PtrString -> Format -> Operand -> Reg -> SDoc
+pprFormatOpReg name format op1 reg2
+ = sdocWithPlatform $ \platform ->
+ hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprReg (archWordFormat (target32Bit platform)) reg2
+ ]
+
+pprCondOpReg :: PtrString -> Format -> Cond -> Operand -> Reg -> SDoc
+pprCondOpReg name format cond op1 reg2
+ = hcat [
+ char '\t',
+ ptext name,
+ pprCond cond,
+ space,
+ pprOperand format op1,
+ comma,
+ pprReg format reg2
+ ]
+
+pprFormatFormatOpReg :: PtrString -> Format -> Format -> Operand -> Reg -> SDoc
+pprFormatFormatOpReg name format1 format2 op1 reg2
+ = hcat [
+ pprMnemonic name format2,
+ pprOperand format1 op1,
+ comma,
+ pprReg format2 reg2
+ ]
+
+pprFormatOpOpReg :: PtrString -> Format -> Operand -> Operand -> Reg -> SDoc
+pprFormatOpOpReg name format op1 op2 reg3
+ = hcat [
+ pprMnemonic name format,
+ pprOperand format op1,
+ comma,
+ pprOperand format op2,
+ comma,
+ pprReg format reg3
+ ]
+
+
+
+pprFormatAddr :: PtrString -> Format -> AddrMode -> SDoc
+pprFormatAddr name format op
+ = hcat [
+ pprMnemonic name format,
+ comma,
+ pprAddr op
+ ]
+
+pprShift :: PtrString -> Format -> Operand -> Operand -> SDoc
+pprShift name format src dest
+ = hcat [
+ pprMnemonic name format,
+ pprOperand II8 src, -- src is 8-bit sized
+ comma,
+ pprOperand format dest
+ ]
+
+
+pprFormatOpOpCoerce :: PtrString -> Format -> Format -> Operand -> Operand -> SDoc
+pprFormatOpOpCoerce name format1 format2 op1 op2
+ = hcat [ char '\t', ptext name, pprFormat format1, pprFormat format2, space,
+ pprOperand format1 op1,
+ comma,
+ pprOperand format2 op2
+ ]
+
+
+pprCondInstr :: PtrString -> Cond -> SDoc -> SDoc
+pprCondInstr name cond arg
+ = hcat [ char '\t', ptext name, pprCond cond, space, arg]
diff --git a/compiler/GHC/CmmToAsm/X86/RegInfo.hs b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
new file mode 100644
index 0000000000..597efe1c3e
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/RegInfo.hs
@@ -0,0 +1,73 @@
+{-# LANGUAGE CPP #-}
+module GHC.CmmToAsm.X86.RegInfo (
+ mkVirtualReg,
+ regDotColor
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.CmmToAsm.Format
+import GHC.Platform.Reg
+
+import Outputable
+import GHC.Platform
+import Unique
+
+import UniqFM
+import GHC.CmmToAsm.X86.Regs
+
+
+mkVirtualReg :: Unique -> Format -> VirtualReg
+mkVirtualReg u format
+ = case format of
+ FF32 -> VirtualRegD u
+ -- for scalar F32, we use the same xmm as F64!
+ -- this is a hack that needs some improvement.
+ -- For now we map both to being allocated as "Double" Registers
+ -- on X86/X86_64
+ FF64 -> VirtualRegD u
+ _other -> VirtualRegI u
+
+regDotColor :: Platform -> RealReg -> SDoc
+regDotColor platform reg
+ = case (lookupUFM (regColors platform) reg) of
+ Just str -> text str
+ _ -> panic "Register not assigned a color"
+
+regColors :: Platform -> UniqFM [Char]
+regColors platform = listToUFM (normalRegColors platform)
+
+normalRegColors :: Platform -> [(Reg,String)]
+normalRegColors platform =
+ zip (map regSingle [0..lastint platform]) colors
+ ++ zip (map regSingle [firstxmm..lastxmm platform]) greys
+ where
+ -- 16 colors - enough for amd64 gp regs
+ colors = ["#800000","#ff0000","#808000","#ffff00","#008000"
+ ,"#00ff00","#008080","#00ffff","#000080","#0000ff"
+ ,"#800080","#ff00ff","#87005f","#875f00","#87af00"
+ ,"#ff00af"]
+
+ -- 16 shades of grey, enough for the currently supported
+ -- SSE extensions.
+ greys = ["#0e0e0e","#1c1c1c","#2a2a2a","#383838","#464646"
+ ,"#545454","#626262","#707070","#7e7e7e","#8c8c8c"
+ ,"#9a9a9a","#a8a8a8","#b6b6b6","#c4c4c4","#d2d2d2"
+ ,"#e0e0e0"]
+
+
+
+-- 32 shades of grey - use for avx 512 if we ever need it
+-- greys = ["#070707","#0e0e0e","#151515","#1c1c1c"
+-- ,"#232323","#2a2a2a","#313131","#383838","#3f3f3f"
+-- ,"#464646","#4d4d4d","#545454","#5b5b5b","#626262"
+-- ,"#696969","#707070","#777777","#7e7e7e","#858585"
+-- ,"#8c8c8c","#939393","#9a9a9a","#a1a1a1","#a8a8a8"
+-- ,"#afafaf","#b6b6b6","#bdbdbd","#c4c4c4","#cbcbcb"
+-- ,"#d2d2d2","#d9d9d9","#e0e0e0"]
+
+
diff --git a/compiler/GHC/CmmToAsm/X86/Regs.hs b/compiler/GHC/CmmToAsm/X86/Regs.hs
new file mode 100644
index 0000000000..87e31a1428
--- /dev/null
+++ b/compiler/GHC/CmmToAsm/X86/Regs.hs
@@ -0,0 +1,442 @@
+{-# LANGUAGE CPP #-}
+
+module GHC.CmmToAsm.X86.Regs (
+ -- squeeze functions for the graph allocator
+ virtualRegSqueeze,
+ realRegSqueeze,
+
+ -- immediates
+ Imm(..),
+ strImmLit,
+ litToImm,
+
+ -- addressing modes
+ AddrMode(..),
+ addrOffset,
+
+ -- registers
+ spRel,
+ argRegs,
+ allArgRegs,
+ allIntArgRegs,
+ callClobberedRegs,
+ instrClobberedRegs,
+ allMachRegNos,
+ classOfRealReg,
+ showReg,
+
+ -- machine specific
+ EABase(..), EAIndex(..), addrModeRegs,
+
+ eax, ebx, ecx, edx, esi, edi, ebp, esp,
+
+
+ rax, rbx, rcx, rdx, rsi, rdi, rbp, rsp,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ lastint,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
+ xmm,
+ firstxmm, lastxmm,
+
+ ripRel,
+ allFPArgRegs,
+
+ allocatableRegs
+)
+
+where
+
+#include "HsVersions.h"
+
+import GhcPrelude
+
+import GHC.Platform.Regs
+import GHC.Platform.Reg
+import GHC.Platform.Reg.Class
+
+import GHC.Cmm
+import GHC.Cmm.CLabel ( CLabel )
+import GHC.Driver.Session
+import Outputable
+import GHC.Platform
+
+import qualified Data.Array as A
+
+-- | regSqueeze_class reg
+-- Calculate the maximum number of register colors that could be
+-- denied to a node of this class due to having this reg
+-- as a neighbour.
+--
+{-# INLINE virtualRegSqueeze #-}
+virtualRegSqueeze :: RegClass -> VirtualReg -> Int
+
+virtualRegSqueeze cls vr
+ = case cls of
+ RcInteger
+ -> case vr of
+ VirtualRegI{} -> 1
+ VirtualRegHi{} -> 1
+ _other -> 0
+
+ RcDouble
+ -> case vr of
+ VirtualRegD{} -> 1
+ VirtualRegF{} -> 0
+ _other -> 0
+
+
+ _other -> 0
+
+{-# INLINE realRegSqueeze #-}
+realRegSqueeze :: RegClass -> RealReg -> Int
+realRegSqueeze cls rr
+ = case cls of
+ RcInteger
+ -> case rr of
+ RealRegSingle regNo
+ | regNo < firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+ RcDouble
+ -> case rr of
+ RealRegSingle regNo
+ | regNo >= firstxmm -> 1
+ | otherwise -> 0
+
+ RealRegPair{} -> 0
+
+
+ _other -> 0
+
+-- -----------------------------------------------------------------------------
+-- Immediates
+
+data Imm
+ = ImmInt Int
+ | ImmInteger Integer -- Sigh.
+ | ImmCLbl CLabel -- AbstractC Label (with baggage)
+ | ImmLit SDoc -- Simple string
+ | ImmIndex CLabel Int
+ | ImmFloat Rational
+ | ImmDouble Rational
+ | ImmConstantSum Imm Imm
+ | ImmConstantDiff Imm Imm
+
+strImmLit :: String -> Imm
+strImmLit s = ImmLit (text s)
+
+
+litToImm :: CmmLit -> Imm
+litToImm (CmmInt i w) = ImmInteger (narrowS w i)
+ -- narrow to the width: a CmmInt might be out of
+ -- range, but we assume that ImmInteger only contains
+ -- in-range values. A signed value should be fine here.
+litToImm (CmmFloat f W32) = ImmFloat f
+litToImm (CmmFloat f W64) = ImmDouble f
+litToImm (CmmLabel l) = ImmCLbl l
+litToImm (CmmLabelOff l off) = ImmIndex l off
+litToImm (CmmLabelDiffOff l1 l2 off _)
+ = ImmConstantSum
+ (ImmConstantDiff (ImmCLbl l1) (ImmCLbl l2))
+ (ImmInt off)
+litToImm _ = panic "X86.Regs.litToImm: no match"
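+
+-- For example (illustrative): litToImm (CmmInt 255 W8) yields
+-- ImmInteger (-1), because narrowS re-interprets the constant as a
+-- signed value of the given width.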
+
+-- addressing modes ------------------------------------------------------------
+
+data AddrMode
+ = AddrBaseIndex EABase EAIndex Displacement
+ | ImmAddr Imm Int
+
+data EABase = EABaseNone | EABaseReg Reg | EABaseRip
+data EAIndex = EAIndexNone | EAIndex Reg Int
+type Displacement = Imm
+
+
+addrOffset :: AddrMode -> Int -> Maybe AddrMode
+addrOffset addr off
+ = case addr of
+ ImmAddr i off0 -> Just (ImmAddr i (off0 + off))
+
+ AddrBaseIndex r i (ImmInt n) -> Just (AddrBaseIndex r i (ImmInt (n + off)))
+ AddrBaseIndex r i (ImmInteger n)
+ -> Just (AddrBaseIndex r i (ImmInt (fromInteger (n + toInteger off))))
+
+ AddrBaseIndex r i (ImmCLbl lbl)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl off))
+
+ AddrBaseIndex r i (ImmIndex lbl ix)
+ -> Just (AddrBaseIndex r i (ImmIndex lbl (ix+off)))
+
+ _ -> Nothing -- in theory, shouldn't happen
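+
+-- For example (illustrative):
+-- addrOffset (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 8)) 4
+-- == Just (AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt 12))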
+
+
+addrModeRegs :: AddrMode -> [Reg]
+addrModeRegs (AddrBaseIndex b i _) = b_regs ++ i_regs
+ where
+ b_regs = case b of { EABaseReg r -> [r]; _ -> [] }
+ i_regs = case i of { EAIndex r _ -> [r]; _ -> [] }
+addrModeRegs _ = []
+
+
+-- registers -------------------------------------------------------------------
+
+-- @spRel@ gives us a stack relative addressing mode for volatile
+-- temporaries and for excess call arguments. @fpRel@, where
+-- applicable, is the same but for the frame pointer.
+
+
+spRel :: DynFlags
+ -> Int -- ^ desired stack offset in bytes, positive or negative
+ -> AddrMode
+spRel dflags n
+ | target32Bit (targetPlatform dflags)
+ = AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt n)
+ | otherwise
+ = AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt n)
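+
+-- For example (illustrative): on x86_64, 'spRel dflags 16' denotes the
+-- operand that pprAddr renders as "16(%rsp)".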
+
+-- The register numbers must fit into 32 bits on x86, so that we can
+-- use a Word32 to represent the set of free registers in the register
+-- allocator.
+
+
+
+firstxmm :: RegNo
+firstxmm = 16
+
+-- on 32-bit platforms, only the first 8 XMM/YMM/ZMM registers are available
+lastxmm :: Platform -> RegNo
+lastxmm platform
+ | target32Bit platform = firstxmm + 7 -- xmm0 - xmm7
+ | otherwise = firstxmm + 15 -- xmm0 - xmm15
+
+lastint :: Platform -> RegNo
+lastint platform
+ | target32Bit platform = 7 -- not %r8..%r15
+ | otherwise = 15
+
+intregnos :: Platform -> [RegNo]
+intregnos platform = [0 .. lastint platform]
+
+
+
+xmmregnos :: Platform -> [RegNo]
+xmmregnos platform = [firstxmm .. lastxmm platform]
+
+floatregnos :: Platform -> [RegNo]
+floatregnos platform = xmmregnos platform
+
+-- argRegs is the set of regs which are read for an n-argument call to C.
+-- For archs which pass all args on the stack (x86), it is empty.
+-- Sparc passes up to the first 6 args in regs.
+argRegs :: RegNo -> [Reg]
+argRegs _ = panic "MachRegs.argRegs(x86): should not be used!"
+
+-- | The complete set of machine registers.
+allMachRegNos :: Platform -> [RegNo]
+allMachRegNos platform = intregnos platform ++ floatregnos platform
+
+-- | Take the class of a register.
+{-# INLINE classOfRealReg #-}
+classOfRealReg :: Platform -> RealReg -> RegClass
+-- On x86, we might want to have an 8-bit RegClass, which would
+-- contain just regs 1-4 (the others don't have 8-bit versions).
+-- However, we can get away without this at the moment because the
+-- only allocatable integer regs are also 8-bit compatible (1, 3, 4).
+classOfRealReg platform reg
+ = case reg of
+ RealRegSingle i
+ | i <= lastint platform -> RcInteger
+ | i <= lastxmm platform -> RcDouble
+ | otherwise -> panic "X86.Reg.classOfRealReg registerSingle too high"
+ _ -> panic "X86.Regs.classOfRealReg: RegPairs on this arch"
+
+-- | Get the name of the register with this number.
+-- NOTE: fixme, we don't track which "way" the XMM registers are used
+showReg :: Platform -> RegNo -> String
+showReg platform n
+ | n >= firstxmm && n <= lastxmm platform = "%xmm" ++ show (n-firstxmm)
+ | n >= 8 && n < firstxmm = "%r" ++ show n
+ | otherwise = regNames platform A.! n
+
+regNames :: Platform -> A.Array Int String
+regNames platform
+ = if target32Bit platform
+ then A.listArray (0,7) ["%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi", "%ebp", "%esp"]
+ else A.listArray (0,7) ["%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", "%rbp", "%rsp"]
+
+
+
+-- machine specific ------------------------------------------------------------
+
+
+{-
+Intel x86 architecture:
+- All registers except 7 (esp) are available for use.
+- Only ebx, esi, edi and ebp are preserved across a C call (they are callee-saves).
+- Registers 0-7 have 16-bit counterparts (ax, bx, etc.)
+- Registers 0-3 have 8-bit counterparts (al, bl, etc.)
+
+The fp registers are all Double registers; we don't have any RcFloat class
+regs. @regClass@ barfs if you give it a VirtualRegF, and mkVReg above should
+never generate them.
+
+TODO: cleanup modelling float vs double registers and how they are the same class.
+-}
+
+
+eax, ebx, ecx, edx, esp, ebp, esi, edi :: Reg
+
+eax = regSingle 0
+ebx = regSingle 1
+ecx = regSingle 2
+edx = regSingle 3
+esi = regSingle 4
+edi = regSingle 5
+ebp = regSingle 6
+esp = regSingle 7
+
+
+
+
+{-
+AMD x86_64 architecture:
+- All 16 integer registers are addressable as 8, 16, 32 and 64-bit values:
+
+ 8 16 32 64
+ ---------------------
+ al ax eax rax
+ bl bx ebx rbx
+ cl cx ecx rcx
+ dl dx edx rdx
+ sil si esi rsi
+ dil di edi rdi
+ bpl bp ebp rbp
+ spl sp esp rsp
+ r8b r8w r8d r8
+ r9b r9w r9d r9
+ r10b r10w r10d r10
+ r11b r11w r11d r11
+ r12b r12w r12d r12
+ r13b r13w r13d r13
+ r14b r14w r14d r14
+ r15b r15w r15d r15
+-}
+
+rax, rbx, rcx, rdx, rsp, rbp, rsi, rdi,
+ r8, r9, r10, r11, r12, r13, r14, r15,
+ xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
+ xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 :: Reg
+
+rax = regSingle 0
+rbx = regSingle 1
+rcx = regSingle 2
+rdx = regSingle 3
+rsi = regSingle 4
+rdi = regSingle 5
+rbp = regSingle 6
+rsp = regSingle 7
+r8 = regSingle 8
+r9 = regSingle 9
+r10 = regSingle 10
+r11 = regSingle 11
+r12 = regSingle 12
+r13 = regSingle 13
+r14 = regSingle 14
+r15 = regSingle 15
+xmm0 = regSingle 16
+xmm1 = regSingle 17
+xmm2 = regSingle 18
+xmm3 = regSingle 19
+xmm4 = regSingle 20
+xmm5 = regSingle 21
+xmm6 = regSingle 22
+xmm7 = regSingle 23
+xmm8 = regSingle 24
+xmm9 = regSingle 25
+xmm10 = regSingle 26
+xmm11 = regSingle 27
+xmm12 = regSingle 28
+xmm13 = regSingle 29
+xmm14 = regSingle 30
+xmm15 = regSingle 31
+
+ripRel :: Displacement -> AddrMode
+ripRel imm = AddrBaseIndex EABaseRip EAIndexNone imm
+
+
+ -- so we can re-use some x86 code:
+{-
+eax = rax
+ebx = rbx
+ecx = rcx
+edx = rdx
+esi = rsi
+edi = rdi
+ebp = rbp
+esp = rsp
+-}
+
+xmm :: RegNo -> Reg
+xmm n = regSingle (firstxmm+n)
+
+
+
+
+-- | These are the regs which we cannot assume stay alive over a C call.
+callClobberedRegs :: Platform -> [Reg]
+-- caller-saves registers
+callClobberedRegs platform
+ | target32Bit platform = [eax,ecx,edx] ++ map regSingle (floatregnos platform)
+ | platformOS platform == OSMinGW32
+ = [rax,rcx,rdx,r8,r9,r10,r11]
+ -- Only xmm0-5 are caller-saves registers on 64bit windows.
+ -- ( https://docs.microsoft.com/en-us/cpp/build/register-usage )
+ -- For details check the Win64 ABI.
+ ++ map xmm [0 .. 5]
+ | otherwise
+ -- all xmm regs are caller-saves
+ -- caller-saves registers
+ = [rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11]
+ ++ map regSingle (floatregnos platform)
+
+allArgRegs :: Platform -> [(Reg, Reg)]
+allArgRegs platform
+ | platformOS platform == OSMinGW32 = zip [rcx,rdx,r8,r9]
+ (map regSingle [firstxmm ..])
+ | otherwise = panic "X86.Regs.allArgRegs: not defined for this arch"
+
+allIntArgRegs :: Platform -> [Reg]
+allIntArgRegs platform
+ | (platformOS platform == OSMinGW32) || target32Bit platform
+ = panic "X86.Regs.allIntArgRegs: not defined for this platform"
+ | otherwise = [rdi,rsi,rdx,rcx,r8,r9]
+
+
+-- | On 64-bit platforms we pass the first 8 float/double arguments
+-- in the xmm registers.
+allFPArgRegs :: Platform -> [Reg]
+allFPArgRegs platform
+ | platformOS platform == OSMinGW32
+ = panic "X86.Regs.allFPArgRegs: not defined for this platform"
+ | otherwise = map regSingle [firstxmm .. firstxmm + 7 ]
+
+
+-- Machine registers which might be clobbered by instructions that
+-- generate results into fixed registers, or need arguments in a fixed
+-- register.
+instrClobberedRegs :: Platform -> [Reg]
+instrClobberedRegs platform
+ | target32Bit platform = [ eax, ecx, edx ]
+ | otherwise = [ rax, rcx, rdx ]
+
+--
+
+-- allocatableRegs is allMachRegNos with the fixed-use regs removed.
+-- i.e., these are the regs to which we are prepared to let the
+-- register allocator map VRegs.
+allocatableRegs :: Platform -> [RealReg]
+allocatableRegs platform
+ = let isFree i = freeReg platform i
+ in map RealRegSingle $ filter isFree (allMachRegNos platform)
+